From 65f118d72fa9045320d8a1e321f0da955a2d8e9a Mon Sep 17 00:00:00 2001 From: Dmitry Kovalev Date: Fri, 11 Oct 2013 18:27:12 -0700 Subject: [PATCH] Making input pointer of any inverse transform constant. Also renaming dest_stride to stride in some places. Change-Id: I75f602b623a5a7071d4922b747c45fa0b7d7a940 --- test/dct16x16_test.cc | 7 +- test/dct32x32_test.cc | 2 +- test/fdct8x8_test.cc | 7 +- vp9/common/arm/neon/vp9_idct16x16_neon.c | 44 +++--- vp9/common/vp9_blockd.h | 2 +- vp9/common/vp9_idct.c | 119 ++++++++------- vp9/common/vp9_idct.h | 18 ++- vp9/common/vp9_rtcd_defs.sh | 30 ++-- vp9/common/x86/vp9_idct_intrin_sse2.c | 178 ++++++++++++----------- 9 files changed, 206 insertions(+), 201 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 11cbfd00e..3d61d40e0 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -21,7 +21,7 @@ extern "C" { #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" -void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *output, int pitch); +void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch); } #include "vpx/vpx_integer.h" @@ -258,9 +258,10 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) { } typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); -typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride); +typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride); typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); -typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type); +typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride, + int tx_type); void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fdct16x16_c(in, out, stride); diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 0df466d59..f456abcd1 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -75,7 +75,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], } typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride); -typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride); +typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride); class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) { public: diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 34066bfb8..728db6dc7 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -21,7 +21,7 @@ extern "C" { #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" -void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *output, int pitch); +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch); } #include "vpx/vpx_integer.h" @@ -29,9 +29,10 @@ using libvpx_test::ACMRandom; namespace { typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); -typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride); +typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride); typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); -typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type); +typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride, + int tx_type); void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fdct8x8_c(in, out, stride); diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index 33aa4e001..0b9fc09ab 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -11,31 +11,31 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -extern void vp9_idct16x16_256_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_idct16x16_256_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -extern void vp9_idct16x16_10_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_idct16x16_10_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); +void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ extern void vp9_push_neon(int64_t *store); extern void vp9_pop_neon(int64_t *store); -void vp9_idct16x16_256_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; @@ -109,8 +109,8 @@ void vp9_idct16x16_256_add_neon(int16_t *input, return; } -void vp9_idct16x16_10_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_10_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b1af13891..0538b37ac 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -221,7 +221,7 @@ typedef struct macroblockd { int lossless; /* Inverse transform function pointers. */ - void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); struct subpix_fn_table subpix; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 959f49cb2..52b039d99 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -18,13 +18,13 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ int i; int16_t output[16]; int a1, b1, c1, d1, e1; - int16_t *ip = input; + const int16_t *ip = input; int16_t *op = output; for (i = 0; i < 4; i++) { @@ -60,21 +60,21 @@ void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); - dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1); - dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1); - dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1); + dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); + dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); + dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); + dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); ip++; dest++; } } -void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { int i; int a1, e1; int16_t tmp[4]; - int16_t *ip = in; + const int16_t *ip = in; int16_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -116,7 +116,7 @@ static void idct4_1d(const int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[4 * 4]; int16_t *outptr = out; int i, j; @@ -135,12 +135,12 @@ void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 4 + i]; idct4_1d(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } } -void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { int i; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -201,7 +201,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) { output[7] = step1[0] - step1[7]; } -void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8]; int16_t *outptr = out; int i, j; @@ -220,12 +220,12 @@ void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } } -void vp9_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -234,7 +234,7 @@ void vp9_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = clip_pixel(dest[i] + a1); - dest += dest_stride; + dest += stride; } } @@ -280,8 +280,8 @@ static void iadst4_1d(const int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(s3); } -void vp9_iht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { const transform_2d IHT_4[] = { { idct4_1d, idct4_1d }, // DCT_DCT = 0 { iadst4_1d, idct4_1d }, // ADST_DCT = 1 @@ -307,8 +307,8 @@ void vp9_iht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 4 + i]; IHT_4[tx_type].cols(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } } static void iadst8_1d(const int16_t *input, int16_t *output) { @@ -395,8 +395,8 @@ static const transform_2d IHT_8[] = { { iadst8_1d, iadst8_1d } // ADST_ADST = 3 }; -void vp9_iht8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; int16_t out[8 * 8]; int16_t *outptr = out; @@ -416,12 +416,12 @@ void vp9_iht8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 8 + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); } + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); + } } -void vp9_idct8x8_10_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -441,8 +441,8 @@ void vp9_idct8x8_10_add_c(int16_t *input, uint8_t *dest, temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } } @@ -611,7 +611,7 @@ static void idct16_1d(const int16_t *input, int16_t *output) { output[15] = step2[0] - step2[15]; } -void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[16 * 16]; int16_t *outptr = out; int i, j; @@ -630,8 +630,8 @@ void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 16 + i]; idct16_1d(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } @@ -813,8 +813,8 @@ static const transform_2d IHT_16[] = { { iadst16_1d, iadst16_1d } // ADST_ADST = 3 }; -void vp9_iht16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; int16_t out[16 * 16]; int16_t *outptr = out; @@ -834,12 +834,11 @@ void vp9_iht16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 16 + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); } + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; int i, j; @@ -859,13 +858,12 @@ void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest, temp_in[j] = out[j*16 + i]; idct16_1d(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -874,7 +872,7 @@ void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = clip_pixel(dest[i] + a1); - dest += dest_stride; + dest += stride; } } @@ -1245,7 +1243,7 @@ static void idct32_1d(const int16_t *input, int16_t *output) { output[31] = step1[0] - step1[31]; } -void vp9_idct32x32_1024_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[32 * 32]; int16_t *outptr = out; int i, j; @@ -1277,13 +1275,12 @@ void vp9_idct32x32_1024_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 32 + i]; idct32_1d(temp_in, temp_out); for (j = 0; j < 32; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_idct32x32_1_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; @@ -1294,12 +1291,12 @@ void vp9_idct32x32_1_add_c(int16_t *input, uint8_t *dest, for (j = 0; j < 32; ++j) { for (i = 0; i < 32; ++i) dest[i] = clip_pixel(dest[i] + a1); - dest += dest_stride; + dest += stride; } } // idct -void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { if (eob > 1) vp9_idct4x4_16_add(input, dest, stride); else @@ -1307,14 +1304,14 @@ void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) { } -void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { if (eob > 1) vp9_iwht4x4_16_add(input, dest, stride); else vp9_iwht4x4_1_add(input, dest, stride); } -void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -1333,7 +1330,8 @@ void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) { } } -void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, + int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob) { @@ -1347,7 +1345,8 @@ void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) { } } -void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, + int eob) { if (eob) { if (eob == 1) vp9_idct32x32_1_add(input, dest, stride); @@ -1357,16 +1356,16 @@ void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob) { } // iht -void vp9_iht4x4_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, - int eob) { +void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { if (tx_type == DCT_DCT) vp9_idct4x4_add(input, dest, stride, eob); else vp9_iht4x4_16_add(input, dest, stride, tx_type); } -void vp9_iht8x8_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, - int stride, int eob) { +void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { if (tx_type == DCT_DCT) { vp9_idct8x8_add(input, dest, stride, eob); } else { @@ -1376,8 +1375,8 @@ void vp9_iht8x8_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, } } -void vp9_iht16x16_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, - int stride, int eob) { +void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { if (tx_type == DCT_DCT) { vp9_idct16x16_add(input, dest, stride, eob); } else { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 78aaabab5..2b3f35f0a 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -87,18 +87,20 @@ typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; -void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int + eob); +void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, + int eob); -void vp9_iht4x4_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, +void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht8x8_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, +void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht16x16_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, +void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index c9cd56ccb..21513d414 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -267,51 +267,51 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # # dct # -prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct4x4_1_add sse2 neon -prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct4x4_16_add sse2 neon -prototype void vp9_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct8x8_1_add sse2 neon -prototype void vp9_idct8x8_64_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct8x8_64_add sse2 neon -prototype void vp9_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct8x8_10_add sse2 neon -prototype void vp9_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct16x16_1_add sse2 neon -prototype void vp9_idct16x16_256_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct16x16_256_add sse2 neon -prototype void vp9_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct16x16_10_add sse2 neon -prototype void vp9_idct32x32_1024_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct32x32_1024_add sse2 neon -prototype void vp9_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct32x32_1_add sse2 -prototype void vp9_iht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_iht4x4_16_add sse2 neon -prototype void vp9_iht8x8_64_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_iht8x8_64_add sse2 neon -prototype void vp9_iht16x16_256_add "int16_t *input, uint8_t *output, int pitch, int tx_type" +prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" specialize vp9_iht16x16_256_add sse2 # dct and add -prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_iwht4x4_1_add -prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride" +prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_iwht4x4_16_add # diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 0bb52990c..cfec36b42 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -26,10 +26,10 @@ void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i input0, input1, input2, input3; // Rows - input0 = _mm_loadl_epi64((__m128i *)input); - input1 = _mm_loadl_epi64((__m128i *)(input + 4)); - input2 = _mm_loadl_epi64((__m128i *)(input + 8)); - input3 = _mm_loadl_epi64((__m128i *)(input + 12)); + input0 = _mm_loadl_epi64((const __m128i *)input); + input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); + input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); + input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); // Construct i3, i1, i3, i1, i2, i0, i2, i0 input0 = _mm_shufflelo_epi16(input0, 0xd8); @@ -148,7 +148,7 @@ void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, input3); } -void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; @@ -264,16 +264,16 @@ static void iadst4_1d_sse2(__m128i *in) { in[3] = _mm_unpackhi_epi64(in[1], in[1]); } -void vp9_iht4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[4]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((__m128i *)input); - in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); + in[0] = _mm_loadl_epi64((const __m128i *)input); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); switch (tx_type) { case 0: // DCT_DCT @@ -494,7 +494,7 @@ void vp9_iht4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride, dest += stride; \ } -void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -514,14 +514,14 @@ void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) { int i; // Load input data. - in0 = _mm_load_si128((__m128i *)input); - in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); // 2-D for (i = 0; i < 2; i++) { @@ -562,7 +562,7 @@ void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -void vp9_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; @@ -883,21 +883,21 @@ static void iadst8_1d_sse2(__m128i *in) { } -void vp9_iht8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1<<4); // load input data - in[0] = _mm_load_si128((__m128i *)input); - in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); - in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); - in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); - in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); - in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); - in[6] = _mm_load_si128((__m128i *)(input + 8 * 6)); - in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); switch (tx_type) { case 0: // DCT_DCT @@ -950,7 +950,7 @@ void vp9_iht8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -970,10 +970,10 @@ void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; // Rows. Load 4-row input data. - in0 = _mm_load_si128((__m128i *)input); - in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); // 8x4 Transpose TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) @@ -1228,7 +1228,8 @@ void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { stp2_10, stp2_13, stp2_11, stp2_12) \ } -void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i zero = _mm_setzero_si128(); @@ -1283,22 +1284,22 @@ void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) { if (i == 1) input += 128; // Load input data. - in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); - in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); - in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); - in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); - in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); - in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); + in0 = _mm_load_si128((const __m128i *)input); + in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); @@ -1435,7 +1436,7 @@ void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } -void vp9_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a, i; @@ -2310,24 +2311,24 @@ static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { iadst16_1d_8col(in1); } -static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); } static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { @@ -2386,8 +2387,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { RECON_AND_STORE(dest, in[15]); } -void vp9_iht16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in0[16], in1[16]; load_buffer_8x16(input, in0); @@ -2421,8 +2422,8 @@ void vp9_iht16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, - int stride) { +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i zero = _mm_setzero_si128(); @@ -2468,14 +2469,14 @@ void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; // 1-D idct. Load input data. - in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in0 = _mm_load_si128((const __m128i *)input); + in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); @@ -2780,11 +2781,12 @@ void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, #define LOAD_DQCOEFF(reg, input) \ { \ - reg = _mm_load_si128((__m128i *) input); \ + reg = _mm_load_si128((const __m128i *) input); \ input += 8; \ } \ -void vp9_idct32x32_1024_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -3515,7 +3517,7 @@ void vp9_idct32x32_1024_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } //NOLINT -void vp9_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a, i; -- 2.40.0