From: Loren Merritt
Date: Sat, 10 Oct 2009 04:43:00 +0000 (+0000)
Subject: change all dct arrays to 1d.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1fbba0ca5d97d4f3250864c5cc6431c69855cb59;p=libx264

change all dct arrays to 1d.
the C standard doesn't allow you to iterate 1-dimensionally over 2d arrays,
and nothing other than the dsp functions themselves cares about the 2dness
of dct. this fixes a miscompilation in x264_mb_optimize_chroma_dc.
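For context, a minimal sketch of the rule the message refers to (illustration
only; sum_2d/sum_1d are hypothetical names, not x264 functions). C defines
subscripting in terms of pointer arithmetic, and pointer arithmetic is only
defined within a single array object (C99 6.5.6), so flattening a 2d array
through its first row is undefined behavior even though the storage is
contiguous -- which is the kind of thing that let the optimizer miscompile
x264_mb_optimize_chroma_dc:

    #include <stdint.h>

    static int sum_2d( int16_t d[4][4] )
    {
        int i, sum = 0;
        /* undefined behavior: d[0] is an array of only 4 int16_t, so d[0][i]
         * is out of bounds for i >= 4; a compiler may assume i < 4 here */
        for( i = 0; i < 16; i++ )
            sum += d[0][i];
        return sum;
    }

    static int sum_1d( int16_t d[16] )
    {
        int i, sum = 0;
        /* well-defined: one 16-element array, indexable 0..15 -- hence the
         * switch to 1d dct arrays in the patch below */
        for( i = 0; i < 16; i++ )
            sum += d[i];
        return sum;
    }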
---

diff --git a/common/arm/dct.h b/common/arm/dct.h
index e77190f9..b8cb4a12 100644
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -23,27 +23,27 @@
 #ifndef X264_ARM_DCT_H
 #define X264_ARM_DCT_H

-void x264_dct4x4dc_neon( int16_t d[4][4] );
-void x264_idct4x4dc_neon( int16_t d[4][4] );
+void x264_dct4x4dc_neon( int16_t d[16] );
+void x264_idct4x4dc_neon( int16_t d[16] );

-void x264_sub4x4_dct_neon( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_neon( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_neon( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

-void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

 void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
 void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );

-void x264_sub8x8_dct8_neon( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_neon( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );

-void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

-void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[4][4] );
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );

 #endif
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index f348e5fd..4dd71829 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -63,7 +63,7 @@ pmovmskb_byte:
     bx lr
 .endm

-// quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
 function x264_quant_2x2_dc_neon, export=1
     vld1.64 {d0}, [r0,:64]
     vabs.s16 d3, d0
@@ -79,7 +79,7 @@ function x264_quant_2x2_dc_neon, export=1
     QUANT_END d3
 .endfunc

-// quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
 function x264_quant_4x4_dc_neon, export=1
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
@@ -91,7 +91,7 @@ function x264_quant_4x4_dc_neon, export=1
     QUANT_END d0
 .endfunc

-// quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 function x264_quant_4x4_neon, export=1
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
@@ -103,7 +103,7 @@ function x264_quant_4x4_neon, export=1
     QUANT_END d0
 .endfunc

-// quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 function x264_quant_8x8_neon, export=1
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
@@ -137,7 +137,7 @@ function x264_quant_8x8_neon, export=1
     subs r3, r3, #\offset // 6 for 8x8
 .endm

-// dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 .macro DEQUANT size bits
 function x264_dequant_\size\()_neon, export=1
 DEQUANT_START \bits+2, \bits
@@ -219,7 +219,7 @@ dequant_\size\()_rshift_loop:
 DEQUANT 4x4, 4
 DEQUANT 8x8, 6

-// dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 function x264_dequant_4x4_dc_neon, export=1
 DEQUANT_START 6, 6, yes
     blt dequant_4x4_dc_rshift
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 0df9ebf1..dcfed633 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -23,16 +23,16 @@
 #ifndef X264_ARM_QUANT_H
 #define X264_ARM_QUANT_H

-int x264_quant_2x2_dc_armv6( int16_t dct[2][2], int mf, int bias );
+int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );

-int x264_quant_2x2_dc_neon( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_neon( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_neon( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_neon( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );

-void x264_dequant_4x4_dc_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_neon( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );

 int x264_coeff_last4_arm( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
diff --git a/common/common.h b/common/common.h
index 4313db81..5ec141ce 100644
--- a/common/common.h
+++ b/common/common.h
@@ -341,9 +341,9 @@ struct x264_t
     x264_pps_t *pps;
     int i_idr_pic_id;

-    /* quantization matrix for decoding, [cqm][qp%6][coef_y][coef_x] */
-    int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
-    int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
+    /* quantization matrix for decoding, [cqm][qp%6][coef] */
+    int (*dequant4_mf[4])[16];   /* [4][6][16] */
+    int (*dequant8_mf[2])[64];   /* [2][6][64] */
     /* quantization matrix for trellis, [cqm][qp][coef] */
     int (*unquant4_mf[4])[16];   /* [4][52][16] */
     int (*unquant8_mf[2])[64];   /* [2][52][64] */
diff --git a/common/dct.c b/common/dct.c
index c5a79139..0aed8d0f 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -35,75 +35,71 @@ int x264_dct4_weight2_zigzag[2][16];
 int x264_dct8_weight2_zigzag[2][64];

-/*
- * XXX For all dct dc : input could be equal to output so ...
- */
-
-static void dct4x4dc( int16_t d[4][4] )
+static void dct4x4dc( int16_t d[16] )
 {
-    int16_t tmp[4][4];
+    int16_t tmp[16];
     int s01, s23;
     int d01, d23;
     int i;

     for( i = 0; i < 4; i++ )
     {
-        s01 = d[i][0] + d[i][1];
-        d01 = d[i][0] - d[i][1];
-        s23 = d[i][2] + d[i][3];
-        d23 = d[i][2] - d[i][3];
-
-        tmp[0][i] = s01 + s23;
-        tmp[1][i] = s01 - s23;
-        tmp[2][i] = d01 - d23;
-        tmp[3][i] = d01 + d23;
+        s01 = d[i*4+0] + d[i*4+1];
+        d01 = d[i*4+0] - d[i*4+1];
+        s23 = d[i*4+2] + d[i*4+3];
+        d23 = d[i*4+2] - d[i*4+3];
+
+        tmp[0*4+i] = s01 + s23;
+        tmp[1*4+i] = s01 - s23;
+        tmp[2*4+i] = d01 - d23;
+        tmp[3*4+i] = d01 + d23;
     }

     for( i = 0; i < 4; i++ )
     {
-        s01 = tmp[i][0] + tmp[i][1];
-        d01 = tmp[i][0] - tmp[i][1];
-        s23 = tmp[i][2] + tmp[i][3];
-        d23 = tmp[i][2] - tmp[i][3];
-
-        d[i][0] = ( s01 + s23 + 1 ) >> 1;
-        d[i][1] = ( s01 - s23 + 1 ) >> 1;
-        d[i][2] = ( d01 - d23 + 1 ) >> 1;
-        d[i][3] = ( d01 + d23 + 1 ) >> 1;
+        s01 = tmp[i*4+0] + tmp[i*4+1];
+        d01 = tmp[i*4+0] - tmp[i*4+1];
+        s23 = tmp[i*4+2] + tmp[i*4+3];
+        d23 = tmp[i*4+2] - tmp[i*4+3];
+
+        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
+        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
+        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
+        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
     }
 }

-static void idct4x4dc( int16_t d[4][4] )
+static void idct4x4dc( int16_t d[16] )
 {
-    int16_t tmp[4][4];
+    int16_t tmp[16];
     int s01, s23;
     int d01, d23;
     int i;

     for( i = 0; i < 4; i++ )
     {
-        s01 = d[i][0] + d[i][1];
-        d01 = d[i][0] - d[i][1];
-        s23 = d[i][2] + d[i][3];
-        d23 = d[i][2] - d[i][3];
-
-        tmp[0][i] = s01 + s23;
-        tmp[1][i] = s01 - s23;
-        tmp[2][i] = d01 - d23;
-        tmp[3][i] = d01 + d23;
+        s01 = d[i*4+0] + d[i*4+1];
+        d01 = d[i*4+0] - d[i*4+1];
+        s23 = d[i*4+2] + d[i*4+3];
+        d23 = d[i*4+2] - d[i*4+3];
+
+        tmp[0*4+i] = s01 + s23;
+        tmp[1*4+i] = s01 - s23;
+        tmp[2*4+i] = d01 - d23;
+        tmp[3*4+i] = d01 + d23;
     }

     for( i = 0; i < 4; i++ )
     {
-        s01 = tmp[i][0] + tmp[i][1];
-        d01 = tmp[i][0] - tmp[i][1];
-        s23 = tmp[i][2] + tmp[i][3];
-        d23 = tmp[i][2] - tmp[i][3];
-
-        d[i][0] = s01 + s23;
-        d[i][1] = s01 - s23;
-        d[i][2] = d01 - d23;
-        d[i][3] = d01 + d23;
+        s01 = tmp[i*4+0] + tmp[i*4+1];
+        d01 = tmp[i*4+0] - tmp[i*4+1];
+        s23 = tmp[i*4+2] + tmp[i*4+3];
+        d23 = tmp[i*4+2] - tmp[i*4+3];
+
+        d[i*4+0] = s01 + s23;
+        d[i*4+1] = s01 - s23;
+        d[i*4+2] = d01 - d23;
+        d[i*4+3] = d01 + d23;
     }
 }

@@ -122,42 +118,42 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
     }
 }

-static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
 {
-    int16_t d[4][4];
-    int16_t tmp[4][4];
+    int16_t d[16];
+    int16_t tmp[16];
     int i;

-    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

     for( i = 0; i < 4; i++ )
     {
-        const int s03 = d[i][0] + d[i][3];
-        const int s12 = d[i][1] + d[i][2];
-        const int d03 = d[i][0] - d[i][3];
-        const int d12 = d[i][1] - d[i][2];
-
-        tmp[0][i] = s03 + s12;
-        tmp[1][i] = 2*d03 + d12;
-        tmp[2][i] = s03 - s12;
-        tmp[3][i] = d03 - 2*d12;
+        const int s03 = d[i*4+0] + d[i*4+3];
+        const int s12 = d[i*4+1] + d[i*4+2];
+        const int d03 = d[i*4+0] - d[i*4+3];
+        const int d12 = d[i*4+1] - d[i*4+2];
+
+        tmp[0*4+i] = s03 + s12;
+        tmp[1*4+i] = 2*d03 + d12;
+        tmp[2*4+i] = s03 - s12;
+        tmp[3*4+i] = d03 - 2*d12;
     }

     for( i = 0; i < 4; i++ )
     {
-        const int s03 = tmp[i][0] + tmp[i][3];
-        const int s12 = tmp[i][1] + tmp[i][2];
-        const int d03 = tmp[i][0] - tmp[i][3];
-        const int d12 = tmp[i][1] - tmp[i][2];
-
-        dct[i][0] = s03 + s12;
-        dct[i][1] = 2*d03 + d12;
-        dct[i][2] = s03 - s12;
-        dct[i][3] = d03 - 2*d12;
+        const int s03 = tmp[i*4+0] + tmp[i*4+3];
+        const int s12 = tmp[i*4+1] + tmp[i*4+2];
+        const int d03 = tmp[i*4+0] - tmp[i*4+3];
+        const int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+        dct[i*4+0] = s03 + s12;
+        dct[i*4+1] = 2*d03 + d12;
+        dct[i*4+2] = s03 - s12;
+        dct[i*4+3] = d03 - 2*d12;
     }
 }

-static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
 {
     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
@@ -165,7 +161,7 @@ static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 }

-static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
 {
     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
@@ -175,72 +171,68 @@ static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 {
-    int16_t d[4][4];
+    int16_t d[16];
     int sum = 0;

-    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

-    sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
-    sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
-    sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
-    sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
+    sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
+    sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];

     return sum;
 }

-static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
 {
-    dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
-    dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
-    dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
-    dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
+    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
+    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
+    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 }

-static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
+static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
 {
-    int16_t d[4][4];
-    int16_t tmp[4][4];
+    int16_t d[16];
+    int16_t tmp[16];
     int x, y;
     int i;

     for( i = 0; i < 4; i++ )
     {
-        const int s02 = dct[0][i] + dct[2][i];
-        const int d02 = dct[0][i] - dct[2][i];
-        const int s13 = dct[1][i] + (dct[3][i]>>1);
-        const int d13 = (dct[1][i]>>1) - dct[3][i];
-
-        tmp[i][0] = s02 + s13;
-        tmp[i][1] = d02 + d13;
-        tmp[i][2] = d02 - d13;
-        tmp[i][3] = s02 - s13;
+        const int s02 = dct[0*4+i] + dct[2*4+i];
+        const int d02 = dct[0*4+i] - dct[2*4+i];
+        const int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
+        const int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
+
+        tmp[i*4+0] = s02 + s13;
+        tmp[i*4+1] = d02 + d13;
+        tmp[i*4+2] = d02 - d13;
+        tmp[i*4+3] = s02 - s13;
     }

     for( i = 0; i < 4; i++ )
     {
-        const int s02 = tmp[0][i] + tmp[2][i];
-        const int d02 = tmp[0][i] - tmp[2][i];
-        const int s13 = tmp[1][i] + (tmp[3][i]>>1);
-        const int d13 = (tmp[1][i]>>1) - tmp[3][i];
-
-        d[0][i] = ( s02 + s13 + 32 ) >> 6;
-        d[1][i] = ( d02 + d13 + 32 ) >> 6;
-        d[2][i] = ( d02 - d13 + 32 ) >> 6;
-        d[3][i] = ( s02 - s13 + 32 ) >> 6;
+        const int s02 = tmp[0*4+i] + tmp[2*4+i];
+        const int d02 = tmp[0*4+i] - tmp[2*4+i];
+        const int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
+        const int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
+
+        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
+        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
+        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
+        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
     }

     for( y = 0; y < 4; y++ )
     {
         for( x = 0; x < 4; x++ )
-        {
-            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
-        }
+            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
         p_dst += FDEC_STRIDE;
     }
 }

-static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
+static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
 {
     add4x4_idct( &p_dst[0], dct[0] );
     add4x4_idct( &p_dst[4], dct[1] );
@@ -248,7 +240,7 @@ static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 }

-static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
+static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
 {
     add8x8_idct( &p_dst[0], &dct[0] );
     add8x8_idct( &p_dst[8], &dct[4] );
@@ -287,29 +279,29 @@ static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
     DST(7) = (a4>>2) - a7 ;\
 }

-static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
 {
     int i;
-    int16_t tmp[8][8];
+    int16_t tmp[64];

-    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

-#define SRC(x) tmp[x][i]
-#define DST(x) tmp[x][i]
+#define SRC(x) tmp[x*8+i]
+#define DST(x) tmp[x*8+i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST

-#define SRC(x) tmp[i][x]
-#define DST(x) dct[x][i]
+#define SRC(x) tmp[i*8+x]
+#define DST(x) dct[x*8+i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST
 }

-static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 {
     sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
     sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
@@ -344,20 +336,20 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
     DST(7, b0 - b7);\
 }

-static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
 {
     int i;

-    dct[0][0] += 32; // rounding for the >>6 at the end
+    dct[0] += 32; // rounding for the >>6 at the end

-#define SRC(x)     dct[x][i]
-#define DST(x,rhs) dct[x][i] = (rhs)
+#define SRC(x)     dct[x*8+i]
+#define DST(x,rhs) dct[x*8+i] = (rhs)
     for( i = 0; i < 8; i++ )
         IDCT8_1D
 #undef SRC
 #undef DST

-#define SRC(x)     dct[i][x]
+#define SRC(x)     dct[i*8+x]
 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
     for( i = 0; i < 8; i++ )
         IDCT8_1D
@@ -365,7 +357,7 @@ static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 #undef SRC
 #undef DST
 }

-static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
 {
     add8x8_idct8( &dst[0], dct[0] );
     add8x8_idct8( &dst[8], dct[1] );
@@ -386,23 +378,23 @@ static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
     }
 }

-static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
+static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
 {
-    add4x4_idct_dc( &p_dst[0], dct[0][0] );
-    add4x4_idct_dc( &p_dst[4], dct[0][1] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
+    add4x4_idct_dc( &p_dst[0], dct[0] );
+    add4x4_idct_dc( &p_dst[4], dct[1] );
+    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 }

-static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
+static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
 {
     int i;
-    for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
+    for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
     {
-        add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
-        add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
-        add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
-        add4x4_idct_dc( &p_dst[12], dct[i][3] );
+        add4x4_idct_dc( &p_dst[ 0], dct[0] );
+        add4x4_idct_dc( &p_dst[ 4], dct[1] );
+        add4x4_idct_dc( &p_dst[ 8], dct[2] );
+        add4x4_idct_dc( &p_dst[12], dct[3] );
     }
 }

@@ -545,8 +537,7 @@ void x264_dct_init_weights( void )
 }


-// gcc pessimizes multi-dimensional arrays here, even with constant indices
-#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
+#define ZIG(i,y,x) level[i] = dct[x*8+y];
 #define ZIGZAG8_FRAME\
     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
@@ -595,32 +586,32 @@ void x264_dct_init_weights( void )
     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)

-static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
 {
     ZIGZAG8_FRAME
 }

-static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
 {
     ZIGZAG8_FIELD
 }

 #undef ZIG
-#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
+#define ZIG(i,y,x) level[i] = dct[x*4+y];
 #define ZIGDC(i,y,x) ZIG(i,y,x)

-static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 {
     ZIGZAG4_FRAME
 }

-static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 {
     *(uint32_t*)level = *(uint32_t*)dct;
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
-    *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
-    *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
+    *(uint32_t*)(level+6) = *(uint32_t*)(dct+6);
+    *(uint64_t*)(level+8) = *(uint64_t*)(dct+8);
+    *(uint64_t*)(level+12) = *(uint64_t*)(dct+12);
 }

 #undef ZIG
diff --git a/common/dct.h b/common/dct.h
index 14547aa0..6f282b95 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -91,33 +91,33 @@ typedef struct
     // pix1 stride = FENC_STRIDE
     // pix2 stride = FDEC_STRIDE
     // p_dst stride = FDEC_STRIDE
-    void (*sub4x4_dct) ( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] );
+    void (*sub4x4_dct) ( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+    void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[16] );

-    void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*sub8x8_dct_dc)( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
-    void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
-    void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
+    void (*sub8x8_dct) ( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+    void (*sub8x8_dct_dc)( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][16] );
+    void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[4] );

-    void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
-    void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
+    void (*sub16x16_dct) ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][16] );
+    void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[16] );

-    void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-    void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
+    void (*sub8x8_dct8) ( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[64] );

-    void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-    void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
+    void (*sub16x16_dct8) ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][64] );

-    void (*dct4x4dc) ( int16_t d[4][4] );
-    void (*idct4x4dc)( int16_t d[4][4] );
+    void (*dct4x4dc) ( int16_t d[16] );
+    void (*idct4x4dc)( int16_t d[16] );

 } x264_dct_function_t;

 typedef struct
 {
-    void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
-    void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
+    void (*scan_8x8)( int16_t level[64], int16_t dct[64] );
+    void (*scan_4x4)( int16_t level[16], int16_t dct[16] );
     int (*sub_8x8) ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
     int (*sub_4x4) ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
     int (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
diff --git a/common/quant.c b/common/quant.c
index 263fb7c1..096a4b34 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -42,141 +42,101 @@
         nz |= (coef); \
 }

-static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 {
     int i, nz = 0;
     for( i = 0; i < 64; i++ )
-        QUANT_ONE( dct[0][i], mf[i], bias[i] );
+        QUANT_ONE( dct[i], mf[i], bias[i] );
     return !!nz;
 }

-static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 {
     int i, nz = 0;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], mf[i], bias[i] );
+        QUANT_ONE( dct[i], mf[i], bias[i] );
     return !!nz;
 }

-static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[16], int mf, int bias )
 {
     int i, nz = 0;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], mf, bias );
+        QUANT_ONE( dct[i], mf, bias );
     return !!nz;
 }

-static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[4], int mf, int bias )
 {
     int nz = 0;
-    QUANT_ONE( dct[0][0], mf, bias );
-    QUANT_ONE( dct[0][1], mf, bias );
-    QUANT_ONE( dct[0][2], mf, bias );
-    QUANT_ONE( dct[0][3], mf, bias );
+    QUANT_ONE( dct[0], mf, bias );
+    QUANT_ONE( dct[1], mf, bias );
+    QUANT_ONE( dct[2], mf, bias );
+    QUANT_ONE( dct[3], mf, bias );
     return !!nz;
 }

 #define DEQUANT_SHL( x ) \
-    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) << i_qbits
+    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits

 #define DEQUANT_SHR( x ) \
-    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)
+    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)

-static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 {
     const int i_mf = i_qp%6;
     const int i_qbits = i_qp/6 - 4;
-    int y;
+    int i;

     if( i_qbits >= 0 )
     {
-        for( y = 0; y < 4; y++ )
-        {
-            DEQUANT_SHL( 0 );
-            DEQUANT_SHL( 1 );
-            DEQUANT_SHL( 2 );
-            DEQUANT_SHL( 3 );
-        }
+        for( i = 0; i < 16; i++ )
+            DEQUANT_SHL( i );
     }
     else
     {
         const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 4; y++ )
-        {
-            DEQUANT_SHR( 0 );
-            DEQUANT_SHR( 1 );
-            DEQUANT_SHR( 2 );
-            DEQUANT_SHR( 3 );
-        }
+        for( i = 0; i < 16; i++ )
+            DEQUANT_SHR( i );
     }
 }

-static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+static void dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
 {
     const int i_mf = i_qp%6;
     const int i_qbits = i_qp/6 - 6;
-    int y;
+    int i;

     if( i_qbits >= 0 )
     {
-        for( y = 0; y < 8; y++ )
-        {
-            DEQUANT_SHL( 0 );
-            DEQUANT_SHL( 1 );
-            DEQUANT_SHL( 2 );
-            DEQUANT_SHL( 3 );
-            DEQUANT_SHL( 4 );
-            DEQUANT_SHL( 5 );
-            DEQUANT_SHL( 6 );
-            DEQUANT_SHL( 7 );
-        }
+        for( i = 0; i < 64; i++ )
+            DEQUANT_SHL( i );
     }
     else
     {
         const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 8; y++ )
-        {
-            DEQUANT_SHR( 0 );
-            DEQUANT_SHR( 1 );
-            DEQUANT_SHR( 2 );
-            DEQUANT_SHR( 3 );
-            DEQUANT_SHR( 4 );
-            DEQUANT_SHR( 5 );
-            DEQUANT_SHR( 6 );
-            DEQUANT_SHR( 7 );
-        }
+        for( i = 0; i < 64; i++ )
+            DEQUANT_SHR( i );
     }
 }

-static void dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 {
     const int i_qbits = i_qp/6 - 6;
-    int y;
+    int i;

     if( i_qbits >= 0 )
     {
-        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] *= i_dmf;
-            dct[y][1] *= i_dmf;
-            dct[y][2] *= i_dmf;
-            dct[y][3] *= i_dmf;
-        }
+        const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
+        for( i = 0; i < 16; i++ )
+            dct[i] *= i_dmf;
     }
     else
     {
-        const int i_dmf = dequant_mf[i_qp%6][0][0];
+        const int i_dmf = dequant_mf[i_qp%6][0];
         const int f = 1 << (-i_qbits-1);
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
-            dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
-            dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
-            dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
-        }
+        for( i = 0; i < 16; i++ )
+            dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
     }
 }
diff --git a/common/quant.h b/common/quant.h
index b8a7b988..1cfe95d1 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -25,14 +25,14 @@

 typedef struct
 {
-    int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-    int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-    int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
-    int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
+    int (*quant_8x8)( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+    int (*quant_4x4)( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+    int (*quant_4x4_dc)( int16_t dct[16], int mf, int bias );
+    int (*quant_2x2_dc)( int16_t dct[4], int mf, int bias );

-    void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-    void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-    void (*dequant_4x4_dc)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+    void (*dequant_8x8)( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+    void (*dequant_4x4)( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+    void (*dequant_4x4_dc)( int16_t dct[16], int dequant_mf[6][16], int i_qp );

     void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
diff --git a/common/set.c b/common/set.c
index 1f53cf38..24970be1 100644
--- a/common/set.c
+++ b/common/set.c
@@ -71,8 +71,8 @@ int x264_cqm_init( x264_t *h )
     int def_quant8[6][64];
     int def_dequant4[6][16];
     int def_dequant8[6][64];
-    int quant4_mf[4][6][4][4];
-    int quant8_mf[2][6][8][8];
+    int quant4_mf[4][6][16];
+    int quant8_mf[2][6][64];
     int q, i, j, i_list;
     int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
                         32 - h->param.analyse.i_luma_deadzone[0],
@@ -130,14 +130,14 @@ int x264_cqm_init( x264_t *h )
         for( i_list = 0; i_list < 4; i_list++ )
             for( i = 0; i < 16; i++ )
             {
-                h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
-                     quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
+                h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
+                     quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
             }
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
             {
-                h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
-                     quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
+                h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
+                     quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
             }
     }
     for( q = 0; q < 52; q++ )
@@ -145,8 +145,8 @@ int x264_cqm_init( x264_t *h )
         for( i_list = 0; i_list < 4; i_list++ )
             for( i = 0; i < 16; i++ )
             {
-                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i];
-                h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1);
+                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
+                h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
                 // round to nearest, unless that would cause the deadzone to be negative
                 h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
@@ -158,8 +158,8 @@ int x264_cqm_init( x264_t *h )
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
             {
-                h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i];
-                h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6);
+                h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
+                h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
                 h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err )
                     max_qp_err = q;
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 9f6ed8d4..e537d624 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -24,51 +24,51 @@
 #ifndef X264_I386_DCT_H
 #define X264_I386_DCT_H

-void x264_sub4x4_dct_mmx ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );

-void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
-void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct[ 4][4] );
-void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
+void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_sse2 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
+void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );

-void x264_dct4x4dc_mmx ( int16_t d[4][4] );
-void x264_idct4x4dc_mmx ( int16_t d[4][4] );
+void x264_dct4x4dc_mmx ( int16_t d[16] );
+void x264_idct4x4dc_mmx ( int16_t d[16] );

-void x264_sub8x8_dct8_mmx ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_ssse3 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_ssse3( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );

-void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
+void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
+void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
+void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );

-void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
 int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
 int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
diff --git a/common/x86/quant.h b/common/x86/quant.h
index dff60a85..4e42b812 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -24,30 +24,30 @@
 #ifndef X264_I386_QUANT_H
 #define X264_I386_QUANT_H

-int x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse4( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse4( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+int x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_ssse3( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_ssse3( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_ssse3( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_ssse3( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse4( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse4( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 74e72bf6..d137c1af 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -592,8 +592,8 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
 /* For trellis=2, we need to do this for both sizes of DCT, for
    trellis=1 we only need to use it on the chosen mode. */
 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 {
-    ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
-    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
+    ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
     int i;
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index ccee06a8..fb1e1d6f 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -27,8 +27,8 @@

 /* These chroma DC functions don't have assembly versions and are only used here. */

-#define ZIG(i,y,x) level[i] = dct[x][y];
-static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
+#define ZIG(i,y,x) level[i] = dct[x*2+y];
+static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
 {
     ZIG(0,0,0)
     ZIG(1,0,1)
@@ -38,11 +38,11 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
 #undef ZIG

 #define IDCT_DEQUANT_START \
-    int d0 = dct[0][0] + dct[0][1]; \
-    int d1 = dct[1][0] + dct[1][1]; \
-    int d2 = dct[0][0] - dct[0][1]; \
-    int d3 = dct[1][0] - dct[1][1]; \
-    int dmf = dequant_mf[i_qp%6][0][0]; \
+    int d0 = dct[0] + dct[1]; \
+    int d1 = dct[2] + dct[3]; \
+    int d2 = dct[0] - dct[1]; \
+    int d3 = dct[2] - dct[3]; \
+    int dmf = dequant_mf[i_qp%6][0]; \
     int qbits = i_qp/6 - 5; \
     if( qbits > 0 ) \
     { \
@@ -50,53 +50,53 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
         qbits = 0; \
     }

-static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
-    dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
-    dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
-    dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
+    dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
+    dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
+    dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
+    dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
 }

-static inline void idct_dequant_2x2_dconly( int16_t out[2][2], int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    out[0][0] = (d0 + d1) * dmf >> -qbits;
-    out[0][1] = (d0 - d1) * dmf >> -qbits;
-    out[1][0] = (d2 + d3) * dmf >> -qbits;
-    out[1][1] = (d2 - d3) * dmf >> -qbits;
+    out[0] = (d0 + d1) * dmf >> -qbits;
+    out[1] = (d0 - d1) * dmf >> -qbits;
+    out[2] = (d2 + d3) * dmf >> -qbits;
+    out[3] = (d2 - d3) * dmf >> -qbits;
 }

-static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
+static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
 {
-    int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
-    int d1 = dct4x4[2][0][0] + dct4x4[3][0][0];
-    int d2 = dct4x4[0][0][0] - dct4x4[1][0][0];
-    int d3 = dct4x4[2][0][0] - dct4x4[3][0][0];
-    d[0][0] = d0 + d1;
-    d[1][0] = d2 + d3;
-    d[0][1] = d0 - d1;
-    d[1][1] = d2 - d3;
-    dct4x4[0][0][0] = 0;
-    dct4x4[1][0][0] = 0;
-    dct4x4[2][0][0] = 0;
-    dct4x4[3][0][0] = 0;
+    int d0 = dct4x4[0][0] + dct4x4[1][0];
+    int d1 = dct4x4[2][0] + dct4x4[3][0];
+    int d2 = dct4x4[0][0] - dct4x4[1][0];
+    int d3 = dct4x4[2][0] - dct4x4[3][0];
+    d[0] = d0 + d1;
+    d[2] = d2 + d3;
+    d[1] = d0 - d1;
+    d[3] = d2 - d3;
+    dct4x4[0][0] = 0;
+    dct4x4[1][0] = 0;
+    dct4x4[2][0] = 0;
+    dct4x4[3][0] = 0;
 }

-static inline void dct2x2dc_dconly( int16_t d[2][2] )
+static inline void dct2x2dc_dconly( int16_t d[4] )
 {
-    int d0 = d[0][0] + d[0][1];
-    int d1 = d[1][0] + d[1][1];
-    int d2 = d[0][0] - d[0][1];
-    int d3 = d[1][0] - d[1][1];
-    d[0][0] = d0 + d1;
-    d[1][0] = d2 + d3;
-    d[0][1] = d0 - d1;
-    d[1][1] = d2 - d3;
+    int d0 = d[0] + d[1];
+    int d1 = d[2] + d[3];
+    int d2 = d[0] - d[1];
+    int d3 = d[2] - d[3];
+    d[0] = d0 + d1;
+    d[2] = d2 + d3;
+    d[1] = d0 - d1;
+    d[3] = d2 - d3;
 }

-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
     if( h->mb.b_trellis )
@@ -105,7 +105,7 @@ static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp,
     return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 }

-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[64], int i_qp, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
     if( h->mb.b_trellis )
@@ -130,7 +130,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
     int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
-    ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );

     if( h->mb.b_lossless )
     {
@@ -166,7 +166,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
     int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
+    ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );

     if( h->mb.b_lossless )
     {
@@ -196,8 +196,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     uint8_t *p_src = h->mb.pic.p_fenc[0];
     uint8_t *p_dst = h->mb.pic.p_fdec[0];

-    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
-    ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[4],[4] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
+    ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );

     int i, nz;
     int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
@@ -209,7 +209,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         {
             int oe = block_idx_xy_fenc[i];
             int od = block_idx_xy_fdec[i];
-            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[0][block_idx_yx_1d[i]] );
+            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
             h->mb.i_cbp_luma |= nz;
         }
@@ -224,8 +224,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
-        dct4x4[i][0][0] = 0;
+        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
+        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
@@ -252,7 +252,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )

     h->dctf.dct4x4dc( dct_dc4x4 );
     if( h->mb.b_trellis )
-        nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
+        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
     else
         nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
@@ -266,7 +266,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
         if( h->mb.i_cbp_luma )
             for( i = 0; i < 16; i++ )
-                dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
+                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
     }

     /* put pixels to fdec */
@@ -276,59 +276,59 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
 }

-static inline int idct_dequant_round_2x2_dc( int16_t ref[2][2], int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+static inline int idct_dequant_round_2x2_dc( int16_t ref[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
 {
-    int16_t out[2][2];
+    int16_t out[4];
     idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
-    return ((ref[0][0] ^ (out[0][0]+32))
-          | (ref[0][1] ^ (out[0][1]+32))
-          | (ref[1][0] ^ (out[1][0]+32))
-          | (ref[1][1] ^ (out[1][1]+32))) >> 6;
+    return ((ref[0] ^ (out[0]+32))
+          | (ref[1] ^ (out[1]+32))
+          | (ref[2] ^ (out[2]+32))
+          | (ref[3] ^ (out[3]+32))) >> 6;
 }

 /* Round down coefficients losslessly in DC-only chroma blocks.
  * Unlike luma blocks, this can't be done with a lookup table or
  * other shortcut technique because of the interdependencies
  * between the coefficients due to the chroma DC transform. */
-static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[2][2] )
+static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[4] )
 {
-    int16_t dct2x2_orig[2][2];
+    int16_t dct2x2_orig[4];
     int coeff;
     int nz = 0;

     /* If the QP is too high, there's no benefit to rounding optimization. */
-    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0][0] << (i_qp/6) > 32*64 )
+    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
         return 1;

     idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
-    dct2x2_orig[0][0] += 32;
-    dct2x2_orig[0][1] += 32;
-    dct2x2_orig[1][0] += 32;
-    dct2x2_orig[1][1] += 32;
+    dct2x2_orig[0] += 32;
+    dct2x2_orig[1] += 32;
+    dct2x2_orig[2] += 32;
+    dct2x2_orig[3] += 32;

     /* If the DC coefficients already round to zero, terminate early. */
-    if( !((dct2x2_orig[0][0]|dct2x2_orig[0][1]|dct2x2_orig[1][0]|dct2x2_orig[1][1])>>6) )
+    if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
         return 0;

     /* Start with the highest frequency coefficient... is this the best option? */
     for( coeff = 3; coeff >= 0; coeff-- )
     {
-        int sign = dct2x2[0][coeff] < 0 ? -1 : 1;
-        int level = dct2x2[0][coeff];
+        int sign = dct2x2[coeff] < 0 ? -1 : 1;
+        int level = dct2x2[coeff];

         if( !level )
             continue;

         while( level )
         {
-            dct2x2[0][coeff] = level - sign;
+            dct2x2[coeff] = level - sign;
             if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
                 break;
             level -= sign;
         }

         nz |= level;
-        dct2x2[0][coeff] = level;
+        dct2x2[coeff] = level;
     }

     return !!nz;
@@ -338,7 +338,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
     int i, ch, nz, nz_dc;
     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
-    ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
+    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
     h->mb.i_cbp_chroma = 0;

     /* Early termination: check variance of chroma residual before encoding.
@@ -369,7 +369,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
             dct2x2dc_dconly( dct2x2 );
             if( h->mb.b_trellis )
-                nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+                nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
             else
                 nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
@@ -397,7 +397,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         int i_decimate_score = 0;
         int nz_ac = 0;

-        ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
+        ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );

         if( h->mb.b_lossless )
         {
@@ -434,7 +434,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         }

         if( h->mb.b_trellis )
-            nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
         else
             nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

@@ -740,7 +740,7 @@ void x264_macroblock_encode( x264_t *h )
         }
         else if( h->mb.b_transform_8x8 )
         {
-            ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
+            ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
             h->nr_count[1] += h->mb.b_noise_reduction * 4;
@@ -748,7 +748,7 @@ void x264_macroblock_encode( x264_t *h )
             for( idx = 0; idx < 4; idx++ )
             {
                 if( h->mb.b_noise_reduction )
-                    h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
+                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                 nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                 if( nz )
@@ -791,7 +791,7 @@ void x264_macroblock_encode( x264_t *h )
         }
         else
         {
-            ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
+            ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
             h->nr_count[0] += h->mb.b_noise_reduction * 16;
@@ -806,7 +806,7 @@ void x264_macroblock_encode( x264_t *h )
                     idx = i8x8 * 4 + i4x4;

                     if( h->mb.b_noise_reduction )
-                        h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                     nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                     h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
@@ -910,8 +910,8 @@ void x264_macroblock_encode( x264_t *h )
 *****************************************************************************/
 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 {
-    ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
-    ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
+    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
     ALIGNED_ARRAY_16( int16_t, dctscan,[16] );

     int i_qp = h->mb.i_qp;
@@ -1078,7 +1078,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             if( h->mb.b_transform_8x8 )
             {
-                ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
+                ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
                 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                 nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
                 if( nnz8x8 )
@@ -1104,7 +1104,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             {
                 int i4;
                 int i_decimate_8x8 = 0;
-                ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
+                ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
                 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                 for( i4 = 0; i4 < 4; i4++ )
                 {
@@ -1133,12 +1133,12 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )

         for( ch = 0; ch < 2; ch++ )
         {
-            ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
+            ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
             p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;

             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-            dct4x4[0][0] = 0;
+            dct4x4[0] = 0;

             if( h->mb.b_trellis )
                 nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
@@ -1181,7 +1181,7 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
     }
     else
     {
-        ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
+        ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
         nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
         h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 94545779..36cd0e0e 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -57,9 +57,9 @@ void x264_cabac_mb_skip( x264_t *h, int b_skip );

 int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma );
-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx );
-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                             int i_qp, int b_intra, int idx );

 void x264_noise_reduction_update( x264_t *h );
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 8abce17c..2fb2fd4e 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -609,27 +609,27 @@ const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
 {
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
         i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
 }

-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
         i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
 }

-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                             int i_qp, int b_intra, int idx )
 {
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ddb55176..45c0c202 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -500,11 +500,11 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_dct_function_t dct_asm;
     x264_quant_function_t qf;
     int ret = 0, ok, used_asm, i, j, interlace;
-    ALIGNED_16( int16_t dct1[16][4][4] );
-    ALIGNED_16( int16_t dct2[16][4][4] );
-    ALIGNED_16( int16_t dct4[16][4][4] );
-    ALIGNED_16( int16_t dct8[4][8][8] );
-    ALIGNED_8( int16_t dctdc[2][2][2] );
+    ALIGNED_16( int16_t dct1[16][16] );
+    ALIGNED_16( int16_t dct2[16][16] );
+    ALIGNED_16( int16_t dct4[16][16] );
+    ALIGNED_16( int16_t dct8[4][64] );
+    ALIGNED_8( int16_t dctdc[2][4] );
     x264_t h_buf;
     x264_t *h = &h_buf;

@@ -608,9 +608,9 @@ static int check_dct( int cpu_ref, int cpu_new )
         for( i=0; i<16 && ok; i++ )\
         {\
             for( j=0; j<16; j++ )\
-                dct1[0][0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
-                              : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
-                              : ((*p++)&0x1fff)-0x1000; /* general case */\
+                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
+                           : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+                           : ((*p++)&0x1fff)-0x1000; /* general case */\
             memcpy( dct2, dct1, 32 );\
             call_c1( dct_c.name, dct1[0] );\
             call_a1( dct_asm.name, dct2[0] );\
@@ -741,7 +741,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     report( "zigzag_field :" );

     ok = 1; used_asm = 0;
-    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 );
     report( "zigzag_interleave :" );
 #undef TEST_ZIGZAG_SCAN
 #undef TEST_ZIGZAG_SUB
@@ -1118,25 +1118,21 @@ static int check_quant( int cpu_ref, int cpu_new )
 #define INIT_QUANT8() \
 { \
     static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
-    int x, y; \
-    for( y = 0; y < 8; y++ ) \
-        for( x = 0; x < 8; x++ ) \
-        { \
-            unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
-            dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \
-        } \
+    for( i = 0; i < 64; i++ ) \
+    { \
+        unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \
+        dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+    } \
 }

 #define INIT_QUANT4() \
 { \
     static const int scale1d[4] = {4,6,4,6}; \
-    int x, y; \
-    for( y = 0; y < 4; y++ ) \
-        for( x = 0; x < 4; x++ ) \
-        { \
-            unsigned int scale = 255*scale1d[y]*scale1d[x]; \
-            dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \
-        } \
+    for( i = 0; i < 16; i++ ) \
+    { \
+        unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \
+        dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+    } \
 }

 #define TEST_QUANT_DC( name, cqm ) \
@@ -1151,16 +1147,16 @@ static int check_quant( int cpu_ref, int cpu_new )
                 int result_c, result_a; \
                 for( i = 0; i < 16; i++ ) \
                     dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
-                result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                 if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
                 { \
                     oks[0] = 0; \
                     fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
                     break; \
                 } \
-                call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
             } \
         } \
     }
@@ -1176,16 +1172,16 @@ static int check_quant( int cpu_ref, int cpu_new )
             { \
                 int result_c, result_a; \
                 INIT_QUANT##w() \
-                result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                 if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
                 { \
                     oks[0] = 0; \
                     fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                     break; \
                 } \
-                call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
             } \
         } \
     }
@@ -1206,18 +1202,18 @@ static int check_quant( int cpu_ref, int cpu_new )
         for( qp = 51; qp > 0; qp-- ) \
         { \
             INIT_QUANT##w() \
-            call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+            call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
             memcpy( dct2, dct1, w*w*2 ); \
-            call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-            call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+            call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+            call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
             if( memcmp( dct1, dct2, w*w*2 ) ) \
             { \
                 oks[1] = 0; \
                 fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                 break; \
             } \
-            call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-            call_a2( qf_a.dqname, (void*)dct2,
h->dequant##w##_mf[block], qp ); \ } \ } @@ -1235,17 +1231,17 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ for( i = 0; i < 16; i++ ) \ dct1[i] = rand(); \ - call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ + call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ memcpy( dct2, dct1, w*w*2 ); \ - call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \ - call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \ + call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ + call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ if( memcmp( dct1, dct2, w*w*2 ) ) \ { \ oks[1] = 0; \ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ } \ - call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \ - call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \ + call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ + call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ } \ } @@ -1293,8 +1289,8 @@ static int check_quant( int cpu_ref, int cpu_new ) dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \ if( ac ) \ dct1[0] = 0; \ - result_c = call_c( qf_c.decname, (void*)dct1 ); \ - result_a = call_a( qf_a.decname, (void*)dct1 ); \ + result_c = call_c( qf_c.decname, dct1 ); \ + result_a = call_a( qf_a.decname, dct1 ); \ if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \ { \ ok = 0; \ @@ -1324,8 +1320,8 @@ static int check_quant( int cpu_ref, int cpu_new ) nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ dct1[ac] = 1; \ - result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \ - result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \ + result_c = call_c( qf_c.last, dct1+ac ); \ + result_a = call_a( qf_a.last, dct1+ac ); \ if( result_c != result_a ) \ { \ ok = 0; \ @@ -1359,8 +1355,8 @@ static int check_quant( int cpu_ref, int cpu_new ) nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ dct1[ac] = 1; \ - result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \ - result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \ + result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \ + result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
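
Note: the (int16_t*)dct casts removed throughout this patch all served one purpose: viewing a [4][4] or [8][8] coefficient array as one flat run of int16_t. A minimal standalone sketch of why that view is fragile and why the 1-D declarations make it well-defined; the function names below are hypothetical and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Pre-patch shape: a [4][4] array viewed through a flat pointer.  C only
 * defines pointer arithmetic within a single array object, so once i goes
 * past 3 the access runs off the end of row d[0]; an optimizer is entitled
 * to assume that never happens, which is how this pattern can miscompile. */
static int sum_2d( int16_t d[4][4] )
{
    int16_t *p = (int16_t*)d;
    int s = 0;
    for( int i = 0; i < 16; i++ )
        s += p[i];       /* not guaranteed by the standard once i > 3 */
    return s;
}

/* Post-patch shape: the same 16 coefficients declared as one array.  The
 * flat loop is now well-defined, and call sites pass dct with no cast. */
static int sum_1d( int16_t d[16] )
{
    int s = 0;
    for( int i = 0; i < 16; i++ )
        s += d[i];
    return s;
}

int main( void )
{
    int16_t a[4][4] = {{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}};
    int16_t b[16]   = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    printf( "%d %d\n", sum_2d( a ), sum_1d( b ) );   /* both print 136 */
    return 0;
}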
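
The INIT_QUANT4/INIT_QUANT8 rewrites in tools/checkasm.c lean on the inverse mapping: recovering (row, column) from one flat index with shifts and masks instead of nested x/y loops. A quick standalone self-check (not in the tree) of the identity they rely on:

#include <assert.h>
#include <stdio.h>

int main( void )
{
    /* 8x8: for i = y*8 + x with 0 <= x < 8, i>>3 recovers y and i&7
     * recovers x, so scale1d[i>>3]*scale1d[i&7] in the new INIT_QUANT8
     * visits exactly the (x,y) pairs the old nested loops did. */
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            assert( ((y*8+x) >> 3) == y && ((y*8+x) & 7) == x );

    /* 4x4: the same identity with a stride of 4 (i>>2 and i&3),
     * matching the new INIT_QUANT4. */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            assert( ((y*4+x) >> 2) == y && ((y*4+x) & 3) == x );

    printf( "flat-index equivalence holds\n" );
    return 0;
}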