N: Francesco Corriga
D: VfW
+N: Gabriel Bouvigne
+E: gabriel.bouvigne AT joost DOT com
+D: 2pass VBV
+
N: Guillaume Poirier
E: gpoirier CHEZ mplayerhq POINT hu
D: Altivec optimizations
S: Brittany, France
+N: Fiona Glaser
+E: fiona AT x264 DOT com
+D: x86 asm, 1pass VBV, adaptive quantization, inline asm
+D: various speed optimizations, bugfixes
+S: USA
+
N: Justin Clay
E: justin_clay AT hotmail DOT com
C: wheatgerm
/* Three-way maximum, composed from two nested two-argument X264_MAX calls. */
#define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
/* Four-way minimum / maximum, composed from the three-argument forms.
 * X264_MIN3 and X264_MAX are defined outside this excerpt.
 * NOTE(review): if those are plain ternary macros, arguments with side
 * effects will be evaluated more than once -- confirm before passing any. */
#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
-#define XCHG(type,a,b) { type t = a; a = b; b = t; }
+#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0) /* swap a and b through a temporary of the given type; the do/while(0) wrapper makes the expansion a single statement, so "if(c) XCHG(...); else ..." parses as intended */
/* Convert f to 8.8 fixed point: scale by 1<<8 and add .5 before the
 * truncating cast to int.  The argument is parenthesized so that a
 * compound expression such as FIX8(a+b) is scaled as a whole rather
 * than only its last operand. */
#define FIX8(f) ((int)((f)*(1<<8)+.5))
#define CHECKED_MALLOC( var, size )\
{ 4, 6, 12, 14 },
{ 5, 7, 13, 15 }
};
+static const uint8_t block_idx_xy_1d[16] = /* 4x4 block index -> x + 4*y, where (x,y) are the block column/row that block_idx_xy_fenc encodes as 4*x + 4*y*FENC_STRIDE */
+{
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+}; /* NOTE(review): used as dct_dc4x4[0][block_idx_xy_1d[i]]; that equals the old [block_idx_y[i]][block_idx_x[i]] indexing (assuming 4 columns), but the lossless path previously used [block_idx_x[i]][block_idx_y[i]], i.e. 4*x + y -- confirm the transposition there is intended */
+static const uint8_t block_idx_xy_fenc[16] = /* 4x4 block index -> offset 4*x + 4*y*FENC_STRIDE (in uint8_t elements) of that block inside the fenc buffer */
+{ /* precomputed form of the 4*block_idx_x[idx] + 4*block_idx_y[idx]*FENC_STRIDE expressions that callers used to evaluate inline */
+ 0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
+ 0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
+ 2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
+ 2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
+ 0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
+ 0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
+ 2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
+ 2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
+}; /* NOTE(review): the largest entry is 12 + 12*FENC_STRIDE, which fits uint8_t only while FENC_STRIDE <= 20 -- confirm against the FENC_STRIDE definition */
+static const uint16_t block_idx_xy_fdec[16] = /* 4x4 block index -> offset 4*x + 4*y*FDEC_STRIDE (in uint8_t elements) of that block inside the fdec buffer */
+{ /* same (x,y) order as block_idx_xy_fenc, but scaled by FDEC_STRIDE; precomputed form of 4*block_idx_x[idx] + 4*block_idx_y[idx]*FDEC_STRIDE */
+ 0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
+ 0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
+ 2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
+ 2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
+ 0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
+ 0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
+ 2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
+ 2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
+}; /* NOTE(review): element type is uint16_t rather than uint8_t, presumably because 12 + 12*FDEC_STRIDE exceeds 255 -- confirm against the FDEC_STRIDE definition */
static const uint8_t i_chroma_qp_table[52] =
{
#define array_non_zero_count array_non_zero_count_mmx
static inline int array_non_zero_count_mmx( int16_t *v )
{
- static const uint64_t pw_2 = 0x0202020202020202ULL;
int count;
asm(
"pxor %%mm7, %%mm7 \n"
"movq (%1), %%mm0 \n"
- "movq 16(%1), %%mm1 \n"
- "packsswb 8(%1), %%mm0 \n"
+ "movq 8(%1), %%mm1 \n"
+ "packsswb 16(%1), %%mm0 \n"
"packsswb 24(%1), %%mm1 \n"
"pcmpeqb %%mm7, %%mm0 \n"
"pcmpeqb %%mm7, %%mm1 \n"
"paddb %%mm0, %%mm1 \n"
- "paddb %2, %%mm1 \n"
"psadbw %%mm7, %%mm1 \n"
"movd %%mm1, %0 \n"
:"=r"(count)
- :"r"(v), "m"(pw_2)
+ :"r"(v)
);
- return count;
+ return (count+0x10)&0xff;
}
#undef array_non_zero_int
#define array_non_zero_int array_non_zero_int_mmx
if( b_merged_satd && i_max == 9 )
{
- int satd[3];
+ int satd[9];
h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd );
- if( i_pred_mode < 3 )
- satd[i_pred_mode] -= 3 * a->i_lambda;
+ satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
{
int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
for( idx = 0;; idx++ )
{
- int x = block_idx_x[idx];
- int y = block_idx_y[idx];
- uint8_t *p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
- uint8_t *p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
+ uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
+ uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
if( b_merged_satd && i_max >= 6 )
{
- int satd[3];
+ int satd[9];
h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd );
- if( i_pred_mode < 3 )
- satd[i_pred_mode] -= 3 * a->i_lambda;
+ satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
a->i_predict4x4[idx], i );
int i_nnz = 0;
for( idx = 0; idx < 16; idx++ )
{
- uint8_t *p_src_by;
- uint8_t *p_dst_by;
+ uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
i_best = COST_MAX;
i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
- x = block_idx_x[idx];
- y = block_idx_y[idx];
- p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
- p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*****************************************************************************/
#include "common/common.h"
#include "macroblock.h"
-
#define ZIG(i,y,x) level[i] = dct[x][y];
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
{
- int x = 4 * block_idx_x[idx];
- int y = 4 * block_idx_y[idx];
- uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
+ uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
if( h->mb.b_lossless )
{
for( i = 0; i < 16; i++ )
{
- int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
- int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
+ int oe = block_idx_xy_fenc[i];
+ int od = block_idx_xy_fdec[i];
h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
- dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
+ dct_dc4x4[0][block_idx_xy_1d[i]] = h->dct.luma4x4[i][0];
h->dct.luma4x4[i][0] = 0;
}
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
- dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+ dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
- dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
+ dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
}
/* put pixels to fdec */
h->dctf.add16x16_idct( p_dst, dct4x4 );
for( i = 0; i < 4; i++ )
{
/* copy dc coeff */
- dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+ dct2x2[i>>1][i&1] = dct4x4[i][0][0];
dct4x4[i][0][0] = 0;
/* no trellis; it doesn't seem to help chroma noticeably */
for( i = 0; i < 4; i++ )
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
}
-
- for( i = 0; i < 4; i++ )
- dct4x4[i][0][0] = dct2x2[0][i];
+ dct4x4[0][0][0] = dct2x2[0][0];
+ dct4x4[1][0][0] = dct2x2[0][1];
+ dct4x4[2][0][0] = dct2x2[1][0];
+ dct4x4[3][0][0] = dct2x2[1][1];
h->dctf.add8x8_idct( p_dst, dct4x4 );
}
}
for( i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
{
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
+ uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
{
for( i4x4 = 0; i4x4 < 16; i4x4++ )
{
- int x = 4*block_idx_x[i4x4];
- int y = 4*block_idx_y[i4x4];
h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
- h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
- h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
+ h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
+ h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
}
}
else if( h->mb.b_transform_8x8 )
int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
COST_MV_HPEL( mx, my );
}
- i++;
- } while( i < i_mvc );
+ } while( ++i < i_mvc );
bmx = ( bpred_mx + 2 ) >> 2;
bmy = ( bpred_my + 2 ) >> 2;
COST_MV( bmx, bmy );
my = x264_clip3( my, mv_y_min, mv_y_max );
COST_MV( mx, my );
}
- i++;
- } while( i < i_mvc );
+ } while( ++i < i_mvc );
}
COST_MV( 0, 0 );
{
case X264_ME_DIA:
/* diamond search, radius 1 */
- for( i = 0; i < i_me_range; i++ )
+ i = 0;
+ do
{
DIA1_ITER( bmx, bmy );
if( (bmx == omx) & (bmy == omy) )
break;
if( !CHECK_MVRANGE(bmx, bmy) )
break;
- }
+ } while( ++i < i_me_range );
break;
case X264_ME_HEX:
/* hexagon grid */
omx = bmx; omy = bmy;
- for( i = 1; i <= i_me_range/4; i++ )
+
+ i = 1;
+ do
{
static const int hex4[16][2] = {
{-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i );
COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i );
}
- }
+ } while( ++i <= i_me_range/4 );
if( bmy <= mv_y_max )
goto me_hex2;
break;
* because sum(abs(diff)) >= abs(diff(sum)). */
const int stride = m->i_stride[0];
uint16_t *sums_base = m->integral;
- DECLARE_ALIGNED_16( static uint8_t zero[16*16] );
+ /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
+ * unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any
+ * SSE instructions and the only loss is a tiny bit of performance. */
+ DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
DECLARE_ALIGNED_16( int enc_dc[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
for( j=i; j<nmvsad; j++ )
if( mvsads[j].sad <= bsad )
- mvsads[i++] = mvsads[j];
+ {
+ /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
+ else
+ mvsads[i++] = mvsads[j];
+ }
nmvsad = i;
}
if( nmvsad > limit )
for( j=i+1; j<nmvsad; j++ )
COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
if( bj > i )
- XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+ {
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
+ else
+ XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+ }
}
nmvsad = limit;
}
BIME_CACHE(-(a),-(b))
#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] ) \
+if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
{ \
int cost; \
int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
- visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] = 1; \
+ visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
if( i_weight == 32 ) \
h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
int bm1y = m1->mv[1], om1y = bm1y;
int bcost = COST_MAX;
int pass = 0;
- uint8_t visited[8][8][8][8];
+ /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
+ uint8_t visited[8][8][8];
h->mc.memzero_aligned( visited, sizeof(visited) );
BIME_CACHE( 0, 0 );
if( satd <= bsatd * SATD_THRESH )\
{ \
int cost; \
- cache_mv[0] = cache_mv2[0] = mx; \
- cache_mv[1] = cache_mv2[1] = my; \
+ *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
p_cost_mvx = m->p_cost_mv - pmx;
p_cost_mvy = m->p_cost_mv - pmy;
COST_MV_SATD( bmx, bmy, bsatd );
- COST_MV_RD( bmx, bmy, 0, 0, 0);
+ COST_MV_RD( bmx, bmy, 0, 0, 0 );
/* check the predicted mv */
if( (bmx != pmx || bmy != pmy)