From bfa2eac7fdc92eaf27004ef66e93898ec27f61f1 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Thu, 15 May 2008 06:01:01 -0600
Subject: [PATCH] explicit write combining, because gcc fails at optimizing consecutive memory accesses

---
 common/common.h      |   6 +-
 common/macroblock.c  | 215 ++++++++++++++-----------------------
 common/macroblock.h  |  55 ++++++-----
 common/osdep.h       |   3 +
 encoder/analyse.c    | 149 ++++++++++++++----------------
 encoder/cabac.c      |   4 +-
 encoder/cavlc.c      |   6 +-
 encoder/macroblock.c |  10 +-
 encoder/me.c         |   6 +-
 encoder/me.h         |   8 +-
 encoder/slicetype.c  |   4 +-
 11 files changed, 196 insertions(+), 270 deletions(-)

diff --git a/common/common.h b/common/common.h
index 628c0009..a53509b4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -471,14 +471,14 @@ struct x264_t
 
         /* 0 if not available */
         DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
-        DECLARE_ALIGNED_4( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+        DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
 
         /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
         DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
 
         DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
-        int8_t  direct_ref[2][X264_SCAN8_SIZE];
-        int     pskip_mv[2];
+        DECLARE_ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
+        DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
 
         /* number of neighbors (top and left) that used 8x8 dct */
         int     i_neighbour_transform_size;
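An illustrative aside, not part of the patch: the idiom being introduced everywhere below. gcc of this era emits two separate 16-bit stores for a motion-vector copy; casting through uint32_t forces one combined store. The alignment attributes added above keep the punned access aligned, and the sketch assumes a build where such type-punning is permitted (x264 is compiled without strict-aliasing assumptions).

#include <stdint.h>

/* Naive copy: gcc ~4.x emits two separate 16-bit loads and stores. */
static void copy_mv_naive( int16_t dst[2], const int16_t src[2] )
{
    dst[0] = src[0];
    dst[1] = src[1];
}

/* Explicit write combining: one 32-bit load, one 32-bit store.
 * Both pointers must be 4-byte aligned, which is what the
 * DECLARE_ALIGNED_4/8 changes in this patch guarantee. */
static void copy_mv_combined( int16_t dst[2], const int16_t src[2] )
{
    *(uint32_t*)dst = *(const uint32_t*)src;
}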
diff --git a/common/macroblock.c b/common/macroblock.c
index cd1f9cc7..d2fc0cbc 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -88,7 +88,7 @@ int x264_mb_transform_8x8_allowed( x264_t *h )
     }
 }
 
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
 {
     const int i8 = x264_scan8[idx];
     const int i_ref= h->mb.cache.ref[i_list][i8];
@@ -111,14 +111,12 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     {
         if( idx == 0 && i_refb == i_ref )
         {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
             return;
         }
         else if( idx != 0 && i_refa == i_ref )
         {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
             return;
         }
     }
@@ -126,14 +124,12 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     {
         if( idx == 0 && i_refa == i_ref )
         {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
             return;
         }
         else if( idx != 0 && i_refc == i_ref )
         {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
             return;
         }
     }
@@ -151,26 +147,14 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-        {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
         else if( i_refb == i_ref )
-        {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
         else
-        {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-    {
-        mvp[0] = mv_a[0];
-        mvp[1] = mv_a[1];
-    }
+        *(uint32_t*)mvp = *(uint32_t*)mv_a;
     else
     {
         mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
@@ -178,7 +162,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     }
 }
 
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
 {
     int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
     int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
@@ -208,26 +192,14 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-        {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
         else if( i_refb == i_ref )
-        {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
         else
-        {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-    {
-        mvp[0] = mv_a[0];
-        mvp[1] = mv_a[1];
-    }
+        *(uint32_t*)mvp = *(uint32_t*)mv_a;
     else
     {
         mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
@@ -236,7 +208,7 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
     }
 }
 
-void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
+void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
 {
     int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
     int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
@@ -244,10 +216,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
     int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        ( i_refa == 0 && mv_a[0] == 0 && mv_a[1] == 0 ) ||
-        ( i_refb == 0 && mv_b[0] == 0 && mv_b[1] == 0 ) )
+        ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
+        ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
     {
-        mv[0] = mv[1] = 0;
+        *(uint32_t*)mv = 0;
     }
     else
     {
@@ -268,8 +240,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
     if( IS_INTRA( type_col ) )
     {
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0, 0 );
+        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
+        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
         return 1;
     }
     b8x8 = h->sps->b_direct8x8_inference ||
@@ -291,11 +263,10 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         if( b8x8 )
         {
             const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
-            int mv_l0[2];
-            mv_l0[0] = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-            mv_l0[1] = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, mv_l0[0], mv_l0[1] );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, mv_l0[0] - mv_col[0], mv_l0[1] - mv_col[1] );
+            const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+            const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
         }
         else
         {
@@ -304,11 +275,10 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
                 const int x4 = i4%2 + 2*x8;
                 const int y4 = i4/2 + 2*y8;
                 const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + x4 + y4 * h->mb.i_b4_stride ];
-                int mv_l0[2];
-                mv_l0[0] = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-                mv_l0[1] = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, mv_l0[0], mv_l0[1] );
-                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, mv_l0[0] - mv_col[0], mv_l0[1] - mv_col[1] );
+                const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+                const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, pack16to32_mask(l0x, l0y) );
+                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
             }
         }
     }
@@ -350,7 +320,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
 static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
 {
     int ref[2];
-    int mv[2][2];
+    DECLARE_ALIGNED_4( int16_t mv[2][2] );
     int i_list;
     int i8, i4;
     int b8x8;
@@ -381,10 +351,7 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     {
         ref[0] = ref[1] = 0;
-        mv[0][0] =
-        mv[0][1] =
-        mv[1][0] =
-        mv[1][1] = 0;
+        *(uint64_t*)mv[0] = 0;
     }
     else
     {
@@ -393,14 +360,14 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
             if( ref[i_list] >= 0 )
                 x264_mb_predict_mv_16x16( h, i_list, ref[i_list], mv[i_list] );
             else
-                mv[i_list][0] = mv[i_list][1] = 0;
+                *(uint32_t*)mv[i_list] = 0;
         }
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
-    x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, mv[0][0], mv[0][1] );
-    x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, mv[1][0], mv[1][1] );
+    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
+    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
 
     if( IS_INTRA( type_col ) )
         return 1;
@@ -435,9 +402,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
             if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
             {
                 if( ref[0] == 0 )
-                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0, 0 );
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
                 if( ref[1] == 0 )
-                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0, 0 );
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
             }
         }
         else
@@ -450,9 +417,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
                 if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
                 {
                     if( ref[0] == 0 )
-                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0, 0 );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0 );
                     if( ref[1] == 0 )
-                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0, 0 );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0 );
                 }
             }
         }
@@ -527,14 +494,13 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
 #define FIXED_SCALE 256
 
 /* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[8][2], int *i_mvc )
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc )
 {
     int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
     int i = 0;
 
 #define SET_MVP(mvp) { \
-        mvc[i][0] = mvp[0]; \
-        mvc[i][1] = mvp[1]; \
+        *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
         i++; \
     }
@@ -1262,15 +1228,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int ir = i_top_8x8 - 1;
             const int iv = i_top_4x4 - 1;
             h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
-            h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
         }
         else
         {
             const int i8 = x264_scan8[0] - 1 - 1*8;
             h->mb.cache.ref[i_list][i8] = -2;
-            h->mb.cache.mv[i_list][i8][0] = 0;
-            h->mb.cache.mv[i_list][i8][1] = 0;
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
         }
 
         if( h->mb.i_neighbour & MB_TOP )
@@ -1282,22 +1246,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
             h->mb.cache.ref[i_list][i8+2] =
             h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
-
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mv[i_list][i8+i][0] = h->mb.mv[i_list][iv + i][0];
-                h->mb.cache.mv[i_list][i8+i][1] = h->mb.mv[i_list][iv + i][1];
-            }
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
         }
         else
        {
             const int i8 = x264_scan8[0] - 8;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.ref[i_list][i8+i] = -2;
-                h->mb.cache.mv[i_list][i8+i][0] =
-                h->mb.cache.mv[i_list][i8+i][1] = 0;
-            }
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
+            *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
         }
 
         if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1306,15 +1263,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int ir = i_top_8x8 + 2;
             const int iv = i_top_4x4 + 4;
             h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
-            h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
         }
         else
        {
             const int i8 = x264_scan8[0] + 4 - 1*8;
             h->mb.cache.ref[i_list][i8] = -2;
-            h->mb.cache.mv[i_list][i8][0] = 0;
-            h->mb.cache.mv[i_list][i8][1] = 0;
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
         }
 
         if( h->mb.i_neighbour & MB_LEFT )
@@ -1328,10 +1283,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
 
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mv[i_list][i8+i*8][0] = h->mb.mv[i_list][iv + i*s4x4][0];
-                h->mb.cache.mv[i_list][i8+i*8][1] = h->mb.mv[i_list][iv + i*s4x4][1];
-            }
+                *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = *(uint32_t*)h->mb.mv[i_list][iv + i*s4x4];
         }
         else
         {
@@ -1339,8 +1291,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             for( i = 0; i < 4; i++ )
             {
                 h->mb.cache.ref[i_list][i8+i*8] = -2;
-                h->mb.cache.mv[i_list][i8+i*8][0] =
-                h->mb.cache.mv[i_list][i8+i*8][1] = 0;
+                *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
             }
         }
 
@@ -1350,20 +1301,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         {
             const int i8 = x264_scan8[0] - 8;
             const int iv = i_top_4x4;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i][0] = h->mb.mvd[i_list][iv + i][0];
-                h->mb.cache.mvd[i_list][i8+i][1] = h->mb.mvd[i_list][iv + i][1];
-            }
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
         }
         else
         {
             const int i8 = x264_scan8[0] - 8;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i][0] =
-                h->mb.cache.mvd[i_list][i8+i][1] = 0;
-            }
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
         }
 
         if( i_left_type >= 0 )
@@ -1371,19 +1316,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int i8 = x264_scan8[0] - 1;
             const int iv = i_mb_4x4 - 1;
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i*8][0] = h->mb.mvd[i_list][iv + i*s4x4][0];
-                h->mb.cache.mvd[i_list][i8+i*8][1] = h->mb.mvd[i_list][iv + i*s4x4][1];
-            }
+                *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = *(uint32_t*)h->mb.mvd[i_list][iv + i*s4x4];
         }
         else
         {
             const int i8 = x264_scan8[0] - 1;
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i*8][0] =
-                h->mb.cache.mvd[i_list][i8+i*8][1] = 0;
-            }
+                *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
         }
     }
 }
@@ -1516,7 +1455,7 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
         {
-            int y,x;
+            int y;
 
             h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
             h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
@@ -1525,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h )
 
             for( y = 0; y < 4; y++ )
             {
-                for( x = 0; x < 4; x++ )
-                {
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][0];
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][1];
-                }
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2];
             }
         }
     }
@@ -1538,20 +1474,15 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
         {
-            int y,x;
+            int y;
 
-            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = -1;
+            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
+            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
 
             for( y = 0; y < 4; y++ )
             {
-                for( x = 0; x < 4; x++ )
-                {
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
-                }
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
             }
         }
     }
@@ -1569,14 +1500,11 @@ void x264_macroblock_cache_save( x264_t *h )
             for( i_list = 0; i_list < 2; i_list++ )
             {
                 const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y,x;
+                int y;
                 for( y = 0; y < 4; y++ )
                 {
-                    for( x = 0; x < 4; x++ )
-                    {
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][0];
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][1];
-                    }
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0];
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2];
                 }
             }
         }
@@ -1586,14 +1514,11 @@ void x264_macroblock_cache_save( x264_t *h )
             for( i_list = 0; i_list < 2; i_list++ )
             {
                 const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y,x;
+                int y;
                 for( y = 0; y < 4; y++ )
                 {
-                    for( x = 0; x < 4; x++ )
-                    {
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
-                    }
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0;
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0;
                 }
             }
         }
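An aside on the fill idiom in the cache_load/cache_save hunks above (illustrative only, not from the patch): multiplying a byte by 0x01010101 replicates it into every byte of a 32-bit word, so four int8_t reference indices can be set to -2 in one store, and two to -1 via the 16-bit 0x0101 variant.

#include <assert.h>
#include <stdint.h>

int main( void )
{
    /* The union guarantees alignment for the 32-bit store. */
    union { uint32_t u32; int8_t ref[4]; } u;
    /* (uint8_t)(-2) == 0xFE; 0xFE * 0x01010101 == 0xFEFEFEFE,
     * i.e. the byte replicated into all four lanes. */
    u.u32 = (uint8_t)(-2) * 0x01010101U;
    assert( u.ref[0] == -2 && u.ref[1] == -2 &&
            u.ref[2] == -2 && u.ref[3] == -2 );
    return 0;
}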
diff --git a/common/macroblock.h b/common/macroblock.h
index e127e2b7..2766ddd0 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -253,16 +253,16 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
 /* x264_mb_predict_mv_16x16:
  *      set mvp with predicted mv for D_16x16 block
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
 /* x264_mb_predict_mv_pskip:
  *      set mvp with predicted mv for P_SKIP
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
+void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
 /* x264_mb_predict_mv:
  *      set mvp with predicted mv for all blocks except SKIP and DIRECT
  *      h->mb. need valid ref/partition/sub of current block to be valid
  *      and valid mv/ref from other blocks. */
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
 /* x264_mb_predict_mv_direct16x16:
  *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
  *      h->mb. need only valid values from other blocks.
@@ -278,7 +278,7 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
  *      set mvc with D_16x16 prediction.
  *      uses all neighbors, even those that didn't end up using this ref.
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[8][2], int *i_mvc );
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
 
 int x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
 
@@ -293,7 +293,14 @@ int x264_mb_transform_8x8_allowed( x264_t *h );
 void x264_mb_mc( x264_t *h );
 void x264_mb_mc_8x8( x264_t *h, int i8 );
 
-
+static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+    return (b&0xFFFF) + (a<<16);
+#else
+    return (a&0xFFFF) + (b<<16);
+#endif
+}
 static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
 {
     int dy;
@@ -313,30 +320,32 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
 {
     int dy, dx;
-    for( dy = 0; dy < height; dy++ )
-        for( dx = 0; dx < width; dx++ )
-            ((uint32_t*)dst)[dx+8*dy] = val;
-}
-static ALWAYS_INLINE uint32_t pack16to32_clip( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-    return (b&0xFFFF) + (a<<16);
-#else
-    return (a&0xFFFF) + (b<<16);
-#endif
+    if( width == 1 || WORD_SIZE < 8 )
+    {
+        for( dy = 0; dy < height; dy++ )
+            for( dx = 0; dx < width; dx++ )
+                ((uint32_t*)dst)[dx+8*dy] = val;
+    }
+    else
+    {
+        uint64_t val64 = val + ((uint64_t)val<<32);
+        for( dy = 0; dy < height; dy++ )
+            for( dx = 0; dx < width/2; dx++ )
+                ((uint64_t*)dst)[dx+4*dy] = val64;
+    }
 }
-
-static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
+#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
-    x264_macroblock_cache_rect1( &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y], width, height, ref );
+    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mvx,mvy) );
+    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
+static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mdx,mdy) );
+    x264_macroblock_cache_rect1( &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y], width, height, ref );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
 {
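Why pack16to32_mask above is endian-conditional (an illustrative sketch, not from the patch): the packed word must match the in-memory layout of an int16_t[2] pair, so a value built from (mvx,mvy) compares and stores identically to a punned load of a stored pair. The little test below assumes a little-endian host.

#include <assert.h>
#include <stdint.h>

/* The little-endian branch of the patch's pack16to32_mask. */
static uint32_t pack16to32_mask_le( int a, int b )
{
    return (a&0xFFFF) + (b<<16);
}

int main( void )
{
    union { uint32_t u32; int16_t mv[2]; } u;
    u.mv[0] = -3; /* mvx */
    u.mv[1] =  7; /* mvy */
    /* On little-endian, the packed constant equals the punned pair;
     * this is what lets x264_macroblock_cache_mv take one uint32_t. */
    assert( u.u32 == pack16to32_mask_le( -3, 7 ) );
    return 0;
}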
diff --git a/common/osdep.h b/common/osdep.h
index d914e784..c7353e0d 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -127,4 +127,7 @@
 #define x264_pthread_cond_wait(c,m)      usleep(100)
 #endif
 
+/* FIXME: long isn't always the native register size (e.g. win64). */
+#define WORD_SIZE sizeof(long)
+
 #endif /* X264_OSDEP_H */
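The WORD_SIZE test above feeds x264_macroblock_cache_rect4's new 64-bit path, which duplicates the packed MV into both halves of a 64-bit word so each row is filled with half as many stores. A tiny check of that duplication identity (illustrative only):

#include <assert.h>
#include <stdint.h>

int main( void )
{
    uint32_t val = 0x0007FFFDU; /* e.g. a packed (mvx,mvy) pair */
    /* The 64-bit fast path writes two packed MVs per store: */
    uint64_t val64 = val + ((uint64_t)val << 32);
    assert( (uint32_t)val64 == val );         /* low half  */
    assert( (uint32_t)(val64 >> 32) == val ); /* high half */
    return 0;
}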
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 9da9a5c7..dad24b1f 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -45,7 +45,7 @@ typedef struct
     /* 8x8 */
     int       i_cost8x8;
     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
-    DECLARE_ALIGNED_8( int mvc[32][5][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
     x264_me_t me8x8[4];
 
     /* Sub 4x4 */
@@ -967,8 +967,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
-    int i_ref;
-    int mvc[7][2], i_mvc;
+    int i_ref, i_mvc;
+    DECLARE_ALIGNED_4( int16_t mvc[7][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1013,10 +1013,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
             a->l0.me16x16 = m;
 
         /* save mv for predicting neighbors */
-        a->l0.mvc[i_ref][0][0] =
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        a->l0.mvc[i_ref][0][1] =
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)a->l0.mvc[i_ref][0] =
+        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
@@ -1024,11 +1022,10 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_type = P_L0;
     if( a->b_mbrd && a->l0.me16x16.i_ref == 0
-        && a->l0.me16x16.mv[0] == h->mb.cache.pskip_mv[0]
-        && a->l0.me16x16.mv[1] == h->mb.cache.pskip_mv[1] )
+        && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
     {
         h->mb.i_partition = D_16x16;
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
     }
 }
@@ -1060,10 +1057,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     }
 
     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
-    {
-        a->l0.mvc[i_ref][0][0] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0];
-        a->l0.mvc[i_ref][0][1] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1];
-    }
+        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
 
     for( i = 0; i < 4; i++ )
     {
@@ -1090,12 +1084,12 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
-            *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
+            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
 
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
 
         /* mb type cost */
@@ -1115,14 +1109,14 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
     uint8_t **p_fenc = h->mb.pic.p_fenc;
     int i_mvc;
-    int (*mvc)[2] = a->l0.mvc[i_ref];
+    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
     int i;
 
     /* XXX Needed for x264_mb_predict_mv */
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.me16x16.mv;
+    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
 
     for( i = 0; i < 4; i++ )
     {
@@ -1140,9 +1134,9 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
         x264_me_search( h, m, mvc, i_mvc );
 
-        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
 
-        *(uint64_t*)mvc[i_mvc] = *(uint64_t*)m->mv;
+        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
         i_mvc++;
 
         /* mb type cost */
@@ -1163,7 +1157,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_8( int mvc[3][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1188,9 +1182,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
             m.i_ref = i_ref;
 
             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-            *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
-            *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
-            *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
@@ -1202,7 +1196,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
     }
 
@@ -1213,7 +1207,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_8( int mvc[3][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1237,9 +1231,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
-            *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
-            *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
@@ -1251,7 +1245,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
     }
 
@@ -1320,7 +1314,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
     }
 
     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                             a->l0.me4x4[i8x8][1].cost +
@@ -1360,7 +1354,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
     }
     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                             REF_COST( 0, i_ref ) +
@@ -1397,7 +1391,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
     }
     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                             REF_COST( 0, i_ref ) +
@@ -1447,8 +1441,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     int weight;
 
     x264_me_t m;
-    int i_ref;
-    int mvc[8][2], i_mvc;
+    int i_ref, i_mvc;
+    DECLARE_ALIGNED_4( int16_t mvc[8][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1477,8 +1471,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
@@ -1505,8 +1498,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
@@ -1517,7 +1509,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     /* get cost of BI mode */
     weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
-    if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
+    if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
     {
         /* l0 reference is halfpel, so get_ref on it will make it faster */
         src2 =
@@ -1570,21 +1562,21 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
     switch( h->mb.i_sub_partition[i] )
     {
         case D_L0_8x8:
-            x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
             break;
         case D_L0_8x4:
-            x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
             break;
         case D_L0_4x8:
-            x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
             break;
        case D_L0_4x4:
-            x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
-            x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
+            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
             break;
         default:
             x264_log( h, X264_LOG_ERROR, "internal error\n" );
@@ -1596,26 +1588,26 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
     if( x264_mb_partition_listX_table[0][part] ) \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
+        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
     } \
     else \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0, 0 ); \
+        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
         if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
+            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
     } \
     if( x264_mb_partition_listX_table[1][part] ) \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
+        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
     } \
     else \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0, 0 ); \
+        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
         if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
+            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
     }
 
 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
@@ -1627,8 +1619,8 @@ static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int
         x264_mb_load_mv_direct8x8( h, i );
         if( b_mvd )
         {
-            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0, 0 );
-            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0, 0 );
+            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
+            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
         }
     }
@@ -1681,7 +1673,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, &lX->me16x16.mv, 1 );
 
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
+            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
 
             /* BI mode */
             h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
@@ -1717,7 +1709,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
         { h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.p_fref[1][a->l1.i_ref] };
     DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
-    DECLARE_ALIGNED_8( int mvc[2][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_16x8;
@@ -1740,8 +1732,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)lX->me8x8[2*i].mv;
-            *(uint64_t*)mvc[1] = *(uint64_t*)lX->me8x8[2*i+1].mv;
+            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
+            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
 
             x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
             x264_me_search( h, m, mvc, 2 );
@@ -1786,7 +1778,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
         { h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.p_fref[1][a->l1.i_ref] };
     DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
-    DECLARE_ALIGNED_8( int mvc[2][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_8x16;
@@ -1808,8 +1800,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)lX->me8x8[i].mv;
-            *(uint64_t*)mvc[1] = *(uint64_t*)lX->me8x8[i+2].mv;
+            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
+            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
 
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, mvc, 2 );
@@ -2626,21 +2618,21 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
             {
                 case D_16x16:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                     break;
 
                 case D_16x8:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
+                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                     break;
 
                 case D_8x16:
                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
+                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                     break;
 
                 default:
@@ -2662,8 +2654,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
         {
             h->mb.i_partition = D_16x16;
             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv[0],
-                                                          h->mb.cache.pskip_mv[1] );
+            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
             break;
         }
@@ -2689,26 +2680,26 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
             {
                 case B_L0_L0:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0, 0 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                     break;
                 case B_L1_L1:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0, 0 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                     break;
                 case B_BI_BI:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                     break;
             }
             break;
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 052c0e41..d482c066 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -484,7 +484,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
 static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
 {
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
     int mdx, mdy;
 
     /* Calculate mvd */
@@ -497,7 +497,7 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i
     x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy );
 
     /* save value */
-    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mdx, mdy );
+    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, pack16to32_mask(mdx,mdy) );
 }
 
 static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 54bc567e..726d024f 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -232,7 +232,7 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
 static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
 {
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
     x264_mb_predict_mv( h, i_list, idx, width, mvp );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
@@ -408,7 +408,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     }
     else if( i_mb_type == P_L0 )
     {
-        int mvp[2];
+        DECLARE_ALIGNED_4( int16_t mvp[2] );
 
         if( h->mb.i_partition == D_16x16 )
         {
@@ -524,7 +524,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         /* All B mode */
         /* Motion Vector */
         int i_list;
-        int mvp[2];
+        DECLARE_ALIGNED_4( int16_t mvp[2] );
 
         int b_list[2][2];
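One more illustrative sketch before the encoder changes below (mv_equal and mv_is_halfpel_aligned are hypothetical names, not x264 functions): with MVs held as aligned int16_t pairs, two-component tests collapse into single 32-bit operations. The 0x10001 mask in the B16x16 halfpel check above tests the low bit of both components at once, and the P_SKIP condition in the next file compares both components with one equality.

#include <assert.h>
#include <stdint.h>

static int mv_equal( const int16_t a[2], const int16_t b[2] )
{
    /* One 32-bit compare instead of two 16-bit ones; operands must be
     * 4-byte aligned, as DECLARE_ALIGNED_4 ensures in the patch. */
    return *(const uint32_t*)a == *(const uint32_t*)b;
}

static int mv_is_halfpel_aligned( const int16_t mv[2] )
{
    /* 0x10001 masks bit 0 of both packed components: zero means both
     * mvx and mvy are even in quarter-pel units. */
    return ( *(const uint32_t*)mv & 0x10001 ) == 0;
}

int main( void )
{
    union { uint32_t u32; int16_t mv[2]; } a, b;
    a.mv[0] = 4; a.mv[1] = -8; /* both components even: halfpel */
    b.mv[0] = 4; b.mv[1] = -8;
    assert( mv_equal( a.mv, b.mv ) );
    assert( mv_is_halfpel_aligned( a.mv ) );
    a.mv[1] = -7;              /* odd component: quarterpel */
    assert( !mv_is_halfpel_aligned( a.mv ) );
    return 0;
}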
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 33547146..74c94130 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -585,17 +585,15 @@ void x264_macroblock_encode( x264_t *h )
     if( !b_force_no_skip )
     {
         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
-            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma == 0x00 &&
-            h->mb.cache.mv[0][x264_scan8[0]][0] == h->mb.cache.pskip_mv[0] &&
-            h->mb.cache.mv[0][x264_scan8[0]][1] == h->mb.cache.pskip_mv[1] &&
-            h->mb.cache.ref[0][x264_scan8[0]] == 0 )
+            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
+            *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
         {
             h->mb.i_type = P_SKIP;
         }
 
         /* Check for B_SKIP */
-        if( h->mb.i_type == B_DIRECT &&
-            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
+        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
         {
             h->mb.i_type = B_SKIP;
         }
diff --git a/encoder/me.c b/encoder/me.c
index 7198957d..7598b76f 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -151,7 +151,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     }\
 }
 
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
@@ -982,7 +982,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
 
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;
-    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, bmx, bmy );
-    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, bmx - pmx, bmy - pmy );
+    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx, bmy) );
+    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx - pmx, bmy - pmy) );
 }
diff --git a/encoder/me.h b/encoder/me.h
index 295dd14a..96135c9e 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -39,16 +39,16 @@ typedef struct
     uint16_t *integral;
     int      i_stride[2];
 
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
 
     /* output */
     int cost_mv;        /* lambda * nbits for the chosen mv */
     int cost;           /* satd + lambda * nbits */
-    DECLARE_ALIGNED_8( int mv[2] );
+    DECLARE_ALIGNED_4( int16_t mv[2] );
 } x264_me_t;
 
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
-static inline void x264_me_search( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc )
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
+static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
 { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
 
 void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 8cee4f90..58e666be 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -153,9 +153,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     i_cost_bak = i_bcost;
     for( l = 0; l < 1 + b_bidir; l++ )
     {
-        int mvc[4][2] = {{0}}, i_mvc;
+        int16_t mvc[4][2] = {{0}};
+        int i_mvc = 0;
         int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-        i_mvc = 0;
 #define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
         if( i_mb_x > 0 )
             MVC(fenc_mv[-1]);
-- 
2.40.0
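Finally, a simplified sketch of the alignment machinery the patch leans on. This is roughly the gcc form of the DECLARE_ALIGNED_* macros in x264's osdep.h (the exact definitions vary by compiler, so treat this as an assumption, not the library's literal code):

#include <stdint.h>

#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#define DECLARE_ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
#define DECLARE_ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )

int main( void )
{
    /* A 4-byte-aligned int16_t[2], so *(uint32_t*)mv is an aligned
     * access on every target the patch cares about. */
    DECLARE_ALIGNED_4( int16_t mv[2] ) = { 1, -1 };
    return *(uint32_t*)mv ? 0 : 1;
}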