From 72f79049c1c34ea5feb41a05f26c42f65451b681 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Fri, 23 Apr 2010 19:09:37 +0000
Subject: [PATCH] Fix a ton of pessimization caused by aliasing in cache_save
 and cache_load

---
 common/macroblock.c | 201 ++++++++++++++++++++++++--------------------
 1 file changed, 108 insertions(+), 93 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index f4e22eff..2b0ee647 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1026,19 +1026,24 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     int left = h->mb.i_mb_left_xy;
     int top = h->mb.i_mb_top_xy;
 
+    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
+    /* By only dereferencing them once, we avoid this issue. */
+    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
+    uint8_t (*nnz)[24] = h->mb.non_zero_count;
+    int16_t *cbp = h->mb.cbp;
+
     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
-        h->mb.cache.i_cbp_top = h->mb.cbp[top];
-
+        h->mb.cache.i_cbp_top = cbp[top];
         /* load intra4x4 */
-        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[top][0] );
+        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
 
         /* load non_zero_count */
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[top][12] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
         /* shift because x264_scan8[16] is misaligned */
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[top][18] ) << 8;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[top][22] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
     }
     else
     {
@@ -1055,25 +1060,25 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
     if( h->mb.i_neighbour & MB_LEFT )
     {
-        h->mb.cache.i_cbp_left = h->mb.cbp[left];
+        h->mb.cache.i_cbp_left = cbp[left];
 
         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[left][4];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[left][5];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[left][6];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[left][3];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
 
         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[left][3];
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[left][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[left][11];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[left][15];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
 
-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[left][16+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[left][16+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
 
-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[left][16+4+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[left][16+4+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
     }
     else
     {
@@ -1146,11 +1151,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
     for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
     {
+        int16_t (*mv)[2] = h->mb.mv[l];
+        int8_t *ref = h->mb.ref[l];
+
         int i8 = x264_scan8[0] - 1 - 1*8;
         if( h->mb.i_neighbour & MB_TOPLEFT )
        {
-            h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 - 1];
-            CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 - 1] );
+            h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
+            CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
         }
         else
         {
@@ -1162,10 +1170,10 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         if( h->mb.i_neighbour & MB_TOP )
         {
             h->mb.cache.ref[l][i8+0] =
-            h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
+            h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
             h->mb.cache.ref[l][i8+2] =
-            h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
-            CP128( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4] );
+            h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
+            CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
         }
         else
         {
@@ -1176,8 +1184,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         i8 = x264_scan8[0] + 4 - 1*8;
         if( h->mb.i_neighbour & MB_TOPRIGHT )
         {
-            h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 + 2];
-            CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 + 4] );
+            h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
+            CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
         }
         else
             h->mb.cache.ref[l][i8] = -2;
@@ -1188,14 +1196,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
             const int ir = h->mb.i_b8_xy - 1;
             const int iv = h->mb.i_b4_xy - 1;
             h->mb.cache.ref[l][i8+0*8] =
-            h->mb.cache.ref[l][i8+1*8] = h->mb.ref[l][ir + 0*s8x8];
+            h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
             h->mb.cache.ref[l][i8+2*8] =
-            h->mb.cache.ref[l][i8+3*8] = h->mb.ref[l][ir + 1*s8x8];
+            h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
 
-            CP32( h->mb.cache.mv[l][i8+0*8], h->mb.mv[l][iv + 0*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+1*8], h->mb.mv[l][iv + 1*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+2*8], h->mb.mv[l][iv + 2*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+3*8], h->mb.mv[l][iv + 3*s4x4] );
+            CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
+            CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
+            CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
+            CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
         }
         else
         {
@@ -1208,17 +1216,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
         if( h->param.b_cabac )
         {
+            uint8_t (*mvd)[8][2] = h->mb.mvd[l];
             if( h->mb.i_neighbour & MB_TOP )
-                CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], h->mb.mvd[l][top][0] );
+                CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
             else
                 M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
 
             if( h->mb.i_neighbour & MB_LEFT )
             {
-                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], h->mb.mvd[l][left][4] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], h->mb.mvd[l][left][5] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], h->mb.mvd[l][left][6] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], h->mb.mvd[l][left][3] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
             }
             else
                 for( int i = 0; i < 4; i++ )
@@ -1283,10 +1292,10 @@ void x264_macroblock_cache_save( x264_t *h )
     const int i_mb_4x4 = h->mb.i_b4_xy;
     const int i_mb_8x8 = h->mb.i_b8_xy;
 
-    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
+    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing. */
     /* By only dereferencing them once, we avoid this issue. */
-    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
-    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
+    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
+    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
 
     x264_macroblock_store_pic( h, 0 );
     x264_macroblock_store_pic( h, 1 );
@@ -1301,15 +1310,15 @@ void x264_macroblock_cache_save( x264_t *h )
 
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
-        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
     else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
-        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
+        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
     else
-        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
+        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
 
     if( i_mb_type == I_PCM )
@@ -1320,19 +1329,19 @@ void x264_macroblock_cache_save( x264_t *h )
         h->mb.i_cbp_luma = 0xf;
         h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
         h->mb.b_transform_8x8 = 0;
-        memset( non_zero_count, 16, sizeof( *h->mb.non_zero_count ) );
+        memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
     }
     else
     {
         /* save non zero count */
-        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
-        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
-        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
-        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
-        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
-        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
-        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
-        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
+        CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+        CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+        CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+        CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+        M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+        M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+        M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+        M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
 
         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
@@ -1347,50 +1356,56 @@ void x264_macroblock_cache_save( x264_t *h )
 
     if( h->sh.i_type != SLICE_TYPE_I )
     {
+        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
+        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
+        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
+        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
        if( !IS_INTRA( i_mb_type ) )
        {
-            h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
-            h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
-            h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
-            h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
-            CP128( h->mb.mv[0][i_mb_4x4+0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
-            CP128( h->mb.mv[0][i_mb_4x4+1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
-            CP128( h->mb.mv[0][i_mb_4x4+2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
-            CP128( h->mb.mv[0][i_mb_4x4+3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
+            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
+            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
+            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
+            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
+            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
             if( h->sh.i_type == SLICE_TYPE_B )
             {
-                h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
-                h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
-                h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
-                h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
-                CP128( h->mb.mv[1][i_mb_4x4+0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
-                CP128( h->mb.mv[1][i_mb_4x4+1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
-                CP128( h->mb.mv[1][i_mb_4x4+2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
-                CP128( h->mb.mv[1][i_mb_4x4+3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
+                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
+                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
+                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
+                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
             }
         }
         else
        {
-            M16( &h->mb.ref[0][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-            M16( &h->mb.ref[0][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-            M128( h->mb.mv[0][i_mb_4x4+0*s4x4] ) = M128_ZERO;
-            M128( h->mb.mv[0][i_mb_4x4+1*s4x4] ) = M128_ZERO;
-            M128( h->mb.mv[0][i_mb_4x4+2*s4x4] ) = M128_ZERO;
-            M128( h->mb.mv[0][i_mb_4x4+3*s4x4] ) = M128_ZERO;
+            M16( &ref0[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+            M16( &ref0[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+            M128( &mv0[0*s4x4] ) = M128_ZERO;
+            M128( &mv0[1*s4x4] ) = M128_ZERO;
+            M128( &mv0[2*s4x4] ) = M128_ZERO;
+            M128( &mv0[3*s4x4] ) = M128_ZERO;
             if( h->sh.i_type == SLICE_TYPE_B )
             {
-                M16( &h->mb.ref[1][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M16( &h->mb.ref[1][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M128( h->mb.mv[1][i_mb_4x4+0*s4x4] ) = M128_ZERO;
-                M128( h->mb.mv[1][i_mb_4x4+1*s4x4] ) = M128_ZERO;
-                M128( h->mb.mv[1][i_mb_4x4+2*s4x4] ) = M128_ZERO;
-                M128( h->mb.mv[1][i_mb_4x4+3*s4x4] ) = M128_ZERO;
+                M16( &ref1[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+                M16( &ref1[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+                M128( &mv1[0*s4x4] ) = M128_ZERO;
+                M128( &mv1[1*s4x4] ) = M128_ZERO;
+                M128( &mv1[2*s4x4] ) = M128_ZERO;
+                M128( &mv1[3*s4x4] ) = M128_ZERO;
             }
         }
     }
 
     if( h->param.b_cabac )
     {
+        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
+        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
         if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
             h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
         else
@@ -1398,23 +1413,23 @@ void x264_macroblock_cache_save( x264_t *h )
 
         if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
-            CP64( h->mb.mvd[0][i_mb_xy][0], h->mb.cache.mvd[0][x264_scan8[10]] );
-            CP16( h->mb.mvd[0][i_mb_xy][4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
-            CP16( h->mb.mvd[0][i_mb_xy][5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
-            CP16( h->mb.mvd[0][i_mb_xy][6], h->mb.cache.mvd[0][x264_scan8[13]] );
+            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
+            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
+            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
+            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
             if( h->sh.i_type == SLICE_TYPE_B )
             {
-                CP64( h->mb.mvd[1][i_mb_xy][0], h->mb.cache.mvd[1][x264_scan8[10]] );
-                CP16( h->mb.mvd[1][i_mb_xy][4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
-                CP16( h->mb.mvd[1][i_mb_xy][5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
-                CP16( h->mb.mvd[1][i_mb_xy][6], h->mb.cache.mvd[1][x264_scan8[13]] );
+                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
+                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
+                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
+                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
             }
         }
         else
         {
-            M128( h->mb.mvd[0][i_mb_xy][0] ) = M128_ZERO;
+            M128( mvd0[0] ) = M128_ZERO;
             if( h->sh.i_type == SLICE_TYPE_B )
-                M128( h->mb.mvd[1][i_mb_xy][0] ) = M128_ZERO;
+                M128( mvd1[0] ) = M128_ZERO;
         }
 
         if( h->sh.i_type == SLICE_TYPE_B )
--
2.40.0
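
A note for readers unfamiliar with the pessimization this patch works around. The following is a minimal standalone sketch, not x264 code; the names ctx_t, counts, clear_counts_slow and clear_counts_fast are hypothetical. On typical targets uint8_t is unsigned char, and a store through a char-typed lvalue may legally alias any object under C's type-based aliasing rules. GCC therefore has to assume each such store might modify ctx->counts itself, and re-reads the pointer from memory on every iteration. Hoisting the dereference into a local, as the patch does with i4x4, nnz, cbp, mv0/mv1, ref0/ref1 and mvd0/mvd1, lets the pointer live in a register.

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical context mirroring the shape of the problem:
     * an array reached through a pointer stored on the heap. */
    typedef struct
    {
        uint8_t *counts;
    } ctx_t;

    /* Pessimized: counts is reloaded from *ctx on every iteration,
     * because each uint8_t store may alias ctx->counts itself. */
    static void clear_counts_slow( ctx_t *ctx, int n )
    {
        for( int i = 0; i < n; i++ )
            ctx->counts[i] = 0;
    }

    /* Hoisted: one dereference up front; nothing can change the
     * local `counts`, so it can stay in a register. */
    static void clear_counts_fast( ctx_t *ctx, int n )
    {
        uint8_t *counts = ctx->counts;
        for( int i = 0; i < n; i++ )
            counts[i] = 0;
    }

    int main( void )
    {
        ctx_t ctx = { .counts = malloc( 24 ) };
        clear_counts_slow( &ctx, 24 );
        clear_counts_fast( &ctx, 24 );
        free( ctx.counts );
        return 0;
    }

Both functions compute the same result; the difference is only in what the compiler can prove. The same reasoning applies to the loads in cache_load: stores into the 8-bit cache arrays force h->mb.intra4x4_pred_mode and friends to be re-read unless they are copied to locals first.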