The C rewrite is significantly slower, but is necessary to make an asm implementation possible.
Similar concept to ffmpeg's deblock strength asm.
Roughly one order of magnitude faster than C.
Overall, with the asm, saves ~100-300 clocks in deblocking per MB.
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
+#define X264_SCAN8_SIZE (6*8)
+#define X264_SCAN8_LUMA_SIZE (5*8)
+#define X264_SCAN8_0 (4+1*8)
+
+static const int x264_scan8[16+2*4+3] =
+{
+ /* Luma */
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+
+ /* Cb */
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+
+ /* Cr */
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+
+ /* Luma DC */
+ 4+5*8,
+
+ /* Chroma DC */
+ 6+5*8, 7+5*8
+};
+/*
+ 0 1 2 3 4 5 6 7
+ 0
+ 1 B B L L L L
+ 2 B B L L L L
+ 3 L L L L
+ 4 R R L L L L
+ 5 R R Dy DuDv
+*/
+
#include "x264.h"
#include "bs.h"
#include "set.h"
x264_synch_frame_list_t ofbuf;
} x264_lookahead_t;
-/* From ffmpeg
- */
-#define X264_SCAN8_SIZE (6*8)
-#define X264_SCAN8_LUMA_SIZE (5*8)
-#define X264_SCAN8_0 (4+1*8)
-
-static const int x264_scan8[16+2*4+3] =
-{
- /* Luma */
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
-
- /* Cb */
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
-
- /* Cr */
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
-
- /* Luma DC */
- 4+5*8,
-
- /* Chroma DC */
- 6+5*8, 7+5*8
-};
-/*
- 0 1 2 3 4 5 6 7
- 0
- 1 B B L L L L
- 2 B B L L L L
- 3 L L L L
- 4 R R L L L L
- 5 R R Dy DuDv
-*/
-
typedef struct x264_ratecontrol_t x264_ratecontrol_t;
struct x264_t
deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
+static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe, int step, int first_edge_only )
+{
+ for( int dir = 0; dir < 2; dir++ )
+ {
+ int s1 = dir ? 1 : 8;
+ int s2 = dir ? 8 : 1;
+ for( int edge = 0; edge < (first_edge_only ? 1 : 4); edge += step )
+ for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
+ {
+ int locn = loc - s2;
+ if( nnz[loc] || nnz[locn] )
+ bs[dir][edge][i] = 2;
+ else if( ref[0][loc] != ref[0][locn] ||
+ abs( mv[0][loc][0] - mv[0][locn][0] ) >= 4 ||
+ abs( mv[0][loc][1] - mv[0][locn][1] ) >= mvy_limit ||
+ (bframe && (ref[1][loc] != ref[1][locn] ||
+ abs( mv[1][loc][0] - mv[1][locn][0] ) >= 4 ||
+ abs( mv[1][loc][1] - mv[1][locn][1] ) >= mvy_limit )))
+ {
+ bs[dir][edge][i] = 1;
+ }
+ else
+ bs[dir][edge][i] = 0;
+ }
+ }
+}
+
static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + h->sh.i_alpha_c0_offset;
int beta = beta_table(i_qp + h->sh.i_beta_offset);
int8_t tc[4];
- if( !alpha || !beta )
+ if( !M32(bS) || !alpha || !beta )
return;
tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
void x264_frame_deblock_row( x264_t *h, int mb_y )
{
- int s8x8 = 2 * h->mb.i_mb_stride;
- int s4x4 = 4 * h->mb.i_mb_stride;
int b_interlaced = h->sh.b_mbaff;
int mvy_limit = 4 >> b_interlaced;
int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
- int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
- int mb_x;
int stridey = h->fdec->i_stride[0];
int stride2y = stridey << b_interlaced;
int strideuv = h->fdec->i_stride[1];
int stride2uv = strideuv << b_interlaced;
- int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
- int ref_table[32+2];
- #define ref_table(x) ref_table[x+2]
+ int deblock_ref_table[2][32+2];
uint8_t (*nnz_backup)[16] = h->scratch_buffer;
- ref_table(-2) = -2;
- ref_table(-1) = -1;
- for( int i = 0; i < (h->i_ref0 << h->sh.b_mbaff); i++ )
+ for( int l = 0; l < 2; l++ )
{
- if( !h->mb.b_interlaced )
- ref_table(i) = h->fref0[i]->i_poc;
- else
- ref_table(i) = h->fref0[i>>1]->i_poc + (i&1);
+ int refs = (l ? h->i_ref1 : h->i_ref0) << h->sh.b_mbaff;
+ x264_frame_t **fref = l ? h->fref1 : h->fref0;
+ deblock_ref_table(l,-2) = -2;
+ deblock_ref_table(l,-1) = -1;
+ for( int i = 0; i < refs; i++ )
+ {
+ /* Mask off high bits to avoid frame num collisions with -1/-2.
+ * frame num values don't actually have to be correct, just unique.
+ * frame num values can't cover a range of more than 32. */
+ if( !h->mb.b_interlaced )
+ deblock_ref_table(l,i) = fref[i]->i_frame_num&63;
+ else
+ deblock_ref_table(l,i) = ((fref[i>>1]->i_frame_num&63)<<1) + (i&1);
+ }
}
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
- for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
+ for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
- int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
- int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
- int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
- int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
- int i_qp = h->mb.qp[mb_xy];
- int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
+ ALIGNED_ARRAY_16( uint8_t, bs, [2][4][4] );
+
+ x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+ x264_macroblock_cache_load_deblock( h, mb_x, mb_y, deblock_ref_table );
+
+ int mb_xy = h->mb.i_mb_xy;
+ int transform_8x8 = h->mb.mb_transform_size[mb_xy];
+ int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
+
uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
pixv -= 7*strideuv;
}
- x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+ int qp = h->mb.qp[mb_xy];
+ int qpc = h->chroma_qp_table[qp];
+ int first_edge_only = h->mb.type[mb_xy] == P_SKIP || qp <= qp_thresh;
- if( i_qp <= qp_thresh )
- i_edge_end = 1;
-
- #define FILTER_DIR(intra, i_dir)\
+ #define FILTER( intra, dir, edge, qp, chroma_qp )\
+ do\
{\
- /* Y plane */\
- i_qpn= h->mb.qp[mbn_xy];\
- if( i_dir == 0 )\
- {\
- /* vertical edge */\
- deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
- stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_h_luma##intra );\
- if( !(i_edge & 1) )\
- {\
- /* U/V planes */\
- int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
- deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
- stride2uv, bS, i_qpc, 1,\
- h->loopf.deblock_h_chroma##intra );\
- }\
- }\
- else\
- {\
- /* horizontal edge */\
- deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
- stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_v_luma##intra );\
- /* U/V planes */\
- if( !(i_edge & 1) )\
- {\
- int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
- deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
- stride2uv, bS, i_qpc, 1,\
- h->loopf.deblock_v_chroma##intra );\
- }\
- }\
+ deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1), NULL,\
+ stride2y, bs[dir][edge], qp, 0,\
+ h->loopf.deblock_luma##intra[dir] );\
+ if( !(edge & 1) )\
+ deblock_edge##intra( h, pixu + 2*edge*(dir?stride2uv:1), pixv + 2*edge*(dir?stride2uv:1),\
+ stride2uv, bs[dir][edge], chroma_qp, 1,\
+ h->loopf.deblock_chroma##intra[dir] );\
+ } while(0)
+
+ if( intra_cur )
+ memset( bs, 3, sizeof(bs) );
+ else
+ h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, transform_8x8 + 1, first_edge_only );
+
+ if( h->mb.i_neighbour & MB_LEFT )
+ {
+ int qpl = h->mb.qp[h->mb.i_mb_left_xy];
+ int qp_left = (qp + qpl + 1) >> 1;
+ int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
+ int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_left_xy] );
+ if( intra_cur || intra_left )
+ FILTER( _intra, 0, 0, qp_left, qpc_left );
+ else
+ FILTER( , 0, 0, qp_left, qpc_left );
}
- #define DEBLOCK_STRENGTH(i_dir)\
- {\
- /* *** Get bS for each 4px for the current edge *** */\
- if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
- M32( bS ) = 0x03030303;\
- else\
- {\
- M32( bS ) = 0x00000000;\
- for( int i = 0; i < 4; i++ )\
- {\
- int x = i_dir == 0 ? i_edge : i;\
- int y = i_dir == 0 ? i : i_edge;\
- int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
- int yn = i_dir == 0 ? y : (y - 1)&0x03;\
- if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
- h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
- bS[i] = 2;\
- else if(!(i_edge&no_sub8x8))\
- {\
- if((i&no_sub8x8) && bS[i-1] != 2)\
- bS[i] = bS[i-1];\
- else\
- {\
- int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
- int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
- int i4p= mb_4x4+x+y*s4x4;\
- int i4q= mbn_4x4+xn+yn*s4x4;\
- if((!(ref_table(h->mb.ref[0][i8p]) == ref_table(h->mb.ref[0][i8q])) ||\
- abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
- abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
- (h->sh.i_type == SLICE_TYPE_B &&\
- (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
- abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
- abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
- {\
- bS[i] = 1;\
- }\
- }\
- }\
- }\
- }\
+ if( !first_edge_only )
+ {
+ if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc );
+ FILTER( , 0, 2, qp, qpc );
+ if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc );
}
- /* i_dir == 0 -> vertical edge
- * i_dir == 1 -> horizontal edge */
- #define DEBLOCK_DIR(i_dir)\
- {\
- int i_edge = 0;\
- int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
- ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
- /* We don't have to consider the MBAFF case of a slice breaking in the middle\
- * of a row because x264 doesn't support that case. If we add support for that,\
- * this will have to become significantly more complex. */\
- if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
- h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
- i_edge++;\
- if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
- h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
- i_edge++;\
- if( i_edge )\
- i_edge+= b_8x8_transform;\
- else\
- {\
- mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
- mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
- mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
- if( b_interlaced && i_dir == 1 )\
- {\
- mbn_xy -= h->mb.i_mb_stride;\
- mbn_8x8 -= 2 * s8x8;\
- mbn_4x4 -= 4 * s4x4;\
- }\
- else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
- {\
- FILTER_DIR( _intra, i_dir );\
- goto end##i_dir;\
- }\
- DEBLOCK_STRENGTH(i_dir);\
- if( M32( bS ) )\
- FILTER_DIR( , i_dir);\
- end##i_dir:\
- i_edge += b_8x8_transform+1;\
- }\
- mbn_xy = mb_xy;\
- mbn_8x8 = mb_8x8;\
- mbn_4x4 = mb_4x4;\
- for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
- {\
- DEBLOCK_STRENGTH(i_dir);\
- if( M32( bS ) )\
- FILTER_DIR( , i_dir);\
- }\
+ if( h->mb.i_neighbour & MB_TOP )
+ {
+ int qpt = h->mb.qp[h->mb.i_mb_top_xy];
+ int qp_top = (qp + qpt + 1) >> 1;
+ int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
+ int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
+ if( !b_interlaced && (intra_cur || intra_top) )
+ FILTER( _intra, 1, 0, qp_top, qpc_top );
+ else
+ {
+ if( intra_top )
+ memset( bs[1][0], 3, sizeof(bs[1][0]) );
+ FILTER( , 1, 0, qp_top, qpc_top );
+ }
}
- DEBLOCK_DIR(0);
- DEBLOCK_DIR(1);
+ if( !first_edge_only )
+ {
+ if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc );
+ FILTER( , 1, 2, qp, qpc );
+ if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
+ }
}
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
}
-void x264_frame_deblock( x264_t *h )
-{
- for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
- x264_frame_deblock_row( h, mb_y );
-}
-
#ifdef HAVE_MMX
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
+ int mvy_limit, int bframe, int step, int first_edge_only );
+void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
+ int mvy_limit, int bframe, int step, int first_edge_only );
+void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
+ int mvy_limit, int bframe, int step, int first_edge_only );
#ifdef ARCH_X86
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
{
- pf->deblock_v_luma = deblock_v_luma_c;
- pf->deblock_h_luma = deblock_h_luma_c;
- pf->deblock_v_chroma = deblock_v_chroma_c;
- pf->deblock_h_chroma = deblock_h_chroma_c;
- pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
- pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
- pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
- pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
+ pf->deblock_luma[1] = deblock_v_luma_c;
+ pf->deblock_luma[0] = deblock_h_luma_c;
+ pf->deblock_chroma[1] = deblock_v_chroma_c;
+ pf->deblock_chroma[0] = deblock_h_chroma_c;
+ pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
+ pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
+ pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
+ pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c;
+ pf->deblock_strength = deblock_strength_c;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
{
- pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
- pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
- pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
- pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
+ pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
+ pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
#ifdef ARCH_X86
- pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
- pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
- pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
- pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
+ pf->deblock_luma[1] = x264_deblock_v_luma_mmxext;
+ pf->deblock_luma[0] = x264_deblock_h_luma_mmxext;
+ pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
+ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
#endif
- if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
+ pf->deblock_strength = x264_deblock_strength_mmxext;
+ if( cpu&X264_CPU_SSE2 )
{
- pf->deblock_v_luma = x264_deblock_v_luma_sse2;
- pf->deblock_h_luma = x264_deblock_h_luma_sse2;
- pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
- pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
+ pf->deblock_strength = x264_deblock_strength_sse2;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
+ pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
+ pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
+ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
+ }
}
+ if( cpu&X264_CPU_SSSE3 )
+ pf->deblock_strength = x264_deblock_strength_ssse3;
}
#endif
#ifdef HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
{
- pf->deblock_v_luma = x264_deblock_v_luma_altivec;
- pf->deblock_h_luma = x264_deblock_h_luma_altivec;
+ pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
+ pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
}
#endif // HAVE_ALTIVEC
#ifdef HAVE_ARMV6
if( cpu&X264_CPU_NEON )
{
- pf->deblock_v_luma = x264_deblock_v_luma_neon;
- pf->deblock_h_luma = x264_deblock_h_luma_neon;
- pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
- pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
+ pf->deblock_luma[1] = x264_deblock_v_luma_neon;
+ pf->deblock_luma[0] = x264_deblock_h_luma_neon;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
}
#endif
}
typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
typedef struct
{
- x264_deblock_inter_t deblock_v_luma;
- x264_deblock_inter_t deblock_h_luma;
- x264_deblock_inter_t deblock_v_chroma;
- x264_deblock_inter_t deblock_h_chroma;
- x264_deblock_intra_t deblock_v_luma_intra;
- x264_deblock_intra_t deblock_h_luma_intra;
- x264_deblock_intra_t deblock_v_chroma_intra;
- x264_deblock_intra_t deblock_h_chroma_intra;
+ x264_deblock_inter_t deblock_luma[2];
+ x264_deblock_inter_t deblock_chroma[2];
+ x264_deblock_intra_t deblock_luma_intra[2];
+ x264_deblock_intra_t deblock_chroma_intra[2];
+ void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
+ int bframe, int step, int first_edge_only );
} x264_deblock_function_t;
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
void x264_frame_expand_border_lowres( x264_frame_t *frame );
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
-void x264_frame_deblock( x264_t *h );
void x264_frame_deblock_row( x264_t *h, int mb_y );
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
| ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
}
+static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
+{
+ int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+ int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
+
+ h->mb.i_neighbour = 0;
+ h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
+ h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
+ h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
+
+ if( mb_x > 0 )
+ {
+ h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1;
+ if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
+ h->mb.i_neighbour |= MB_LEFT;
+ }
+
+ if( top >= 0 )
+ {
+ h->mb.i_mb_top_xy = top;
+ if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
+ h->mb.i_neighbour |= MB_TOP;
+ }
+}
+
+void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] )
+{
+ x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
+
+ if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
+ return;
+
+ int cur = h->mb.i_mb_xy;
+ int left = h->mb.i_mb_left_xy;
+ int top = h->mb.i_mb_top_xy;
+ int top_y = mb_y - (1 << h->mb.b_interlaced);
+ int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+ int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+ int s8x8 = h->mb.i_b8_stride;
+ int s4x4 = h->mb.i_b4_stride;
+
+ uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
+ if( h->mb.i_neighbour & MB_TOP )
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
+
+ if( h->mb.i_neighbour & MB_LEFT )
+ {
+ h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
+ h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
+ h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
+ h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+ }
+
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8], &nnz[cur][0*4] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8], &nnz[cur][1*4] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8], &nnz[cur][2*4] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8], &nnz[cur][3*4] );
+
+ for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+ {
+ int16_t (*mv)[2] = h->mb.mv[l];
+ int8_t *ref = h->mb.ref[l];
+
+ int i8 = x264_scan8[0] - 8;
+ if( h->mb.i_neighbour & MB_TOP )
+ {
+ h->mb.cache.ref[l][i8+0] =
+ h->mb.cache.ref[l][i8+1] = deblock_ref_table(l,ref[top_8x8 + 0]);
+ h->mb.cache.ref[l][i8+2] =
+ h->mb.cache.ref[l][i8+3] = deblock_ref_table(l,ref[top_8x8 + 1]);
+ CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
+ }
+
+ i8 = x264_scan8[0] - 1;
+ if( h->mb.i_neighbour & MB_LEFT )
+ {
+ int ir = h->mb.i_b8_xy - 1;
+ int iv = h->mb.i_b4_xy - 1;
+ h->mb.cache.ref[l][i8+0*8] =
+ h->mb.cache.ref[l][i8+1*8] = deblock_ref_table(l,ref[ir + 0*s8x8]);
+ h->mb.cache.ref[l][i8+2*8] =
+ h->mb.cache.ref[l][i8+3*8] = deblock_ref_table(l,ref[ir + 1*s8x8]);
+
+ CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
+ CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+ }
+
+ int ref0 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+0*s8x8]);
+ int ref1 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+0*s8x8]);
+ int ref2 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+1*s8x8]);
+ int ref3 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+1*s8x8]);
+ uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
+ uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
+
+ M32( &h->mb.cache.ref[l][x264_scan8[0]+8*0] ) = reftop;
+ M32( &h->mb.cache.ref[l][x264_scan8[0]+8*1] ) = reftop;
+ M32( &h->mb.cache.ref[l][x264_scan8[0]+8*2] ) = refbot;
+ M32( &h->mb.cache.ref[l][x264_scan8[0]+8*3] ) = refbot;
+ CP128( h->mb.cache.mv[l][x264_scan8[0]+8*0], mv[h->mb.i_b4_xy+0*s4x4] );
+ CP128( h->mb.cache.mv[l][x264_scan8[0]+8*1], mv[h->mb.i_b4_xy+1*s4x4] );
+ CP128( h->mb.cache.mv[l][x264_scan8[0]+8*2], mv[h->mb.i_b4_xy+2*s4x4] );
+ CP128( h->mb.cache.mv[l][x264_scan8[0]+8*3], mv[h->mb.i_b4_xy+3*s4x4] );
+ }
+}
+
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
{
int w = i ? 8 : 16;
void x264_macroblock_slice_init( x264_t *h );
void x264_macroblock_thread_init( x264_t *h );
void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
+void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] );
+#define deblock_ref_table(l,x) deblock_ref_table[l][x+2]
void x264_macroblock_cache_save( x264_t *h );
void x264_macroblock_bipred_init( x264_t *h );
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
SECTION .text
movd %8, m5
%endmacro
-%macro SBUTTERFLY 4
+%macro SBUTTERFLY3 4
movq %4, %2
punpckl%1 %2, %3
punpckh%1 %4, %3
movq m4, %5
movq m5, %6
movq m6, %7
- SBUTTERFLY bw, m0, m1, m7
- SBUTTERFLY bw, m2, m3, m1
- SBUTTERFLY bw, m4, m5, m3
+ SBUTTERFLY3 bw, m0, m1, m7
+ SBUTTERFLY3 bw, m2, m3, m1
+ SBUTTERFLY3 bw, m4, m5, m3
movq [%9+0x10], m1
- SBUTTERFLY bw, m6, %8, m5
- SBUTTERFLY wd, m0, m2, m1
- SBUTTERFLY wd, m4, m6, m2
+ SBUTTERFLY3 bw, m6, %8, m5
+ SBUTTERFLY3 wd, m0, m2, m1
+ SBUTTERFLY3 wd, m4, m6, m2
punpckhdq m0, m4
movq [%9+0x00], m0
- SBUTTERFLY wd, m7, [%9+0x10], m6
- SBUTTERFLY wd, m3, m5, m4
- SBUTTERFLY dq, m7, m3, m0
- SBUTTERFLY dq, m1, m2, m5
+ SBUTTERFLY3 wd, m7, [%9+0x10], m6
+ SBUTTERFLY3 wd, m3, m5, m4
+ SBUTTERFLY3 dq, m7, m3, m0
+ SBUTTERFLY3 dq, m1, m2, m5
punpckldq m6, m4
movq [%9+0x10], m1
movq [%9+0x20], m5
movq m4, %5
movq m5, %6
movq m6, %7
- SBUTTERFLY bw, m0, m1, m7
- SBUTTERFLY bw, m2, m3, m1
- SBUTTERFLY bw, m4, m5, m3
- SBUTTERFLY bw, m6, %8, m5
+ SBUTTERFLY3 bw, m0, m1, m7
+ SBUTTERFLY3 bw, m2, m3, m1
+ SBUTTERFLY3 bw, m4, m5, m3
+ SBUTTERFLY3 bw, m6, %8, m5
movq %9, m3
- SBUTTERFLY wd, m0, m2, m3
- SBUTTERFLY wd, m4, m6, m2
- SBUTTERFLY wd, m7, m1, m6
+ SBUTTERFLY3 wd, m0, m2, m3
+ SBUTTERFLY3 wd, m4, m6, m2
+ SBUTTERFLY3 wd, m7, m1, m6
movq %11, m2
movq m2, %9
- SBUTTERFLY wd, m2, m5, m1
- SBUTTERFLY dq, m0, m4, m5
- SBUTTERFLY dq, m7, m2, m4
+ SBUTTERFLY3 wd, m2, m5, m1
+ SBUTTERFLY3 dq, m0, m4, m5
+ SBUTTERFLY3 dq, m7, m2, m4
movq %9, m0
movq %10, m5
movq %13, m7
movq %14, m4
- SBUTTERFLY dq, m3, %11, m0
- SBUTTERFLY dq, m6, m1, m5
+ SBUTTERFLY3 dq, m3, %11, m0
+ SBUTTERFLY3 dq, m6, m1, m5
movq %11, m3
movq %12, m0
movq %15, m6
paddb m1, m5
paddb m2, m6
ret
+
+;-----------------------------------------------------------------------------
+; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
+; uint8_t bs[2][4][4], int mvy_limit, int bframe, int step,
+; int first_edge_only )
+;-----------------------------------------------------------------------------
+
+%define scan8start (4+1*8)
+%define nnz r0+scan8start
+%define ref r1+scan8start
+%define mv r2+scan8start*4
+%define bs0 r3
+%define bs1 r3+16
+
+%macro LOAD_BYTES_MMX 1
+ movd m2, [%1+8*0-1]
+ movd m0, [%1+8*0]
+ movd m3, [%1+8*2-1]
+ movd m1, [%1+8*2]
+ punpckldq m2, [%1+8*1-1]
+ punpckldq m0, [%1+8*1]
+ punpckldq m3, [%1+8*3-1]
+ punpckldq m1, [%1+8*3]
+%endmacro
+
+%macro DEBLOCK_STRENGTH_REFS_MMX 0
+ LOAD_BYTES_MMX ref
+ pxor m2, m0
+ pxor m3, m1
+ por m2, [bs0+0]
+ por m3, [bs0+8]
+ movq [bs0+0], m2
+ movq [bs0+8], m3
+
+ movd m2, [ref-8*1]
+ movd m3, [ref+8*1]
+ punpckldq m2, m0 ; row -1, row 0
+ punpckldq m3, m1 ; row 1, row 2
+ pxor m0, m2
+ pxor m1, m3
+ por m0, [bs1+0]
+ por m1, [bs1+8]
+ movq [bs1+0], m0
+ movq [bs1+8], m1
+%endmacro
+
+%macro DEBLOCK_STRENGTH_MVS_MMX 2
+ mova m0, [mv-%2]
+ mova m1, [mv-%2+8]
+ psubw m0, [mv]
+ psubw m1, [mv+8]
+ packsswb m0, m1
+ ABSB m0, m1
+ psubusb m0, m7
+ packsswb m0, m0
+ por m0, [%1]
+ movd [%1], m0
+%endmacro
+
+%macro DEBLOCK_STRENGTH_NNZ_MMX 1
+ por m2, m0
+ por m3, m1
+ mova m4, [%1]
+ mova m5, [%1+8]
+ pminub m2, m6
+ pminub m3, m6
+ pminub m4, m6 ; mv ? 1 : 0
+ pminub m5, m6
+ paddb m2, m2 ; nnz ? 2 : 0
+ paddb m3, m3
+ pmaxub m2, m4
+ pmaxub m3, m5
+%endmacro
+
+%macro LOAD_BYTES_XMM 1
+ movu m0, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
+ movu m1, [%1+12]
+ mova m2, m0
+ pslldq m0, 1
+ shufps m2, m1, 0xdd ; cur nnz, all rows
+ pslldq m1, 1
+ shufps m0, m1, 0xdd ; left neighbors
+ mova m1, m2
+ movd m3, [%1-8] ; could be palignr if nnz was aligned
+ pslldq m1, 4
+ por m1, m3 ; top neighbors
+%endmacro
+
+INIT_MMX
+cglobal deblock_strength_mmxext, 6,6
+ ; Prepare mv comparison register
+ shl r4d, 8
+ add r4d, 3 - (1<<8)
+ movd m7, r4d
+ SPLATW m7
+ mova m6, [pb_1]
+ pxor m0, m0
+ mova [bs0+0], m0
+ mova [bs0+8], m0
+ mova [bs1+0], m0
+ mova [bs1+8], m0
+
+.lists:
+ DEBLOCK_STRENGTH_REFS_MMX
+ mov r4d, 4
+.mvs:
+ DEBLOCK_STRENGTH_MVS_MMX bs0, 4
+ DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
+ add r2, 4*8
+ add r3, 4
+ dec r4d
+ jg .mvs
+ add r1, 40
+ add r2, 4*8
+ sub r3, 16
+ dec r5d
+ jge .lists
+
+ ; Check nnz
+ LOAD_BYTES_MMX nnz
+ DEBLOCK_STRENGTH_NNZ_MMX bs0
+ ; Transpose column output
+ SBUTTERFLY bw, 2, 3, 4
+ SBUTTERFLY bw, 2, 3, 4
+ mova [bs0+0], m2
+ mova [bs0+8], m3
+ movd m2, [nnz-8*1]
+ movd m3, [nnz+8*1]
+ punpckldq m2, m0 ; row -1, row 0
+ punpckldq m3, m1 ; row 1, row 2
+ DEBLOCK_STRENGTH_NNZ_MMX bs1
+ mova [bs1+0], m2
+ mova [bs1+8], m3
+ RET
+
+%macro DEBLOCK_STRENGTH_XMM 1
+cglobal deblock_strength_%1, 6,6,7
+ ; Prepare mv comparison register
+ shl r4d, 8
+ add r4d, 3 - (1<<8)
+ movd m6, r4d
+ SPLATW m6
+ pxor m4, m4 ; bs0
+ pxor m5, m5 ; bs1
+
+.lists:
+ ; Check refs
+ LOAD_BYTES_XMM ref
+ pxor m0, m2
+ pxor m1, m2
+ por m4, m0
+ por m5, m1
+
+ ; Check mvs
+ movu m0, [mv-4+4*8*0]
+ movu m1, [mv-4+4*8*1]
+ movu m2, [mv-4+4*8*2]
+ movu m3, [mv-4+4*8*3]
+ psubw m0, [mv+4*8*0]
+ psubw m1, [mv+4*8*1]
+ psubw m2, [mv+4*8*2]
+ psubw m3, [mv+4*8*3]
+ packsswb m0, m1
+ packsswb m2, m3
+ ABSB2 m0, m2, m1, m3
+ psubusb m0, m6
+ psubusb m2, m6
+ packsswb m0, m2
+ por m4, m0
+
+ mova m0, [mv+4*8*-1]
+ mova m1, [mv+4*8* 0]
+ mova m2, [mv+4*8* 1]
+ mova m3, [mv+4*8* 2]
+ psubw m0, m1
+ psubw m1, m2
+ psubw m2, m3
+ psubw m3, [mv+4*8* 3]
+ packsswb m0, m1
+ packsswb m2, m3
+ ABSB2 m0, m2, m1, m3
+ psubusb m0, m6
+ psubusb m2, m6
+ packsswb m0, m2
+ por m5, m0
+ add r1, 40
+ add r2, 4*8*5
+ dec r5d
+ jge .lists
+
+ ; Check nnz
+ LOAD_BYTES_XMM nnz
+ por m0, m2
+ por m1, m2
+ mova m6, [pb_1]
+ pminub m0, m6
+ pminub m1, m6
+ pminub m4, m6 ; mv ? 1 : 0
+ pminub m5, m6
+ paddb m0, m0 ; nnz ? 2 : 0
+ paddb m1, m1
+ pmaxub m4, m0
+ pmaxub m5, m1
+%ifidn %1,ssse3
+ pshufb m4, [transpose_shuf]
+%else
+ movhlps m3, m4
+ punpcklbw m4, m3
+ movhlps m3, m4
+ punpcklbw m4, m3
+%endif
+ mova [bs1], m5
+ mova [bs0], m4
+ RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_STRENGTH_XMM sse2
+%define ABSB2 ABSB2_SSSE3
+DEBLOCK_STRENGTH_XMM ssse3
pabsw %2, %2
%endmacro
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
+%macro ABSB_MMX 2
+ pxor %2, %2
+ psubb %2, %1
+ pminub %1, %2
+%endmacro
+
+%macro ABSB2_MMX 4
+ pxor %3, %3
+ pxor %4, %4
+ psubb %3, %1
+ psubb %4, %2
+ pminub %1, %3
+ pminub %2, %4
+%endmacro
+
+%macro ABSB_SSSE3 2
+ pabsb %1, %1
+%endmacro
+
+%macro ABSB2_SSSE3 4
+ pabsb %1, %1
+ pabsb %2, %2
+%endmacro
%macro ABS4 6
ABS2 %1, %2, %5, %6
ABS2 %3, %4, %5, %6
%endmacro
+%define ABS1 ABS1_MMX
+%define ABS2 ABS2_MMX
+%define ABSB ABSB_MMX
+%define ABSB2 ABSB2_MMX
+
%macro SPLATB_MMX 3
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
} \
}
- TEST_DEBLOCK( deblock_h_luma, 0, tcs[i] );
- TEST_DEBLOCK( deblock_v_luma, 1, tcs[i] );
- TEST_DEBLOCK( deblock_h_chroma, 0, tcs[i] );
- TEST_DEBLOCK( deblock_v_chroma, 1, tcs[i] );
- TEST_DEBLOCK( deblock_h_luma_intra, 0 );
- TEST_DEBLOCK( deblock_v_luma_intra, 1 );
- TEST_DEBLOCK( deblock_h_chroma_intra, 0 );
- TEST_DEBLOCK( deblock_v_chroma_intra, 1 );
+ TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
+ TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma[0], 0, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
+ TEST_DEBLOCK( deblock_luma_intra[0], 0 );
+ TEST_DEBLOCK( deblock_luma_intra[1], 1 );
+ TEST_DEBLOCK( deblock_chroma_intra[0], 0 );
+ TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
+
+ if( db_a.deblock_strength != db_ref.deblock_strength )
+ {
+ for( int i = 0; i < 100; i++ )
+ {
+ ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
+ ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
+ ALIGNED_ARRAY_16( int16_t, mv, [2][X264_SCAN8_LUMA_SIZE][2] );
+ ALIGNED_ARRAY_16( uint8_t, bs, [2][2][4][4] );
+ for( int j = 0; j < X264_SCAN8_SIZE; j++ )
+ nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
+ for( int j = 0; j < 2; j++ )
+ for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ )
+ {
+ ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
+ for( int l = 0; l < 2; l++ )
+ mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
+ }
+ set_func_name( "deblock_strength" );
+ call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1), 1, 0 );
+ call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1), 1, 0 );
+ if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
+ {
+ ok = 0;
+ fprintf( stderr, "deblock_strength: [FAILED]\n" );
+ for( int j = 0; j < 2; j++ )
+ {
+ for( int k = 0; k < 2; k++ )
+ for( int l = 0; l < 4; l++ )
+ {
+ for( int m = 0; m < 4; m++ )
+ printf("%d ",bs[j][k][l][m]);
+ printf("\n");
+ }
+ printf("\n");
+ }
+ break;
+ }
+ }
+ }
report( "deblock :" );