Integrate array_non_zero with the CAVLC 8x8dct interleave function.
Roughly 1.5-2x faster than the original separate array_non_zero method.
#undef ZIG
#undef COPY4x4
-static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
+static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz ) /* split 64 coeffs from src into 4 lists of 16 in dst; nnz receives a 0/1 flag per list */
{
int i,j;
for( i=0; i<4; i++ )
+ { /* one iteration per output list */
+ int nz = 0; /* bitwise OR of every coefficient copied into list i */
for( j=0; j<16; j++ )
+ {
+ nz |= src[i+j*4]; /* list i takes every 4th source coefficient, starting at index i */
dst[i*16+j] = src[i+j*4];
+ }
+ nnz[(i&1) + (i>>1)*8] = !!nz; /* flags land at nnz[0], nnz[1], nnz[8], nnz[9]: 1 if the list holds any non-zero coeff, else 0 */
+ }
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
- void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
+ void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
+pb_1: times 8 db 1 ; eight bytes of 1; added to the pcmpeqb mask in x264_zigzag_interleave_8x8_cavlc_mmx to produce 0/1 flags
SECTION .text
movdqa [r0+16], xmm1
RET
-INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
- mov r2d, 24
-.loop:
- movq m0, [r1+r2*4+ 0]
- movq m1, [r1+r2*4+ 8]
- movq m2, [r1+r2*4+16]
- movq m3, [r1+r2*4+24]
+;-----------------------------------------------------------------------------
+; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+;-----------------------------------------------------------------------------
+
+%macro INTERLEAVE 1 ; %1 = byte offset (0/8/16/24) inside each 32-byte dst list; src is read at %1*4
+ movq m0, [r1+%1*4+ 0] ; load 4 x 8 bytes (sixteen 16-bit coefficients) from src
+ movq m1, [r1+%1*4+ 8]
+ movq m2, [r1+%1*4+16]
+ movq m3, [r1+%1*4+24]
TRANSPOSE4x4W 0,1,2,3,4
- movq [r0+r2+ 0], m0
- movq [r0+r2+32], m1
- movq [r0+r2+64], m2
- movq [r0+r2+96], m3
- sub r2d, 8
- jge .loop
- REP_RET
+ movq [r0+%1+ 0], m0 ; after TRANSPOSE4x4W (macro defined outside this view; presumably a 4x4 word transpose with m4 as scratch -- confirm), store 8 bytes to each of the four dst lists, 32 bytes apart
+ movq [r0+%1+32], m1
+ movq [r0+%1+64], m2
+ movq [r0+%1+96], m3
+%if %1
+ packsswb m0, m1 ; signed-saturating pack: a non-zero word stays a non-zero byte; m0 now covers lists 0 and 1
+ por m6, m2 ; OR this call's list-2 data into its accumulator
+ por m7, m3 ; same for list 3
+ por m5, m0 ; same for the packed lists 0 and 1
+%else
+ packsswb m0, m1 ; first call (%1 == 0): nothing accumulated yet, so seed the accumulators instead of OR-ing
+ SWAP m5, m0 ; SWAP is defined outside this view; presumably renames registers so m5/m6/m7 hold the seeds -- confirm
+ SWAP m6, m2
+ SWAP m7, m3
+%endif
+%endmacro
+
+INIT_MMX
+cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3 ; r0/r1/r2 presumably map to dst/src/nnz in prototype order (cglobal is defined outside this view) -- confirm
+ INTERLEAVE 0
+ INTERLEAVE 8
+ INTERLEAVE 16
+ INTERLEAVE 24
+ packsswb m6, m7 ; m6 = packed bytes for lists 2 and 3
+ packsswb m5, m6 ; m5 = two bytes per list, lists 0..3
+ packsswb m5, m5 ; one byte per list in the low 4 bytes (repeated in the upper 4)
+ pxor m0, m0
+ pcmpeqb m5, m0 ; 0xFF where a list is entirely zero, 0x00 otherwise
+ paddb m5, [pb_1 GLOBAL] ; add 1 to each byte: 0xFF -> 0, 0x00 -> 1, i.e. the same 0/1 flag as !!nz in the C version
+ movd r0d, m5 ; r0 is not used as a pointer past this point, so it is reused as scratch
+ mov [r2+0], r0w ; flags for lists 0 and 1 -> nnz[0], nnz[1]
+ shr r0d, 16
+ mov [r2+8], r0w ; flags for lists 2 and 3 -> nnz[8], nnz[9]
+ RET
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
+void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
- {
- h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
- for( i4 = 0; i4 < 4; i4++ )
- h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
- }
+ h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
}
for( i8 = i8start; i8 <= i8end; i8++ )
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
+#define TEST_INTERLEAVE( name, t1, t2, dct, size ) /* compare zigzag_c.name against zigzag_asm.name: t1/t2 receive the coefficient outputs, dct is the shared input of `size` int16_t values */ \
+ if( zigzag_asm.name != zigzag_ref.name ) /* skip when the asm table holds the same pointer as the reference table */ \
+ { \
+ for( j=0; j<100; j++ ) /* 100 randomized rounds */ \
+ { \
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ used_asm = 1; \
+ memcpy(dct, buf1, size*sizeof(int16_t)); /* start each round from the contents of buf1 */ \
+ for( i=0; i<size; i++ ) \
+ dct[i] = rand()&0x1F ? 0 : dct[i]; /* keep a coefficient only when the low 5 bits of rand() are all zero; everything else is zeroed */ \
+ memcpy(buf3, buf4, 10*sizeof(uint8_t)); /* make both nnz buffers start identical, so bytes neither implementation writes still compare equal */ \
+ call_c( zigzag_c.name, t1, dct, buf3 ); \
+ call_a( zigzag_asm.name, t2, dct, buf4 ); \
+ if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10*sizeof(uint8_t) ) ) /* both the coefficients and the first 10 nnz bytes must match */ \
+ { \
+ ok = 0; /* NOTE(review): a mismatch only clears ok; no diagnostic is printed and the loop keeps running */ \
+ } \
+ } \
+ }
+
interlace = 0;
x264_zigzag_init( 0, &zigzag_c, 0 );
x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
- TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_frame :" );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_field :" );
+
+ ok = 1; used_asm = 0;
+ TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+ report( "zigzag_interleave :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB