/* Current MB DCT coeffs */
struct
{
- DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
- DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
+ DECLARE_ALIGNED( int16_t, luma16x16_dc[16], 16 );
+ DECLARE_ALIGNED( int16_t, chroma_dc[2][4], 16 );
// FIXME merge with union
- DECLARE_ALIGNED( int, luma8x8[4][64], 16 );
+ DECLARE_ALIGNED( int16_t, luma8x8[4][64], 16 );
union
{
- DECLARE_ALIGNED( int, residual_ac[15], 16 );
- DECLARE_ALIGNED( int, luma4x4[16], 16 );
+ DECLARE_ALIGNED( int16_t, residual_ac[15], 16 );
+ DECLARE_ALIGNED( int16_t, luma4x4[16], 16 );
} block[16+8];
} dct;
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
DECLARE_ALIGNED( uint8_t, i4x4_fdec_buf[16*16], 16 );
DECLARE_ALIGNED( uint8_t, i8x8_fdec_buf[16*16], 16 );
- DECLARE_ALIGNED( int, i8x8_dct_buf[3][64], 16 );
- DECLARE_ALIGNED( int, i4x4_dct_buf[15][16], 16 );
+ DECLARE_ALIGNED( int16_t, i8x8_dct_buf[3][64], 16 );
+ DECLARE_ALIGNED( int16_t, i4x4_dct_buf[15][16], 16 );
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
}
-#define ZIG(i,y,x) level[i] = dct[x][y];
+// gcc pessimizes multi-dimensional arrays here, even with constant indices
+#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
-static void zigzag_scan_8x8_frame( int level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
}
-static void zigzag_scan_8x8_field( int level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)
ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)
ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
}
-static void zigzag_scan_4x4_frame( int level[16], int16_t dct[4][4] )
+#undef ZIG
+#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
+
+static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
-static void zigzag_scan_4x4_field( int level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
{
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
- ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
- ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
- ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
+ // With int16_t coefs, runs of the field scan that are already in memory
+ // order can be copied with single word/qword moves: level[0..1] = dct[0..1],
+ // level[6..7] = dct[6..7], and rows 2-3 (level[8..15] = dct[8..15]) scan
+ // linearly. Only level[2..5] need individual reordering via ZIG.
+ // NOTE(review): the uint32_t/uint64_t type-punning breaks strict aliasing --
+ // presumably the build uses -fno-strict-aliasing; confirm before reuse.
+ *(uint32_t*)level = *(uint32_t*)dct;
+ ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
+ *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
+ *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
+ *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
}
-static void zigzag_scan_4x4ac_frame( int level[15], int16_t dct[4][4] )
+static void zigzag_scan_4x4ac_frame( int16_t level[15], int16_t dct[4][4] )
{
ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
}
-static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] )
+static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] )
{
ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0)
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
}
#undef ZIG
-
#define ZIG(i,y,x) {\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
p_dst[od] = p_src[oe];\
}
-static void zigzag_sub_4x4_frame( int level[16], const uint8_t *p_src, uint8_t *p_dst )
+static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
-static void zigzag_sub_4x4_field( int level[16], const uint8_t *p_src, uint8_t *p_dst )
+static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
}
-static void zigzag_sub_4x4ac_frame( int level[15], const uint8_t *p_src, uint8_t *p_dst )
+static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
}
-static void zigzag_sub_4x4ac_field( int level[15], const uint8_t *p_src, uint8_t *p_dst )
+static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0)
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
pf->sub_4x4 = zigzag_sub_4x4_field;
pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#ifdef HAVE_MMX
-#ifdef ARCH_X86
- if( cpu&X264_CPU_MMX )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmx;
-#endif
- if( cpu&X264_CPU_SSE2 )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
+ if( cpu&X264_CPU_MMXEXT )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
#endif
#ifdef ARCH_PPC
typedef struct
{
- void (*scan_8x8)( int level[64], int16_t dct[8][8] );
- void (*scan_4x4)( int level[16], int16_t dct[4][4] );
- void (*scan_4x4ac)( int level[15], int16_t dct[4][4] );
- void (*sub_4x4)( int level[16], const uint8_t *p_src, uint8_t *p_dst );
- void (*sub_4x4ac)( int level[15], const uint8_t *p_src, uint8_t *p_dst );
+ // level output arrays are now int16_t, matching the int16_t dct coefficient
+ // type, so scans reduce to straight 16-bit copies/shuffles.
+ void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
+ void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
+ void (*scan_4x4ac)( int16_t level[15], int16_t dct[4][4] );
+ void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+ void (*sub_4x4ac)( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst );
} x264_zigzag_function_t;
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
-;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_field_mmx
- mov edx, [esp+8]
- mov ecx, [esp+4]
- punpcklwd mm0, [edx]
- punpckhwd mm1, [edx]
- punpcklwd mm2, [edx+8]
- punpckhwd mm3, [edx+8]
- punpcklwd mm4, [edx+16]
- punpckhwd mm5, [edx+16]
- punpcklwd mm6, [edx+24]
- punpckhwd mm7, [edx+24]
- psrad mm0, 16
- psrad mm1, 16
- psrad mm2, 16
- psrad mm3, 16
- psrad mm4, 16
- psrad mm5, 16
- psrad mm6, 16
- psrad mm7, 16
- movq [ecx ], mm0
- movq [ecx+16], mm2
- movq [ecx+24], mm3
- movq [ecx+32], mm4
- movq [ecx+40], mm5
- movq [ecx+48], mm6
- movq [ecx+56], mm7
- movq [ecx+12], mm1
- movd [ecx+ 8], mm2
- ret
%endif
+
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
+; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_field_sse2, 2,2
- punpcklwd xmm0, [r1]
- punpckhwd xmm1, [r1]
- punpcklwd xmm2, [r1+16]
- punpckhwd xmm3, [r1+16]
- psrad xmm0, 16
- psrad xmm1, 16
- psrad xmm2, 16
- psrad xmm3, 16
- movq [r0 ], xmm0
- movdqa [r0+16], xmm1
- movdqa [r0+32], xmm2
- movhlps xmm0, xmm0
- movdqa [r0+48], xmm3
- movq [r0+12], xmm0
- movd [r0+ 8], xmm1
+; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
+cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+; Only coefs 2..5 need reordering: pshufw imm 0xd2 = [2,0,1,3] picks words
+; dct[4,2,3,5] out of dct[2..5] -> stored to level[2..5] at [r0+4].
+ pshufw mm0, [r1+4], 0xd2
+; Rows 2-3 of the field scan are in natural order: level[8..15] = dct[8..15].
+ movq mm1, [r1+16]
+ movq mm2, [r1+24]
+ movq [r0+4], mm0
+ movq [r0+16], mm1
+ movq [r0+24], mm2
+; level[0..1] = dct[0..1] and level[6..7] = dct[6..7] are likewise in order;
+; copy each int16_t pair with a single 32-bit GPR move (r2 is the scratch reg
+; requested by the ",3" in cglobal).
+ mov r2d, [r1]
+ mov [r0], r2d
+ mov r2d, [r1+12]
+ mov [r0+12], r2d
RET
void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] );
+void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
#endif
static const int identity[16] =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
/****************************************************************************
* block_residual_write_cavlc:
****************************************************************************/
-static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *l, int i_count )
{
int level[16], run[16];
int i_total, i_trailing;
#define ZIG(i,y,x) level[i] = dct[x][y];
-static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] )
+static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
ZIG(0,0,0)
ZIG(1,0,1)
* for the complete mb: if score < 6 -> null
* chroma: for the complete mb: if score < 7 -> null
*/
-static int x264_mb_decimate_score( int *dct, int i_max )
+static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
static const int i_ds_table4[16] = {
3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
{
DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
- DECLARE_ALIGNED( int, dctscan[16], 16 );
+ DECLARE_ALIGNED( int16_t, dctscan[16], 16 );
int i_qp = h->mb.i_qp;
int mvp[2];
return 0;
}
-static inline int array_non_zero_count( int *v, int i_count )
+static inline int array_non_zero_count( int16_t *v, int i_count )
{
int i;
int i_nz;
x264_zigzag_function_t zigzag_ref;
x264_zigzag_function_t zigzag_asm;
- int32_t level1[64] __attribute__((aligned(16)));
- int32_t level2[64] __attribute__((aligned(16)));
+ int16_t level1[64] __attribute__((aligned(16)));
+ int16_t level2[64] __attribute__((aligned(16)));
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
used_asm = 1; \
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
- if( memcmp( t1, t2, size ) ) \
+ if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
call_c( zigzag_c.name, t1, buf2, buf3 ); \
call_a( zigzag_asm.name, t2, buf2, buf4 ); \
- if( memcmp( t1, t2, size )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
+ if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
x264_zigzag_init( cpu_new, &zigzag_asm, 0 );
ok = 1; used_asm = 0;
- TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
- TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
- TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
- TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
- TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
+ TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
+ TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
+ TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 );
+ TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+ TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 );
report( "zigzag_frame :" );
x264_zigzag_init( 0, &zigzag_c, 1 );
x264_zigzag_init( cpu_new, &zigzag_asm, 1 );
ok = 1; used_asm = 0;
- TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
- TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
- TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
- TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
- TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
+ TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
+ TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
+ TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 );
+ TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+ TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 );
report( "zigzag_field :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB