~15% faster chroma encode by reorganizing the CBP calculation and adding a special-case idct_dc function, since most coded chroma blocks are DC-only.
Small optimization in cache_save (skipbp)
Fix array_non_zero so it doesn't violate strict aliasing (should eliminate miscompilation issues in the future)
Add automatic substitutions for some asm instructions that have an equivalent smaller encoding.
add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
+static inline void add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
+{
+ int i;
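+ /* The iDCT of a DC-only block is a constant residual, so the whole
+  * transform collapses to one rounded shift added to every pixel. */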
+ dc = (dc + 32) >> 6;
+ for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
+ {
+ p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
+ p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
+ p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
+ p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
+ }
+}
+
+static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
+{
+ add4x4_idct_dc( &p_dst[0], dct[0][0] );
+ add4x4_idct_dc( &p_dst[4], dct[0][1] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
+}
+
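For example, a dequantized chroma DC of 130 gives (130 + 32) >> 6 = 2, so add4x4_idct_dc just adds 2 to all 16 pixels of its quadrant (with clipping); no row/column transform passes are needed.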
/****************************************************************************
* x264_dct_init:
dctf->sub8x8_dct = sub8x8_dct;
dctf->add8x8_idct = add8x8_idct;
+ dctf->add8x8_idct_dc = add8x8_idct_dc;
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
}
+
+ if( cpu&X264_CPU_SSSE3 )
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
#endif //HAVE_MMX
#ifdef ARCH_PPC
void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
+ void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
h->mb.skipbp[i_mb_xy] = 0xf;
else if( i_mb_type == B_8x8 )
{
- int skipbp = 0;
- for( i = 0; i < 4; i++ )
- skipbp |= ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 ) << i;
+ int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
+ skipbp |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
+ skipbp |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
+ skipbp |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
h->mb.skipbp[i_mb_xy] = skipbp;
}
else
#define array_non_zero_int array_non_zero_int_c
static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
{
- uint64_t *x = v;
+ union {uint16_t s[4]; uint64_t l;} *x = v;
if(i_count == 8)
- return !!x[0];
+ return !!x[0].l;
else if(i_count == 16)
- return !!(x[0]|x[1]);
+ return !!(x[0].l|x[1].l);
else if(i_count == 32)
- return !!(x[0]|x[1]|x[2]|x[3]);
+ return !!(x[0].l|x[1].l|x[2].l|x[3].l);
else
{
int i;
i_count /= sizeof(uint64_t);
for( i = 0; i < i_count; i++ )
- if( x[i] ) return 1;
+ if( x[i].l ) return 1;
return 0;
}
}
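A minimal sketch of the hazard the union removes (hypothetical helper names, not part of the patch): dereferencing an int16_t coefficient array through a plain uint64_t* is undefined behavior, so GCC's alias analysis is free to assume the 16-bit stores and the 64-bit load don't conflict; punning through a union is the alternative GCC documents as well-defined.

    static int nz_bad( int16_t *coef )
    {   /* UB: int16_t data accessed through an incompatible uint64_t lvalue */
        return !!*(uint64_t*)coef;
    }
    static int nz_ok( int16_t *coef )
    {   /* union type punning, documented as supported by GCC */
        union { uint16_t s[4]; uint64_t l; } *x = (void*)coef;
        return !!x->l;
    }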
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
+pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
SECTION .text
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
+;-----------------------------------------------------------------------------
+; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
+;-----------------------------------------------------------------------------
+
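+; %1 holds max(dc,0) and %2 holds max(-dc,0), broadcast as bytes; paddusb
+; followed by psubusb then implements a signed DC add with free clipping to
+; [0,255], since MMX has no packed-byte signed add with unsigned saturation.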
+%macro ADD_DC 3
+ movq mm4, [%3+FDEC_STRIDE*0]
+ movq mm5, [%3+FDEC_STRIDE*1]
+ movq mm6, [%3+FDEC_STRIDE*2]
+ paddusb mm4, %1
+ paddusb mm5, %1
+ paddusb mm6, %1
+ paddusb %1, [%3+FDEC_STRIDE*3]
+ psubusb mm4, %2
+ psubusb mm5, %2
+ psubusb mm6, %2
+ psubusb %1, %2
+ movq [%3+FDEC_STRIDE*0], mm4
+ movq [%3+FDEC_STRIDE*1], mm5
+ movq [%3+FDEC_STRIDE*2], mm6
+ movq [%3+FDEC_STRIDE*3], %1
+%endmacro
+
+cglobal x264_add8x8_idct_dc_mmx, 2,2
+ movq mm0, [r1]
+ pxor mm1, mm1
+ add r0, FDEC_STRIDE*4
+ paddw mm0, [pw_32 GLOBAL]
+ psraw mm0, 6
+ psubw mm1, mm0
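+ ; mm0 = dc, mm1 = -dc; packuswb clamps each word to a byte, yielding
+ ; max(dc,0) and max(-dc,0), and the unpack/shuffle steps below broadcast
+ ; each DC across its 4-pixel-wide sub-block (mm0/mm1 top, mm2/mm3 bottom)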
+ packuswb mm0, mm0
+ packuswb mm1, mm1
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm2, mm0, 0xFA
+ pshufw mm3, mm1, 0xFA
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
+ ADD_DC mm2, mm3, r0
+ ret
+
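+; Same DC-splitting trick as the MMX version, but pshufb broadcasts all four
+; DCs in one step and each xmm register pairs a top-half row (movq) with the
+; matching bottom-half row (movhps).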
+cglobal x264_add8x8_idct_dc_ssse3, 2,2
+ movq xmm0, [r1]
+ pxor xmm1, xmm1
+ add r0, FDEC_STRIDE*4
+ paddw xmm0, [pw_32 GLOBAL]
+ psraw xmm0, 6
+ psubw xmm1, xmm0
+ movdqa xmm5, [pb_idctdc_unpack GLOBAL]
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movq xmm2, [r0+FDEC_STRIDE*-4]
+ movq xmm3, [r0+FDEC_STRIDE*-3]
+ movq xmm4, [r0+FDEC_STRIDE*-2]
+ movq xmm5, [r0+FDEC_STRIDE*-1]
+ movhps xmm2, [r0+FDEC_STRIDE* 0]
+ movhps xmm3, [r0+FDEC_STRIDE* 1]
+ movhps xmm4, [r0+FDEC_STRIDE* 2]
+ movhps xmm5, [r0+FDEC_STRIDE* 3]
+ paddusb xmm2, xmm0
+ paddusb xmm3, xmm0
+ paddusb xmm4, xmm0
+ paddusb xmm5, xmm0
+ psubusb xmm2, xmm1
+ psubusb xmm3, xmm1
+ psubusb xmm4, xmm1
+ psubusb xmm5, xmm1
+ movq [r0+FDEC_STRIDE*-4], xmm2
+ movq [r0+FDEC_STRIDE*-3], xmm3
+ movq [r0+FDEC_STRIDE*-2], xmm4
+ movq [r0+FDEC_STRIDE*-1], xmm5
+ movhps [r0+FDEC_STRIDE* 0], xmm2
+ movhps [r0+FDEC_STRIDE* 1], xmm3
+ movhps [r0+FDEC_STRIDE* 2], xmm4
+ movhps [r0+FDEC_STRIDE* 3], xmm5
+ ret
+
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
void x264_dct4x4dc_mmx ( int16_t d[4][4] );
void x264_idct4x4dc_mmx ( int16_t d[4][4] );
%endif
%endmacro
+; Substitutions that reduce instruction size but are functionally equivalent
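+; e.g. "add r0, 128" needs a 4-byte immediate because 128 doesn't fit in a
+; signed byte, while the equivalent "sub r0, -128" uses the sign-extended
+; imm8 form, typically saving 3 bytes per instruction.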
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
}
#undef ZIG
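+/* The 2x2 chroma DC dequant shift is qp/6 - 5; when positive it is folded
+ * into the multiplier up front so that the single ">> -qbits" used by both
+ * callers is always a valid non-negative right shift. */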
+#define IDCT_DEQUANT_START \
+ int d0 = dct[0][0] + dct[0][1]; \
+ int d1 = dct[1][0] + dct[1][1]; \
+ int d2 = dct[0][0] - dct[0][1]; \
+ int d3 = dct[1][0] - dct[1][1]; \
+ int dmf = dequant_mf[i_qp%6][0][0]; \
+ int qbits = i_qp/6 - 5; \
+ if( qbits > 0 ) \
+ { \
+ dmf <<= qbits; \
+ qbits = 0; \
+ }
+
static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
{
- int d0 = dct[0][0] + dct[0][1];
- int d1 = dct[1][0] + dct[1][1];
- int d2 = dct[0][0] - dct[0][1];
- int d3 = dct[1][0] - dct[1][1];
- int dmf = dequant_mf[i_qp%6][0][0];
- int qbits = i_qp/6 - 5;
- if( qbits > 0 )
- {
- dmf <<= qbits;
- qbits = 0;
- }
+ IDCT_DEQUANT_START
dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
}
+static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{
+ IDCT_DEQUANT_START
+ dct[0][0] = (d0 + d1) * dmf >> -qbits;
+ dct[0][1] = (d0 - d1) * dmf >> -qbits;
+ dct[1][0] = (d2 + d3) * dmf >> -qbits;
+ dct[1][1] = (d2 - d3) * dmf >> -qbits;
+}
+
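Both helpers are the 2x2 inverse Hadamard plus scaling: for input [[a,b],[c,d]] the outputs are (a+b+c+d), (a+b-c-d), (a-b+c-d) and (a-b-c+d), each multiplied by dmf and shifted. The _dconly variant writes the results back into the 2x2 array itself, which is exactly what add8x8_idct_dc consumes in the DC-only path below.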
static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
{
int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
- int i, ch;
+ int i, ch, nz;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+ h->mb.i_cbp_chroma = 0;
for( ch = 0; ch < 2; ch++ )
{
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
h->dct.luma4x4[16+i+ch*4][0] = 0;
+ nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
+ h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+ h->mb.i_cbp_chroma |= nz;
}
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
continue;
}
if( b_decimate && i_decimate_score < 7 )
{
- /* Near null chroma 8x8 block so make it null (bits saving) */
- memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
- if( !array_non_zero( dct2x2 ) )
+ /* Decimate the block */
+ h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
+ if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */
{
- memset( h->dct.chroma_dc[ch], 0, sizeof( h->dct.chroma_dc[ch] ) );
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
continue;
}
- memset( dct4x4, 0, sizeof( dct4x4 ) );
+ /* DC-only */
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
}
else
{
for( i = 0; i < 4; i++ )
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ {
+ nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] );
+ h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz;
+ h->mb.i_cbp_chroma |= nz;
+ if( nz )
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ }
+ /* Don't optimize for the AC-only case--it's very rare */
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 );
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct( p_dst, dct4x4 );
}
-
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
- h->dctf.add8x8_idct( p_dst, dct4x4 );
}
- /* coded block pattern */
- h->mb.i_cbp_chroma = 0;
- for( i = 0; i < 8; i++ )
- {
- int nz = array_non_zero( h->dct.luma4x4[16+i] );
- h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
- h->mb.i_cbp_chroma |= nz;
- }
- h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
- h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma )
h->mb.i_cbp_chroma = 2; /* dc+ac (we can't do only ac) */
else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
ok = 1; used_asm = 0;
TEST_IDCT( add4x4_idct, dct4 );
TEST_IDCT( add8x8_idct, dct4 );
+ TEST_IDCT( add8x8_idct_dc, dct4 );
TEST_IDCT( add16x16_idct, dct4 );
report( "add_idct4 :" );