From: Loren Merritt Date: Mon, 15 Aug 2011 18:18:55 +0000 (+0000) Subject: SSSE3/SSE4 9-way fully merged i4x4 analysis (sad/satd_x9) X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3d82e875d06b9d1e15ca2baa16b1bd9640500972;p=libx264 SSSE3/SSE4 9-way fully merged i4x4 analysis (sad/satd_x9) i4x4 analysis cycles (per partition): penryn sandybridge 184-> 75 157-> 54 preset=superfast (sad) 281->165 225->124 preset=faster (satd with early termination) 332->165 263->124 preset=medium 379->165 297->124 preset=slower (satd without early termination) This is the first code in x264 that intentionally produces different behavior on different cpus: satd_x9 is implemented only on ssse3+ and checks all intra directions, whereas the old code (on fast presets) may early terminate after checking only some of them. There is no systematic difference on slow presets, though they still occasionally disagree about tiebreaks. For ease of debugging, add an option "--cpu-independent" to disable satd_x9 and any analogous future code. --- diff --git a/common/common.c b/common/common.c index 4aff737a..ce076e59 100644 --- a/common/common.c +++ b/common/common.c @@ -620,6 +620,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) } OPT2("deterministic", "n-deterministic") p->b_deterministic = atobool(value); + OPT("cpu-independent") + p->b_cpu_independent = atobool(value); OPT2("level", "level-idc") { if( !strcmp(value, "1b") ) diff --git a/common/osdep.h b/common/osdep.h index ad1ebe43..dbb0937f 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -114,6 +114,7 @@ #endif #define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) +#define ALIGNED_ARRAY_64( ... ) ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) #define UNINIT(x) x=x diff --git a/common/pixel.c b/common/pixel.c index 7da32153..91dc1b87 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -552,6 +552,12 @@ INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 ) #endif +// No C implementation of intra_satd_x9. See checkasm for its behavior, +// or see x264_mb_analyse_intra for the entirely different algorithm we +// use when lacking an asm implementation of it. 
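For reference, the contract these x9 primitives implement (and that checkasm verifies later in this patch) is equivalent to the following loop: evaluate all nine i4x4 prediction modes, add the caller-supplied bit cost for each mode, and return the winner packed as (mode << 16) | cost. The sketch below is illustrative only, with a made-up helper name; as the comment above says, no C version of it is actually shipped.

    #include "common/common.h"  /* pixel, FENC_STRIDE, FDEC_STRIDE, x264_predict_t, x264_pixel_cmp_t */

    /* Illustrative reference for intra_{sad,satd}_x9_4x4. The caller biases
     * bitcosts so that bitcosts[mode] is the cost of coding that mode
     * (0 for the predicted mode, 3*lambda for everything else). */
    static int intra_mbcmp_x9_4x4_ref( pixel *fenc, pixel *fdec, uint16_t *bitcosts,
                                       x264_predict_t predict_4x4[9], x264_pixel_cmp_t cmp_4x4 )
    {
        int best = 0xffff;                  /* low 16 bits: cost, high bits: mode */
        for( int mode = 0; mode < 9; mode++ )
        {
            predict_4x4[mode]( fdec );      /* the asm skips this round trip through fdec */
            int cost = cmp_4x4( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + bitcosts[mode];
            if( cost < (best & 0xffff) )
                best = (mode << 16) + cost;
        }
        return best;
    }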
+ + + /**************************************************************************** * structural similarity metric ****************************************************************************/ @@ -1045,6 +1051,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _ssse3 ); + pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; + pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; } INIT_ADS( _ssse3 ); if( !(cpu&X264_CPU_SLOW_ATOM) ) @@ -1087,6 +1095,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); + pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; + pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; @@ -1102,6 +1112,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); + pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; + pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; } INIT5( ssd, _avx ); #if ARCH_X86_64 diff --git a/common/pixel.h b/common/pixel.h index 6d2fc65c..c7ee0fbf 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -106,20 +106,24 @@ typedef struct int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ); - /* calculate satd or sad of V, H, and DC modes. - * may be NULL, in which case just use pred+satd instead. */ - void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] ); - void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] ); + /* calculate satd or sad of V, H, and DC modes. */ + void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); + /* find minimum satd or sad of all modes. + * may be NULL, in which case just use pred+satd instead. 
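+     * The return value packs the winning prediction mode in its top 16 bits and
+     * the winning cost (satd or sad plus that mode's bit cost) in its low 16 bits;
+     * bitcosts[mode] supplies the per-mode bit cost.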
*/ + int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); + int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); + int (*intra_sad_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index 78d1e555..be947eab 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -51,6 +51,7 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 +const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index d1cda71d..21170cdb 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -58,19 +58,46 @@ pb_pppm: times 4 db 1,1,1,-1 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0 +intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6 +intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4 +intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0 +intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11 +intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8 +intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9 +intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3 +intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1 +intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1 +pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007 +pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007 +intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15 + +intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6 +intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4 +intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0 +intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11 +intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8 +intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9 +intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1 +intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 +intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 + sw_f0: dq 0xfff0, 0 +sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 -sq_0f: times 1 dq 0xffffffff SECTION .text +cextern pb_0 +cextern pb_1 cextern pw_1 cextern pw_8 +cextern pw_16 cextern pw_64 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm +cextern pw_pmmpzzzz cextern hsub_mul ;============================================================================= @@ -839,8 +866,6 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 ; SATD ;============================================================================= -%define TRANS TRANS_SSE2 - %macro JDUP 2 %if cpuflag(sse4) ; just use shufps on anything post conroe @@ -1765,7 +1790,7 @@ cglobal hadamard_load ABSW m0, m0, m1 ; 4x1 sum %endmacro -%macro INTRA_SATDS_MMX 0 +%macro INTRA_X3_MMX 0 ;----------------------------------------------------------------------------- ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- @@ -1785,8 +1810,7 @@ cglobal intra_satd_x3_4x4, 3,3 SCALAR_HADAMARD 
left, 0, m4, m5 SCALAR_HADAMARD top, 0, m6, m5, m7 paddw m6, m4 - psrlw m6, 1 - paddw m6, [pw_8] + pavgw m6, [pw_16] pand m6, [sw_f0] ; dc SUM3x4 @@ -1968,7 +1992,413 @@ cglobal intra_satd_x3_8x8c, 0,6 movd [r2+8], m2 ; i8x8c_v satd ADD rsp, 72 RET -%endmacro ; INTRA_SATDS_MMX +%endmacro ; INTRA_X3_MMX + + + +%macro PRED4x4_LOWPASS 5 + mova %5, %2 + pavgb %2, %3 + pxor %3, %5 + pand %3, [pb_1] + psubusb %2, %3 + pavgb %1, %4, %2 +%endmacro + +%macro INTRA_X9_PRED 2 +%if cpuflag(sse4) + movu m1, [r1-1*FDEC_STRIDE-8] + pinsrb m1, [r1+3*FDEC_STRIDE-1], 0 + pinsrb m1, [r1+2*FDEC_STRIDE-1], 1 + pinsrb m1, [r1+1*FDEC_STRIDE-1], 2 + pinsrb m1, [r1+0*FDEC_STRIDE-1], 3 +%else + movd mm0, [r1+3*FDEC_STRIDE-4] + punpcklbw mm0, [r1+2*FDEC_STRIDE-4] + movd mm1, [r1+1*FDEC_STRIDE-4] + punpcklbw mm1, [r1+0*FDEC_STRIDE-4] + punpckhwd mm0, mm1 + psrlq mm0, 32 + movq2dq m0, mm0 + movu m1, [r1-1*FDEC_STRIDE-8] + movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7 +%endif ; cpuflag + pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ + psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ + psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __ + pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __ + mova %2, m1 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __ + ; ddl ddr + ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2 + ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1 + ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0 + ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt + pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1 + pshufb m3, m0, [%1_ddlr2] ; rows 2,3 + ; hd hu + ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2 + ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3 + ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3 + ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3 + pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 + palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt + pshufb m6, m7, [%1_hdu1] + pshufb m7, m7, [%1_hdu2] + ; vr vl + ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4 + ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4 + ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5 + ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 + psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ... + palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 + pshufb m4, m5, [%1_vrl1] + pshufb m5, m5, [%1_vrl2] +%endmacro ; INTRA_X9_PRED + +%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp + pshufb m%1, [intrax9b_vh1] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3 + pmaddubsw m%1, [hmul_4p] + pshufhw m0, m%1, q2301 + pshuflw m0, m0, q2301 + psignw m%1, [pw_pmpmpmpm] + paddw m0, m%1 + psllw m0, 2 ; hadamard(top), hadamard(left) + mova m1, m0 + mova m2, m0 + movhlps m3, m0 + pshufb m1, [intrax9b_v1] + pshufb m2, [intrax9b_v2] + paddw m0, m3 + psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated? + pavgw m0, [pw_16] + pand m0, [sw_f0] ; dc + ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be + ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs. + ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef. 
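+    ; The V, H and DC predictions are flat along columns, rows, or both, so their
+    ; 4x4 Hadamard transforms are nonzero only in the first row, the first column,
+    ; or the DC coefficient. The two HADAMARD passes below transform the source
+    ; block; differencing hadamard(top), hadamard(left) and dc against the
+    ; corresponding source coefficients then gives all three SATDs without ever
+    ; writing a predicted block, the same trick intra_satd_x3_4x4 uses.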
+ HADAMARD 0, sumsub, %2, %3, %4, %5 + HADAMARD 1, sumsub, %2, %3, %4, %5 + psubw m3, m%2 + psubw m0, m%2 + psubw m1, m%2 + psubw m2, m%3 + pabsw m%3, m%3 + pabsw m3, m3 + pabsw m0, m0 + pabsw m1, m1 + pabsw m2, m2 + pavgw m3, m%3 + pavgw m0, m%3 + pavgw m1, m2 +%if cpuflag(sse4) + phaddw m3, m0 +%else + SBUTTERFLY qdq, 3, 0, 2 + paddw m3, m0 +%endif + movhlps m2, m1 + paddw m1, m2 + phaddw m1, m3 + pmaddwd m1, [pw_1] ; v, _, h, dc +%endmacro ; INTRA_X9_VHDC + +%macro INTRA_X9_END 1 +%if cpuflag(sse4) + phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu + movd eax, m0 + add eax, 1<<16 + cmp ax, r1w + cmovge eax, r1d +%else +%if %1 + ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index + psllw m0, 3 + paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu +%else + ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index + psllw m0, 2 + paddusw m0, m0 + paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu +%endif + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + pminsw m0, m1 + pshuflw m1, m0, q0001 + pminsw m0, m1 + movd eax, m0 + movsx r2d, ax + and eax, 7 + sar r2d, 3 + shl eax, 16 + ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits + ; 1<<12: undo sign manipulation + lea eax, [rax+r2+(1<<16)+(1<<12)] + cmp ax, r1w + cmovge eax, r1d +%endif ; cpuflag +%endmacro ; INTRA_X9_END + +%macro INTRA_X9 0 +;----------------------------------------------------------------------------- +; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) +;----------------------------------------------------------------------------- +cglobal intra_sad_x9_4x4, 3,3,9 +%ifdef ARCH_X86_64 + INTRA_X9_PRED intrax9a, m8 +%else + sub rsp, 0x1c + INTRA_X9_PRED intrax9a, [rsp] +%endif +%if cpuflag(sse4) + movd m0, [r0+0*FENC_STRIDE] + pinsrd m0, [r0+1*FENC_STRIDE], 1 + movd m1, [r0+2*FENC_STRIDE] + pinsrd m1, [r0+3*FENC_STRIDE], 1 +%else + movd mm0, [r0+0*FENC_STRIDE] + punpckldq mm0, [r0+1*FENC_STRIDE] + movd mm1, [r0+2*FENC_STRIDE] + punpckldq mm1, [r0+3*FENC_STRIDE] + movq2dq m0, mm0 + movq2dq m1, mm1 +%endif + punpcklqdq m0, m0 + punpcklqdq m1, m1 + psadbw m2, m0 + psadbw m3, m1 + psadbw m4, m0 + psadbw m5, m1 + psadbw m6, m0 + psadbw m7, m1 + paddd m2, m3 + paddd m4, m5 + paddd m6, m7 +%ifdef ARCH_X86_64 + SWAP 7, 8 + pxor m8, m8 + %define %%zero m8 +%else + mova m7, [rsp] + %define %%zero [pb_0] +%endif + mova m3, m7 + mova m5, m7 + pshufb m7, [intrax9a_dc] + pshufb m3, [intrax9a_vh1] + psadbw m7, %%zero + pshufb m5, [intrax9a_vh2] + psrlw m7, 2 + psadbw m3, m0 + pavgw m7, %%zero + pshufb m7, %%zero + psadbw m5, m1 + psadbw m0, m7 + paddd m3, m5 + psadbw m1, m7 + paddd m0, m1 + movzx r1d, word [r2] + movd r0d, m3 ; v + add r1d, r0d + punpckhqdq m3, m0 ; h, dc + shufps m3, m2, q2020 + psllq m6, 32 + por m4, m6 + movu m0, [r2+2] + packssdw m3, m4 + paddw m0, m3 + INTRA_X9_END 1 +%ifndef ARCH_X86_64 + add rsp, 0x1c +%endif + RET + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) +;----------------------------------------------------------------------------- +cglobal intra_satd_x9_4x4, 3,3,16 + INTRA_X9_PRED intrax9b, m15 + movd m8, [r0+0*FENC_STRIDE] + movd m9, [r0+1*FENC_STRIDE] + movd m10, [r0+2*FENC_STRIDE] + movd m11, [r0+3*FENC_STRIDE] + mova m12, [hmul_8p] + pshufd m8, m8, 0 + pshufd m9, m9, 0 + pshufd m10, m10, 0 + pshufd m11, m11, 0 + pmaddubsw m8, m12 + pmaddubsw 
m9, m12 + pmaddubsw m10, m12 + pmaddubsw m11, m12 + movddup m0, m2 + pshufd m1, m2, q3232 + movddup m2, m3 + movhlps m3, m3 + call .satd_8x4 ; ddr, ddl + movddup m2, m5 + pshufd m3, m5, q3232 + mova m5, m0 + movddup m0, m4 + pshufd m1, m4, q3232 + call .satd_8x4 ; vr, vl + movddup m2, m7 + pshufd m3, m7, q3232 + mova m4, m0 + movddup m0, m6 + pshufd m1, m6, q3232 + call .satd_8x4 ; hd, hu +%if cpuflag(sse4) + punpckldq m4, m0 +%else + punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't +%endif + mova m1, [pw_ppmmppmm] + psignw m8, m1 + psignw m10, m1 + paddw m8, m9 + paddw m10, m11 + INTRA_X9_VHDC 15, 8, 10, 6, 7 + ; find minimum + movu m0, [r2+2] + movd r1d, m1 + palignr m5, m1, 8 +%if notcpuflag(sse4) + pshufhw m0, m0, q3120 ; compensate for different order in unpack +%endif + packssdw m5, m4 + paddw m0, m5 + movzx r0d, word [r2] + add r1d, r0d + INTRA_X9_END 0 + RET +RESET_MM_PERMUTATION +ALIGN 16 +.satd_8x4: + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmaddubsw m2, m12 + pmaddubsw m3, m12 + psubw m0, m8 + psubw m1, m9 + psubw m2, m10 + psubw m3, m11 + SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap + pmaddwd m0, [pw_1] +%if cpuflag(sse4) + pshufd m1, m0, q0032 +%else + movhlps m1, m0 +%endif + paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free + ret + +%else ; !ARCH_X86_64 +cglobal intra_satd_x9_4x4, 3,3,8 + sub rsp, 0x9c + INTRA_X9_PRED intrax9b, [rsp+0x80] + mova [rsp+0x40], m4 + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + movd m4, [r0+0*FENC_STRIDE] + movd m5, [r0+1*FENC_STRIDE] + movd m6, [r0+2*FENC_STRIDE] + movd m0, [r0+3*FENC_STRIDE] + mova m7, [hmul_8p] + pshufd m4, m4, 0 + pshufd m5, m5, 0 + pshufd m6, m6, 0 + pshufd m0, m0, 0 + pmaddubsw m4, m7 + pmaddubsw m5, m7 + pmaddubsw m6, m7 + pmaddubsw m0, m7 + mova [rsp+0x00], m4 + mova [rsp+0x10], m5 + mova [rsp+0x20], m6 + mova [rsp+0x30], m0 + movddup m0, m2 + pshufd m1, m2, q3232 + movddup m2, m3 + movhlps m3, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + call .satd_8x4b ; ddr, ddl + mova m3, [rsp+0x50] + mova m1, [rsp+0x40] + movddup m2, m3 + movhlps m3, m3 + movq [rsp+0x48], m0 + movddup m0, m1 + movhlps m1, m1 + call .satd_8x4 ; vr, vl + mova m3, [rsp+0x70] + mova m1, [rsp+0x60] + movddup m2, m3 + movhlps m3, m3 + movq [rsp+0x50], m0 + movddup m0, m1 + movhlps m1, m1 + call .satd_8x4 ; hd, hu + movq [rsp+0x58], m0 + mova m1, [rsp+0x80] + mova m4, [rsp+0x00] + mova m5, [rsp+0x20] + mova m2, [pw_ppmmppmm] + psignw m4, m2 + psignw m5, m2 + paddw m4, [rsp+0x10] + paddw m5, [rsp+0x30] + INTRA_X9_VHDC 1, 4, 5, 6, 7 + ; find minimum + movu m0, [r2+2] + movd r1d, m1 + movhlps m1, m1 + movhps m1, [rsp+0x48] +%if cpuflag(sse4) + pshufd m2, [rsp+0x50], q3120 + packssdw m1, m2 +%else + packssdw m1, [rsp+0x50] + pshufhw m0, m0, q3120 +%endif + paddw m0, m1 + movzx r0d, word [r2] + add r1d, r0d + INTRA_X9_END 0 + add rsp, 0x9c + RET +RESET_MM_PERMUTATION +ALIGN 16 +.satd_8x4: + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + psubw m0, [rsp+0x00+gprsize] + psubw m1, [rsp+0x10+gprsize] + psubw m2, [rsp+0x20+gprsize] +.satd_8x4b: + psubw m3, [rsp+0x30+gprsize] + SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap + pmaddwd m0, [pw_1] +%if cpuflag(sse4) + pshufd m1, m0, q0032 +%else + movhlps m1, m0 +%endif + paddd xmm0, m0, m1 + ret +%endif ; ARCH +%endmacro ; INTRA_X9 + ; in: 
r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 @@ -2439,7 +2869,7 @@ SATDS_SSE2 %ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 INIT_MMX mmx2 -INTRA_SATDS_MMX +INTRA_X3_MMX %endif INIT_XMM sse2 HADAMARD_AC_SSE2 @@ -2455,13 +2885,16 @@ INIT_XMM ssse3 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%ifndef HIGH_BIT_DEPTH +INTRA_X9 +%endif %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps %ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 INIT_MMX ssse3 -INTRA_SATDS_MMX +INTRA_X3_MMX %endif %define TRANS TRANS_SSE4 @@ -2470,12 +2903,16 @@ INIT_XMM sse4 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%ifndef HIGH_BIT_DEPTH +INTRA_X9 +%endif INIT_XMM avx SATDS_SSE2 SA8D %ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 +INTRA_X9 %endif HADAMARD_AC_SSE2 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 2de815df..d5cdb4a4 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -113,6 +113,12 @@ void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * ); +int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, diff --git a/encoder/analyse.c b/encoder/analyse.c index f4652e74..69de5174 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -276,6 +276,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); static uint16_t x264_cost_ref[QP_MAX+1][3][33]; static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER; +static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32]; float *x264_analyse_prepare_costs( x264_t *h ) { @@ -316,6 +317,9 @@ int x264_analyse_init_costs( x264_t *h, float *logs, int qp ) h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j]; } } + uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32; + for( int i = 0; i < 17; i++ ) + cost_i4x4_mode[i] = 3*lambda*(i!=8); return 0; fail: return -1; @@ -938,6 +942,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ { int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */ int i_satd_thresh = a->b_early_terminate ? 
X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX; + uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8; h->mb.i_cbp_luma = 0; if( a->b_early_terminate && a->i_mbrd ) @@ -959,51 +964,63 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ /* emulate missing topright samples */ MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] ); - if( !h->mb.b_lossless && predict_mode[5] >= 0 ) + if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 ) { - int satd[9]; - h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); - int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; - satd[i_pred_mode] -= 3 * lambda; - for( int i = 2; i >= 0; i-- ) - COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i ); - - /* Take analysis shortcuts: don't analyse modes that are too - * far away direction-wise from the favored mode. */ - if( a->i_mbrd < 1 + a->b_fast_intra ) - predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; - else - predict_mode += 3; + /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */ + i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode ); + a->i_predict4x4[idx] = i_best >> 16; + i_best &= 0xffff; } - - if( i_best > 0 ) + else { - for( ; *predict_mode >= 0; predict_mode++ ) + if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int i_satd; - int i_mode = *predict_mode; - - if( h->mb.b_lossless ) - x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode ); + int satd[9]; + h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); + int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; + satd[i_pred_mode] -= 3 * lambda; + i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC; + COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H ); + COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V ); + + /* Take analysis shortcuts: don't analyse modes that are too + * far away direction-wise from the favored mode. 
*/ + if( a->i_mbrd < 1 + a->b_fast_intra ) + predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; else - h->predict_4x4[i_mode]( p_dst_by ); + predict_mode += 3; + } - i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); - if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) + if( i_best > 0 ) + { + for( ; *predict_mode >= 0; predict_mode++ ) { - i_satd -= lambda * 3; - if( i_satd <= 0 ) + int i_satd; + int i_mode = *predict_mode; + + if( h->mb.b_lossless ) + x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode ); + else + h->predict_4x4[i_mode]( p_dst_by ); + + i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); + if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) { - i_best = i_satd; - a->i_predict4x4[idx] = i_mode; - break; + i_satd -= lambda * 3; + if( i_satd <= 0 ) + { + i_best = i_satd; + a->i_predict4x4[idx] = i_mode; + break; + } } - } - COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); + COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); + } } + i_best += 3 * lambda; } - i_cost += i_best + 3 * lambda; + i_cost += i_best; if( i_cost > i_satd_thresh || idx == 15 ) break; diff --git a/encoder/encoder.c b/encoder/encoder.c index 1bb1ed30..987b39a4 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -930,6 +930,8 @@ static void mbcmp_init( x264_t *h ) h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8; h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; + h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL + : satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/tools/checkasm.c b/tools/checkasm.c index b25d745a..bb1fafc8 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -242,7 +242,8 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_pixel_function_t pixel_c; x264_pixel_function_t pixel_ref; x264_pixel_function_t pixel_asm; - x264_predict8x8_t predict_8x8[9+3]; + x264_predict_t predict_4x4[12]; + x264_predict8x8_t predict_8x8[12]; x264_predict_8x8_filter_t predict_8x8_filter; ALIGNED_16( pixel edge[36] ); uint16_t cost_mv[32]; @@ -251,6 +252,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_pixel_init( 0, &pixel_c ); x264_pixel_init( cpu_ref, &pixel_ref ); x264_pixel_init( cpu_new, &pixel_asm ); + x264_predict_4x4_init( 0, predict_4x4 ); x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter ); predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); @@ -437,7 +439,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) } report( "pixel vsad :" ); -#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \ +#define TEST_INTRA_X3( name, i8x8, ... 
) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ int res_c[3], res_asm[3]; \ @@ -454,17 +456,56 @@ static int check_pixel( int cpu_ref, int cpu_new ) } \ } +#define TEST_INTRA_X9( name, cmp ) \ + if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ + { \ + set_func_name( #name ); \ + used_asm = 1; \ + ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \ + for( int i=0; i<17; i++ ) \ + bitcosts[i] = 9*(i!=8); \ + for( int i=0; i<32; i++ ) \ + { \ + pixel *fenc = pbuf1+48+i*12; \ + pixel *fdec = pbuf3+48+i*12; \ + int pred_mode = i%9; \ + int res_c = INT_MAX; \ + for( int j=0; j<9; j++ ) \ + { \ + predict_4x4[j]( fdec ); \ + int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \ + if( cost < (uint16_t)res_c ) \ + res_c = cost + (j<<16); \ + } \ + int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \ + if( res_c != res_a ) \ + { \ + ok = 0; \ + fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ + break; \ + } \ + } \ + } + + memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); ok = 1; used_asm = 0; - TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 ); - TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 ); - TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 ); - TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge ); + TEST_INTRA_X3( intra_satd_x3_16x16, 0 ); + TEST_INTRA_X3( intra_satd_x3_8x8c, 0 ); + TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge ); + TEST_INTRA_X3( intra_satd_x3_4x4, 0 ); report( "intra satd_x3 :" ); - TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 ); - TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 ); - TEST_INTRA_MBCMP( intra_sad_x3_8x8 , predict_8x8 , sad [PIXEL_8x8] , 1, edge ); - TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 ); + ok = 1; used_asm = 0; + TEST_INTRA_X3( intra_sad_x3_16x16, 0 ); + TEST_INTRA_X3( intra_sad_x3_8x8c, 0 ); + TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge ); + TEST_INTRA_X3( intra_sad_x3_4x4, 0 ); report( "intra sad_x3 :" ); + ok = 1; used_asm = 0; + TEST_INTRA_X9( intra_satd_x9_4x4, satd ); + report( "intra satd_x9 :" ); + ok = 1; used_asm = 0; + TEST_INTRA_X9( intra_sad_x9_4x4, sad ); + report( "intra sad_x9 :" ); ok = 1; used_asm = 0; if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core ) diff --git a/x264.c b/x264.c index 9aa03bfd..72399569 100644 --- a/x264.c +++ b/x264.c @@ -756,6 +756,8 @@ static void help( x264_param_t *defaults, int longhelp ) H2( " --thread-input Run Avisynth in its own thread\n" ); H2( " --sync-lookahead Number of buffer frames for threaded lookahead\n" ); H2( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" ); + H2( " --cpu-independent Ensure exact reproducibility across different cpus,\n" + " as opposed to letting them select different algorithms\n" ); H2( " --asm Override CPU detection\n" ); H2( " --no-asm Disable all CPU optimizations\n" ); H2( " --visualize Show MB types overlayed on the encoded video\n" ); @@ -924,6 +926,7 @@ static struct option long_options[] = { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, + { "cpu-independent", no_argument, NULL, 0 }, { "psnr", no_argument, NULL, 0 }, { "ssim", no_argument, NULL, 0 }, { "quiet", no_argument, NULL, OPT_QUIET }, diff 
--git a/x264.h b/x264.h index f23569ee..2cdcfb7c 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 116 +#define X264_BUILD 117 /* x264_t: * opaque handler for encoder */ @@ -236,6 +236,7 @@ typedef struct x264_param_t int i_threads; /* encode multiple frames in parallel */ int b_sliced_threads; /* Whether to use slice-based threading. */ int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */ + int b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */ int i_sync_lookahead; /* threaded lookahead buffer */ /* Video Properties */
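As a usage note, the new flag can be set from the CLI (x264 --cpu-independent ...) or through the library API. The helper below is only a sketch with an illustrative name, assuming the usual default-preset/encoder-open flow:

    #include <stdint.h>
    #include "x264.h"

    /* Sketch: request reproducible analysis across cpus. This disables satd_x9
     * (and any analogous future cpu-dependent code) at some cost in speed. */
    static x264_t *open_cpu_independent_encoder( int width, int height )
    {
        x264_param_t param;
        if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
            return NULL;
        param.i_width  = width;
        param.i_height = height;
        param.b_cpu_independent = 1;                        /* set the field directly... */
        x264_param_parse( &param, "cpu-independent", "1" ); /* ...or via the parser, as the CLI does */
        return x264_encoder_open( &param );
    }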