From 422979198e492d5068034a3a5b1e4991af2b63a1 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Fri, 30 Sep 2011 19:09:19 -0700 Subject: [PATCH] SSSE3/SSE4/AVX 9-way fully merged i8x8 analysis (sad_x9) ~3 times faster than current analysis, plus (like intra_sad_x9_4x4) analyzes all modes without shortcuts. --- common/pixel.c | 3 + common/pixel.h | 3 + common/predict.c | 1 + common/x86/pixel-a.asm | 406 +++++++++++++++++++++++++++++++++++++++ common/x86/pixel.h | 3 + common/x86/predict-a.asm | 1 + common/x86/x86inc.asm | 2 + encoder/analyse.c | 98 ++++++---- encoder/encoder.c | 2 + encoder/macroblock.c | 2 +- encoder/macroblock.h | 21 +- encoder/rdo.c | 2 +- tools/checkasm.c | 67 +++++++ 13 files changed, 559 insertions(+), 52 deletions(-) diff --git a/common/pixel.c b/common/pixel.c index 208b2b83..f45dda4f 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -1072,6 +1072,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( hadamard_ac, _ssse3 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; + pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; } INIT_ADS( _ssse3 ); if( !(cpu&X264_CPU_SLOW_ATOM) ) @@ -1116,6 +1117,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( hadamard_ac, _sse4 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; + pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4; } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; @@ -1133,6 +1135,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( hadamard_ac, _avx ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; + pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx; } INIT5( ssd, _avx ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; diff --git a/common/pixel.h b/common/pixel.h index b291344e..8448a07e 
100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -137,6 +137,9 @@ typedef struct int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); int (*intra_sad_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); + int (*intra_mbcmp_x9_8x8)( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); + int (*intra_sa8d_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); + int (*intra_sad_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/common/predict.c b/common/predict.c index 8b8a6c5d..5d8094f7 100644 --- a/common/predict.c +++ b/common/predict.c @@ -634,6 +634,7 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbo edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0)) + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) + edge[6] = edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; } diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 3e81c832..703754b0 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -85,6 +85,36 @@ intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 +intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 +intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 +intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 +intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14 +intra8x9_ddl4: db 6, 7, 8, 
9,10,11,12,13, 8, 9,10,11,12,13,14,15 +intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13 +intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12 +intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14 +intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14 +intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12 +intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12 +intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10 +intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8 +intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2 +intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0 +intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15 +intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15 +pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003 + transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 @@ -2061,12 +2091,20 @@ cglobal intra_satd_x3_8x8c, 0,6 %macro PRED4x4_LOWPASS 5 +%ifid %5 + pavgb %5, %2, %3 + pxor %3, %2 + pand %3, [pb_1] + psubusb %5, %3 + pavgb %1, %4, %5 +%else mova %5, %2 pavgb %2, %3 pxor %3, %5 pand %3, [pb_1] psubusb %2, %3 pavgb %1, %4, %2 +%endif %endmacro %macro INTRA_X9_PRED 2 @@ -2530,7 +2568,372 @@ ALIGN 16 %endif ; ARCH %endmacro ; INTRA_X9 +;----------------------------------------------------------------------------- +; int 
intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) +;----------------------------------------------------------------------------- + +%macro INTRA8_X9 0 +cglobal intra_sad_x9_8x8, 5,6,9 + %define fenc02 m4 + %define fenc13 m5 + %define fenc46 m6 + %define fenc57 m7 +%ifdef ARCH_X86_64 + %define tmp m8 + %assign padbase 0x0 +%else + %define tmp [rsp] + %assign padbase 0x10 +%endif + %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15) + %define pred(i,j) [rsp+i*0x40+j*0x10+padbase] + + SUB rsp, pad + movq fenc02, [r0+FENC_STRIDE* 0] + movq fenc13, [r0+FENC_STRIDE* 1] + movq fenc46, [r0+FENC_STRIDE* 4] + movq fenc57, [r0+FENC_STRIDE* 5] + movhps fenc02, [r0+FENC_STRIDE* 2] + movhps fenc13, [r0+FENC_STRIDE* 3] + movhps fenc46, [r0+FENC_STRIDE* 6] + movhps fenc57, [r0+FENC_STRIDE* 7] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + +; v + movddup m0, [r2+16] + mova pred(0,0), m0 + psadbw m1, m0, fenc02 + mova pred(0,1), m0 + psadbw m2, m0, fenc13 + mova pred(0,2), m0 + psadbw m3, m0, fenc46 + mova pred(0,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+0], m0 + +; h + movq m0, [r2+7] + pshufb m1, m0, [off(intra8x9_h1)] + pshufb m2, m0, [off(intra8x9_h2)] + mova pred(1,0), m1 + psadbw m1, fenc02 + mova pred(1,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m3, m0, [off(intra8x9_h3)] + pshufb m2, m0, [off(intra8x9_h4)] + mova pred(1,2), m3 + psadbw m3, fenc46 + mova pred(1,3), m2 + psadbw m2, fenc57 + paddw m1, m3 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+2], m1 + + lea r5, [rsp+padbase+0x100] + %define pred(i,j) [r5+i*0x40+j*0x10-0x100] + +; dc + movhps m0, [r2+16] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + psadbw m1, m0, fenc02 + mova pred(2,1), m0 + psadbw m2, 
m0, fenc13 + mova pred(2,2), m0 + psadbw m3, m0, fenc46 + mova pred(2,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+4], m0 + +; ddl +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB +; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC +; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD +; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE +; Ft8 Ft9 FtA FtB FtC FtD FtE FtF + mova m0, [r2+16] + movu m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___ + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl2)] + mova pred(3,0), m1 + psadbw m1, fenc02 + mova pred(3,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl4)] + mova pred(3,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+6], m1 + +; vl +; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB + pshufb m1, m3, [off(intra8x9_vl1)] + pshufb m2, m0, [off(intra8x9_vl2)] + pshufb m3, m3, [off(intra8x9_vl3)] + pshufb m0, m0, [off(intra8x9_vl4)] + mova pred(7,0), m1 + psadbw m1, fenc02 + mova pred(7,1), m2 + psadbw m2, fenc13 + mova pred(7,2), m3 + psadbw m3, fenc46 + mova pred(7,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(sse4) + pextrw [r4+14], m0, 0 +%else + movd r5d, m0 + mov [r4+14], r5w + lea r5, [rsp+padbase+0x100] +%endif + +; ddr +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Flt Ft0 
Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 +; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 +; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 +; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 +; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt + movu m2, [r2+8] + movu m0, [r2+7] + movu m1, [r2+6] + pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr2)] + mova pred(4,0), m1 + psadbw m1, fenc02 + mova pred(4,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr4)] + mova pred(4,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+8], m1 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0] + +; vr +; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 +; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 +; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 +; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 + movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + psadbw m1, fenc02 + mova pred(5,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr2)] + mova pred(5,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr4)] + mova pred(5,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+10], m1 + +; hd +; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3 +; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1 +; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt +; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 +; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 
Gl1 Fl1 +; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 +; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 + pshufd m2, m3, q0001 +%if cpuflag(sse4) + pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___ +%else + movss m1, m0, m2 + SWAP 1, 2 +%endif + punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___ + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m2, [off(intra8x9_hd2)] + mova pred(6,0), m1 + psadbw m1, fenc02 + mova pred(6,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hd3)] + pshufb m3, m0, [off(intra8x9_hd4)] + mova pred(6,2), m2 + psadbw m2, fenc46 + mova pred(6,3), m3 + psadbw m3, fenc57 + paddw m1, m2 + paddw m1, m3 + movhlps m2, m1 + paddw m1, m2 + ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall + pslldq m1, 12 + SWAP 3, 1 + +; hu +; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 +; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 +; Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 Gl5 Fl6 +; Gl3 Fl4 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 +; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 +; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 +; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +%if cpuflag(sse4) + pinsrb m0, [r2+7], 15 ; Gl7 +%else + movd m1, [r2+7] + pslldq m0, 1 + palignr m1, m0, 1 + SWAP 0, 1 +%endif + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu2)] + mova pred(8,0), m1 + psadbw m1, fenc02 + mova pred(8,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hu3)] + pshufb m0, m0, [off(intra8x9_hu4)] + mova pred(8,2), m2 + psadbw m2, fenc46 + mova pred(8,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m1, m0 + movhlps m2, m1 + paddw m1, m2 + movd r2d, m1 + + movu m0, [r3] + por m3, [r4] + paddw m0, m3 + mova [r4], m0 + movzx r5d, word [r3+16] + add r2d, r5d + mov [r4+16], r2w +%if cpuflag(sse4) + phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl + movd eax, m0 +%else + ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index + paddusw 
m0, m0 + paddusw m0, m0 + paddw m0, [off(pw_s00112233)] + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + pminsw m0, m1 + movd eax, m0 + ; repack with 3 bit index + xor eax, 0x80008000 + movzx r3d, ax + shr eax, 15 + add r3d, r3d + or eax, 1 + cmp eax, r3d + cmovg eax, r3d + ; reverse to phminposuw order + mov r3d, eax + and eax, 7 + shr r3d, 3 + shl eax, 16 + or eax, r3d +%endif + add r2d, 8<<16 + cmp ax, r2w + cmovg eax, r2d + + mov r2d, eax + shr r2d, 16 + shl r2d, 6 + add r1, 4*FDEC_STRIDE + mova m0, [rsp+padbase+r2+0x00] + mova m1, [rsp+padbase+r2+0x10] + mova m2, [rsp+padbase+r2+0x20] + mova m3, [rsp+padbase+r2+0x30] + movq [r1+FDEC_STRIDE*-4], m0 + movhps [r1+FDEC_STRIDE*-2], m0 + movq [r1+FDEC_STRIDE*-3], m1 + movhps [r1+FDEC_STRIDE*-1], m1 + movq [r1+FDEC_STRIDE* 0], m2 + movhps [r1+FDEC_STRIDE* 2], m2 + movq [r1+FDEC_STRIDE* 1], m3 + movhps [r1+FDEC_STRIDE* 3], m3 + ADD rsp, pad + RET +%endmacro ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 ; out: [tmp]=hadamard4, m0=satd @@ -3018,6 +3421,7 @@ SA8D HADAMARD_AC_SSE2 %ifndef HIGH_BIT_DEPTH INTRA_X9 +INTRA8_X9 %endif %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups @@ -3036,6 +3440,7 @@ SA8D HADAMARD_AC_SSE2 %ifndef HIGH_BIT_DEPTH INTRA_X9 +INTRA8_X9 %endif INIT_XMM avx @@ -3044,6 +3449,7 @@ SA8D %ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 INTRA_X9 +INTRA8_X9 %endif HADAMARD_AC_SSE2 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index f5bc865b..a5de0c0a 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -127,6 +127,9 @@ int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, 
uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index d3497139..973233c0 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -668,6 +668,7 @@ cglobal predict_8x8_filter, 4,6,6 add t4d, r5d shr t4d, 2 mov [t1+7*SIZEOF_PIXEL], t4%1 + mov [t1+6*SIZEOF_PIXEL], t4%1 test r3b, 2 je .done .check_top: diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index d003596d..6e8e6cd5 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -873,6 +873,8 @@ AVX_INSTR minpd, 1, 0 AVX_INSTR minps, 1, 0 AVX_INSTR minsd, 1, 0 AVX_INSTR minss, 1, 0 +AVX_INSTR movsd, 1, 0 +AVX_INSTR movss, 1, 0 AVX_INSTR mpsadbw, 0, 1 AVX_INSTR mulpd, 1, 0 AVX_INSTR mulps, 1, 0 diff --git a/encoder/analyse.c b/encoder/analyse.c index 71bbe870..3f9a6123 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -94,7 +94,7 @@ typedef struct int i_satd_i8x8; int i_cbp_i8x8_luma; - int i_satd_i8x8_dir[12][4]; + ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] ); int i_predict8x8[4]; int i_satd_i4x4; @@ -844,6 +844,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( a->i_satd_i16x16 > i16x16_thresh ) return; + uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8; /* 8x8 prediction selection */ if( flags & X264_ANALYSE_I8x8 ) { @@ -870,53 +871,69 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx ); h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); - if( !h->mb.b_lossless && predict_mode[5] >= 0 ) + if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 ) { - int satd[9]; - h->pixf.intra_mbcmp_x3_8x8( p_src_by, 
edge, satd ); - int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; - satd[i_pred_mode] -= 3 * lambda; - for( int i = 2; i >= 0; i-- ) + /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */ + i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] ); + i_cost += i_best & 0xffff; + i_best >>= 16; + a->i_predict8x8[idx] = i_best; + if( idx == 3 || i_cost > i_satd_thresh ) + break; + x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best ); + } + else + { + if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int cost = satd[i]; - a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda; - COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i ); + int satd[9]; + h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); + int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; + satd[i_pred_mode] -= 3 * lambda; + for( int i = 2; i >= 0; i-- ) + { + int cost = satd[i]; + a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda; + COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i ); + } + + /* Take analysis shortcuts: don't analyse modes that are too + * far away direction-wise from the favored mode. */ + if( a->i_mbrd < 1 + a->b_fast_intra ) + predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; + else + predict_mode += 3; } - /* Take analysis shortcuts: don't analyse modes that are too - * far away direction-wise from the favored mode. 
*/ - if( a->i_mbrd < 1 + a->b_fast_intra ) - predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; - else - predict_mode += 3; - } + for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ ) + { + int i_satd; + int i_mode = *predict_mode; - for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ ) - { - int i_satd; - int i_mode = *predict_mode; + if( h->mb.b_lossless ) + x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge ); + else + h->predict_8x8[i_mode]( p_dst_by, edge ); - if( h->mb.b_lossless ) - x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge ); - else - h->predict_8x8[i_mode]( p_dst_by, edge ); + i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); + if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) + i_satd -= 3 * lambda; - i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); - if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) - i_satd -= 3 * lambda; + COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode ); + a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda; + } + i_cost += i_best + 3*lambda; - COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode ); - a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda; + if( idx == 3 || i_cost > i_satd_thresh ) + break; + if( h->mb.b_lossless ) + x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge ); + else + h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge ); + x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); } - i_cost += i_best + 3 * lambda; - - if( idx == 3 || i_cost > i_satd_thresh ) - break; - /* we need to encode this block now (for next ones) */ - x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge ); - - x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); + x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 ); } if( idx == 3 ) @@ -951,7 +968,6 @@ static 
void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ { int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */ int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX; - uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8; h->mb.i_cbp_luma = 0; if( a->b_early_terminate && a->i_mbrd ) @@ -1233,7 +1249,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE, h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE}; int cbp_luma_new = 0; - int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8 : COST_MAX; + int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX; i_best = COST_MAX64; @@ -1244,7 +1260,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; - if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh ) + if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh ) continue; h->mb.i_cbp_luma = a->i_cbp_i8x8_luma; diff --git a/encoder/encoder.c b/encoder/encoder.c index 9c4621ae..b8ab8e5d 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -964,6 +964,8 @@ static void mbcmp_init( x264_t *h ) h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL : satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4; + h->pixf.intra_mbcmp_x9_8x8 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL + : satd ? h->pixf.intra_sa8d_x9_8x8 : h->pixf.intra_sad_x9_8x8; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? 
h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 94149f37..28c4d02d 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -673,7 +673,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ ) { int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; - x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL ); + x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 ); } i_qp = h->mb.i_chroma_qp; } diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 4dc69401..f8c21496 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -140,7 +140,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_ } } -static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge ) +static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict ) { int x = idx&1; int y = idx>>1; @@ -150,16 +150,19 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_ ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); - if( !edge ) + if( b_predict ) { - h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] ); - edge = edge_buf; - } + if( !edge ) + { + h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] ); + edge = edge_buf; + } - if( h->mb.b_lossless ) - x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge ); - else - h->predict_8x8[i_mode]( p_dst, edge ); + if( h->mb.b_lossless ) + x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge ); + else + h->predict_8x8[i_mode]( p_dst, edge ); + } if( h->mb.b_lossless ) { diff --git a/encoder/rdo.c b/encoder/rdo.c index bbc0a3d2..ce723a35 100644 --- a/encoder/rdo.c +++ 
b/encoder/rdo.c @@ -276,7 +276,7 @@ static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, for( int p = 0; p < plane_count; p++ ) { - x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p] ); + x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 ); i_qp = h->mb.i_chroma_qp; } diff --git a/tools/checkasm.c b/tools/checkasm.c index 1450bfb0..3eaab658 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -498,6 +498,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \ { \ ok = 0; \ + fprintf( stderr, #name" [FAILED]\n" ); \ for( int j=0; j<16; j++ ) \ fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \ fprintf( stderr, "\n" ); \ @@ -509,6 +510,70 @@ static int check_pixel( int cpu_ref, int cpu_new ) } \ } +#define TEST_INTRA8_X9( name, cmp ) \ + if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ + { \ + set_func_name( #name ); \ + used_asm = 1; \ + ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \ + ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ) = {0}; \ + ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ) = {0}; \ + for( int i=0; i<17; i++ ) \ + bitcosts[i] = 9*(i!=8); \ + for( int i=0; i<32; i++ ) \ + { \ + pixel *fenc = pbuf1+48+i*12; \ + pixel *fdec1 = pbuf3+48+i*12; \ + pixel *fdec2 = pbuf4+48+i*12; \ + int pred_mode = i%9; \ + int res_c = INT_MAX; \ + predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \ + for( int j=0; j<9; j++ ) \ + { \ + predict_8x8[j]( fdec1, edge ); \ + satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \ + if( satds_c[j] < (uint16_t)res_c ) \ + res_c = satds_c[j] + (j<<16); \ + } \ + predict_8x8[res_c>>16]( fdec1, edge ); \ + int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \ + if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \ + { \ + ok = 0; \ + fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, 
res_a>>16, res_a&0xffff ); \ + for( int j = 0; j < 9; j++ ) \ + fprintf( stderr, "%5d ", satds_c[j]); \ + fprintf( stderr, "\n" ); \ + for( int j = 0; j < 9; j++ ) \ + fprintf( stderr, "%5d ", satds_a[j]); \ + fprintf( stderr, "\n" ); \ + break; \ + } \ + for( int j=0; j<8; j++ ) \ + if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \ + ok = 0; \ + if( !ok ) \ + { \ + fprintf( stderr, #name" [FAILED]\n" ); \ + for( int j=0; j<8; j++ ) \ + { \ + for( int k=0; k<8; k++ ) \ + fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \ + fprintf( stderr, "\n" ); \ + } \ + fprintf( stderr, "\n" ); \ + for( int j=0; j<8; j++ ) \ + { \ + for( int k=0; k<8; k++ ) \ + fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \ + fprintf( stderr, "\n" ); \ + } \ + fprintf( stderr, "\n" ); \ + break; \ + } \ + } \ + } + memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_satd_x3_16x16, 0 ); @@ -526,9 +591,11 @@ static int check_pixel( int cpu_ref, int cpu_new ) report( "intra sad_x3 :" ); ok = 1; used_asm = 0; TEST_INTRA_X9( intra_satd_x9_4x4, satd ); + TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d ); report( "intra satd_x9 :" ); ok = 1; used_asm = 0; TEST_INTRA_X9( intra_sad_x9_4x4, sad ); + TEST_INTRA8_X9( intra_sad_x9_8x8, sad ); report( "intra sad_x9 :" ); ok = 1; used_asm = 0; -- 2.40.0