From d94edd734304c16265f28dd11e8a2029cbdc5b7f Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sat, 13 Aug 2011 19:01:22 +0000 Subject: [PATCH] Optimize x86 intra_predict_4x4 and 8x8 High bit depth Penryn, Sandybridge cycles: 4x4_ddl: 11->10, 9-> 8 4x4_ddr: 15->13, 12->11 4x4_hd: , 15->12 4x4_hu: , 14->13 4x4_vr: 15->14, 14->12 8x8_ddl: 32->19, 19->14 8x8_ddr: 42->19, 21->14 8x8_hd: , 15->13 8x8_hu: 21->17, 16->12 8x8_vr: 33->19, 8-bit Penryn, Sandybridge cycles: 4x4_ddr: 24->15, 4x4_hd: 24->16, 4x4_hu: 23->15, 4x4_vr: 23->16, 4x4_vl: 10-> 9, 8x8_ddl: 23->15, 8x8_hd: , 17->14 8x8_hu: , 15->14 8x8_vr: 20->16, 17->13 --- common/osdep.h | 15 +- common/x86/const-a.asm | 1 - common/x86/predict-a.asm | 1216 ++++++++++++++++++++------------------ common/x86/predict-c.c | 29 +- common/x86/x86util.asm | 16 + encoder/analyse.c | 4 +- encoder/macroblock.c | 2 +- encoder/rdo.c | 2 +- tools/checkasm.c | 8 +- 9 files changed, 686 insertions(+), 607 deletions(-) diff --git a/common/osdep.h b/common/osdep.h index 6249dbe0..ad1ebe43 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -94,24 +94,27 @@ // - armcc can't either, but is nice enough to actually tell you so // - Apple gcc only maintains 4 byte alignment // - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils... + +#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\ + uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \ + type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask) + #if ARCH_ARM && SYS_MACOSX -#define ALIGNED_ARRAY_8( type, name, sub1, ... )\ - uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 7]; \ - type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+7) & ~7) +#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) #else #define ALIGNED_ARRAY_8( type, name, sub1, ... )\ ALIGNED_8( type name sub1 __VA_ARGS__ ) #endif #if ARCH_ARM -#define ALIGNED_ARRAY_16( type, name, sub1, ... )\ - uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 15];\ - type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+15) & ~15) +#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) #else #define ALIGNED_ARRAY_16( type, name, sub1, ... )\ ALIGNED_16( type name sub1 __VA_ARGS__ ) #endif +#define ALIGNED_ARRAY_32( ... 
) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) + #define UNINIT(x) x=x #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index 2f2492c3..78d1e555 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -59,5 +59,4 @@ const pd_ffff, times 4 dd 0xffff const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 -const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0 const sw_64, dd 64 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 0bfc2db2..c22f296b 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -39,6 +39,9 @@ pb_0s_ff: times 7 db 0 db 0xff shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0 +shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7 +pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 SECTION .text @@ -50,8 +53,7 @@ cextern pw_2 cextern pw_4 cextern pw_8 cextern pw_16 -cextern pw_ff00 -cextern pb_reverse +cextern pw_00ff cextern pw_pixel_max %macro STORE8x8 2 @@ -137,54 +139,43 @@ cextern pw_pixel_max ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -%macro PRED8x8_LOWPASS 5-6 -%ifidn %1, w - paddw %3, %4 - psrlw %3, 1 - pavgw %2, %5, %3 -%else - mova %6, %3 - pavgb %3, %4 - pxor %4, %6 - pand %4, [pb_1] - psubusb %3, %4 - pavgb %2, %5, %3 -%endif -%endmacro - -%macro LOAD_PLANE_ARGS 0 -%ifdef ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +%macro PRED8x8_LOWPASS 4-5 +%ifdef HIGH_BIT_DEPTH + paddw %2, %3 + psrlw %2, 1 + pavgw %1, %4, %2 %else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 + mova %5, %2 + pavgb %2, %3 + pxor %3, %5 + pand %3, [pb_1] + psubusb %2, %3 + pavgb %1, %4, %2 %endif %endmacro ;----------------------------------------------------------------------------- ; void predict_4x4_ddl( pixel *src ) ;----------------------------------------------------------------------------- -%macro PREDICT_4x4_DDL 3 +%macro PREDICT_4x4_DDL 0 cglobal predict_4x4_ddl, 1,1 movu m1, [r0-FDEC_STRIDEB] - psll%1 m2, m1, %2 - mova m3, m1 - mova m4, m1 + PSLLPIX m2, m1, 1 + mova m0, m1 +%ifdef HIGH_BIT_DEPTH + PSRLPIX m1, m1, 1 + pshufhw m1, m1, q2210 +%else pxor m1, m2 - psrl%1 m1, %2 - pxor m3, m1 - PRED8x8_LOWPASS %3, m0, m2, m3, m4, m5 + PSRLPIX m1, m1, 1 + pxor m1, m0 +%endif + PRED8x8_LOWPASS m0, m2, m1, m0, m3 %assign Y 0 %rep 4 - psrl%1 m0, %2 - movh [r0+Y*FDEC_STRIDEB], m0 + PSRLPIX m0, m0, 1 + movh [r0+Y*FDEC_STRIDEB], m0 %assign Y (Y+1) %endrep @@ -193,189 +184,218 @@ cglobal predict_4x4_ddl, 1,1 %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_4x4_DDL dq, 2, w +PREDICT_4x4_DDL INIT_XMM avx -PREDICT_4x4_DDL dq, 2, w +PREDICT_4x4_DDL INIT_MMX mmx2 cglobal predict_4x4_ddl, 1,2 - mova m1, [r0-2*FDEC_STRIDE+4] - mova m2, [r0-2*FDEC_STRIDE+0] - mova m3, [r0-2*FDEC_STRIDE+2] - PRED8x8_LOWPASS w, m0, m1, m2, m3 - mova [r0+0*FDEC_STRIDE], m0 - - mova m5, [r0-2*FDEC_STRIDE+6] - mova m6, [r0-2*FDEC_STRIDE+8] - pshufw m7, m6, q3321 - PRED8x8_LOWPASS w, m4, m7, m5, m6 - mova [r0+6*FDEC_STRIDE], m4 - - psllq m0, 16 - PALIGNR m4, m0, 6, m1 - mova [r0+4*FDEC_STRIDE], m4 - + movu m1, [r0-FDEC_STRIDEB+4] + PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2] + mova m3, [r0-FDEC_STRIDEB+8] + mova [r0+0*FDEC_STRIDEB], m0 + pshufw m4, m3, q3321 + PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3 + mova 
[r0+3*FDEC_STRIDEB], m2 + pshufw m1, m0, q0021 + punpckldq m1, m2 + mova [r0+1*FDEC_STRIDEB], m1 psllq m0, 16 - PALIGNR m4, m0, 6, m0 - mova [r0+2*FDEC_STRIDE], m4 + PALIGNR m2, m0, 6, m0 + mova [r0+2*FDEC_STRIDEB], m2 RET -%else +%else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 -PREDICT_4x4_DDL q, 8, b +PREDICT_4x4_DDL %endif +;----------------------------------------------------------------------------- +; void predict_4x4_vr( pixel *src ) +;----------------------------------------------------------------------------- +%ifndef HIGH_BIT_DEPTH +INIT_MMX ssse3 +cglobal predict_4x4_vr, 1,1 + movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 + mova m4, m1 + palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt + pavgb m4, m1 + palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0 + mova m0, m1 + palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1 + mova m2, m1 + palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2 + PRED8x8_LOWPASS m2, m0, m1, m2, m3 + pshufw m0, m2, 0 + psrlq m2, 16 + movd [r0+0*FDEC_STRIDEB], m4 + palignr m4, m0, 7 + movd [r0+1*FDEC_STRIDEB], m2 + psllq m0, 8 + movd [r0+2*FDEC_STRIDEB], m4 + palignr m2, m0, 7 + movd [r0+3*FDEC_STRIDEB], m2 + RET +%endif ; !HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- ; void predict_4x4_ddr( pixel *src ) ;----------------------------------------------------------------------------- -%macro PREDICT_4x4 6 +%macro PREDICT_4x4 4 cglobal predict_4x4_ddr, 1,1 - movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] - movq m2, [r0+0*FDEC_STRIDEB-8] %ifdef HIGH_BIT_DEPTH - movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] - punpckl%1 m2, m4 - movh m3, [r0-1*FDEC_STRIDEB] - punpckh%2 m1, m2 - PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1 - mova m1, m3 - movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4 - mova m2, m3 - movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4 + movu m2, [r0-1*FDEC_STRIDEB-8] + pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2 + pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1 + pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0 + movhps m3, [r0+3*FDEC_STRIDEB-8] +%else ; !HIGH_BIT_DEPTH + movd m0, [r0+2*FDEC_STRIDEB-4] + movd m1, [r0+0*FDEC_STRIDEB-4] + punpcklbw m0, [r0+1*FDEC_STRIDEB-4] + punpcklbw m1, [r0-1*FDEC_STRIDEB-4] + punpckhwd m0, m1 + movd m2, [r0-1*FDEC_STRIDEB] +%if cpuflag(ssse3) + palignr m2, m0, 4 %else - punpckh%1 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL] - movh m3, [r0-1*FDEC_STRIDEB] - punpckh%2 m1, m2 - PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1 - mova m1, m3 - PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4 - mova m2, m3 - PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4 + psllq m2, 32 + punpckhdq m0, m2 + SWAP 2, 0 %endif - PRED8x8_LOWPASS %4, m0, m3, m1, m2, m4 + movd m3, [r0+3*FDEC_STRIDEB-4] + psllq m3, 32 +%endif ; !HIGH_BIT_DEPTH + + PSRLPIX m1, m2, 1 + mova m0, m2 + PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3 + PRED8x8_LOWPASS m0, m2, m1, m0, m3 %assign Y 3 movh [r0+Y*FDEC_STRIDEB], m0 %rep 3 %assign Y (Y-1) - psrl%3 m0, %6 + PSRLPIX m0, m0, 1 movh [r0+Y*FDEC_STRIDEB], m0 %endrep RET -cglobal predict_4x4_vr, 1,1,6 - movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 - mova m5, m0 +;----------------------------------------------------------------------------- +; void predict_4x4_vr( pixel *src ) +;----------------------------------------------------------------------------- +cglobal predict_4x4_vr, 1,1 %ifdef HIGH_BIT_DEPTH - movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt - pavg%4 m5, 
m0 - movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0 - mova m1, m0 - movhps m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1 - mova m2, m0 - movhps m3, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] - PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2 + movu m1, [r0-1*FDEC_STRIDEB-8] + pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2 + pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1 + pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0 +%else ; !HIGH_BIT_DEPTH + movd m0, [r0+2*FDEC_STRIDEB-4] + movd m1, [r0+0*FDEC_STRIDEB-4] + punpcklbw m0, [r0+1*FDEC_STRIDEB-4] + punpcklbw m1, [r0-1*FDEC_STRIDEB-4] + punpckhwd m0, m1 + movd m1, [r0-1*FDEC_STRIDEB] +%if cpuflag(ssse3) + palignr m1, m0, 4 %else - PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt - pavg%4 m5, m0 - PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0 - mova m1, m0 - PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1 - mova m2, m0 - PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2 + psllq m1, 32 + punpckhdq m0, m1 + SWAP 1, 0 %endif - PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4 - psll%3 m1, m3, %6*6 - psrl%3 m3, %6*2 - movh [r0+0*FDEC_STRIDEB], m5 - movh [r0+1*FDEC_STRIDEB], m3 - PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2 - psll%3 m1, %6 - movh [r0+2*FDEC_STRIDEB], m5 - PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1 - movh [r0+3*FDEC_STRIDEB], m3 +%endif ; !HIGH_BIT_DEPTH + PSRLPIX m2, m1, 1 + PSRLPIX m0, m1, 2 + pavg%1 m4, m1, m2 + PSRLPIX m4, m4, 3 + PRED8x8_LOWPASS m2, m0, m1, m2, m3 + PSLLPIX m0, m2, 6 + PSRLPIX m2, m2, 2 + movh [r0+0*FDEC_STRIDEB], m4 + PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3 + movh [r0+1*FDEC_STRIDEB], m2 + PSLLPIX m0, m0, 1 + movh [r0+2*FDEC_STRIDEB], m4 + PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0 + movh [r0+3*FDEC_STRIDEB], m2 RET -cglobal predict_4x4_hd, 1,1,6 - movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt .. +;----------------------------------------------------------------------------- +; void predict_4x4_hd( pixel *src ) +;----------------------------------------------------------------------------- +cglobal predict_4x4_hd, 1,1 %ifdef HIGH_BIT_DEPTH - movh m1, [r0-1*FDEC_STRIDEB] - punpckl%5 m0, m1 ; t3 t2 t1 t0 lt .. .. .. - psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. .. - movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3 - movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] - punpckl%1 m1, m2 ; l2 l3 - movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1 - movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] - punpckl%1 m2, m3 ; l0 l1 + movu m1, [r0-1*FDEC_STRIDEB-8] + PSLLPIX m1, m1, 1 + pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3 + pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2 + pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1 + pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0 %else - punpckl%5 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. .. - psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. .. - movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3 - punpckh%1 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3 - movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1 - punpckh%1 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1 + movd m0, [r0-1*FDEC_STRIDEB-4] ; lt .. + punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. .. + PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. .. 
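; the left pixels are gathered next and packed beside lt/t0..t2, so a
; single pavg plus one PRED8x8_LOWPASS over the combined vector yields
; every hd output pixel, which is then sliced out a row at a time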
+ movd m1, [r0+3*FDEC_STRIDEB-4] ; l3 + punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3 + movd m2, [r0+1*FDEC_STRIDEB-4] ; l1 + punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1 + punpckh%3 m1, m2 ; l0 l1 l2 l3 + punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 %endif - punpckh%2 m1, m2 ; l0 l1 l2 l3 - punpckh%5 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 - psrl%3 m2, m1, %6 ; .. t2 t1 t0 lt l0 l1 l2 - psrl%3 m0, m1, %6*2 ; .. .. t2 t1 t0 lt l0 l1 - pavg%4 m5, m1, m2 - PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4 - punpckl%1 m5, m3 - psrl%3 m3, %6*4 + PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2 + PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1 + pavg%1 m5, m1, m2 + PRED8x8_LOWPASS m3, m1, m0, m2, m4 + punpckl%2 m5, m3 + PSRLPIX m3, m3, 4 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4 %assign Y 3 movh [r0+Y*FDEC_STRIDEB], m5 %rep 2 %assign Y (Y-1) - psrl%3 m5, %6*2 + PSRLPIX m5, m5, 2 movh [r0+Y*FDEC_STRIDEB], m5 %endrep movh [r0+0*FDEC_STRIDEB], m3 RET -%endmacro +%endmacro ; PREDICT_4x4 +;----------------------------------------------------------------------------- +; void predict_4x4_ddr( pixel *src ) +;----------------------------------------------------------------------------- %ifdef HIGH_BIT_DEPTH INIT_MMX mmx2 cglobal predict_4x4_ddr, 1,1 - movq m3, [r0+3*FDEC_STRIDEB-8] - psrlq m3, 48 - PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6 - PALIGNR m3, [r0+1*FDEC_STRIDEB-8], 6, m7 - movq m6, [r0+0*FDEC_STRIDEB-8] - PALIGNR m3, m6, 6, m5 - - movq m4, [r0-1*FDEC_STRIDEB-8] - movq m2, m3 - movq m1, m3 - PALIGNR m2, m4, 6, m5 - movq m1, m2 - psllq m1, 16 - PRED8x8_LOWPASS w, m0, m3, m1, m2 - pshufw m0, m0, q0123 + mova m0, [r0+1*FDEC_STRIDEB-8] + punpckhwd m0, [r0+0*FDEC_STRIDEB-8] + mova m3, [r0+3*FDEC_STRIDEB-8] + punpckhwd m3, [r0+2*FDEC_STRIDEB-8] + punpckhdq m3, m0 + + pshufw m0, m3, q3321 + pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3 + pshufw m1, m0, q3321 + PRED8x8_LOWPASS m0, m1, m3, m0 movq [r0+3*FDEC_STRIDEB], m0 movq m2, [r0-1*FDEC_STRIDEB-0] - movq m5, m2 - PALIGNR m5, m4, 6, m4 - movq m3, m5 - PALIGNR m5, m6, 6, m6 - PRED8x8_LOWPASS w, m1, m5, m2, m3 + pshufw m4, m2, q2100 + pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0 + movq m1, m4 + PALIGNR m4, m3, 6, m3 + PRED8x8_LOWPASS m1, m4, m2, m1 movq [r0+0*FDEC_STRIDEB], m1 - psllq m0, 16 - PALIGNR m1, m0, 6, m2 - movq [r0+1*FDEC_STRIDEB], m1 + pshufw m2, m0, q3321 + punpckldq m2, m1 psllq m0, 16 PALIGNR m1, m0, 6, m0 - movq [r0+2*FDEC_STRIDEB], m1 - psrlq m1, 16 + movq [r0+1*FDEC_STRIDEB], m1 + movq [r0+2*FDEC_STRIDEB], m2 movd [r0+3*FDEC_STRIDEB+4], m1 RET +;----------------------------------------------------------------------------- +; void predict_4x4_hd( pixel *src ) +;----------------------------------------------------------------------------- cglobal predict_4x4_hd, 1,1 mova m0, [r0+1*FDEC_STRIDEB-8] punpckhwd m0, [r0+0*FDEC_STRIDEB-8] @@ -383,46 +403,43 @@ cglobal predict_4x4_hd, 1,1 punpckhwd m1, [r0+2*FDEC_STRIDEB-8] punpckhdq m1, m0 mova m0, m1 - mova m4, m1 movu m3, [r0-1*FDEC_STRIDEB-2] + pshufw m4, m1, q0032 mova m7, m3 - punpckhdq m4, [r0-1*FDEC_STRIDEB-6] + punpckldq m4, m3 PALIGNR m3, m1, 2, m2 - PRED8x8_LOWPASS w, m2, m4, m1, m3, m6 + PRED8x8_LOWPASS m2, m4, m1, m3 pavgw m0, m3 - mova m5, m0 - punpcklwd m5, m2 - mova m4, m0 - punpckhwd m4, m2 + punpcklwd m5, m0, m2 + punpckhwd m4, m0, m2 mova [r0+3*FDEC_STRIDEB], m5 mova [r0+1*FDEC_STRIDEB], m4 + psrlq m5, 32 + punpckldq m5, m4 + mova [r0+2*FDEC_STRIDEB], m5 - mova m4, m7 + pshufw m4, m7, q2100 mova m6, [r0-1*FDEC_STRIDEB+0] - PALIGNR m7, [r0+0*FDEC_STRIDEB-8], 6, m5 - PRED8x8_LOWPASS w, m3, m7, m6, m4, m1 - - PALIGNR m3, m0, 6, m5 + 
pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0 + PRED8x8_LOWPASS m3, m4, m6, m7 + PALIGNR m3, m0, 6, m0 mova [r0+0*FDEC_STRIDEB], m3 - psrlq m0, 16 - psrlq m2, 16 - punpcklwd m0, m2 - mova [r0+2*FDEC_STRIDEB], m0 RET INIT_XMM sse2 -PREDICT_4x4 wd, dq, dq, w, qdq, 2 +PREDICT_4x4 w, wd, dq, qdq INIT_XMM ssse3 -PREDICT_4x4 wd, dq, dq, w, qdq, 2 +PREDICT_4x4 w, wd, dq, qdq INIT_XMM avx -PREDICT_4x4 wd, dq, dq, w, qdq, 2 -%else +PREDICT_4x4 w, wd, dq, qdq +%else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 -PREDICT_4x4 bw, wd, q , b, dq , 8 +PREDICT_4x4 b, bw, wd, dq INIT_MMX ssse3 -PREDICT_4x4 bw, wd, q , b, dq , 8 +%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64 +PREDICT_4x4 b, bw, wd, dq %endif ;----------------------------------------------------------------------------- @@ -431,78 +448,71 @@ PREDICT_4x4 bw, wd, q , b, dq , 8 %ifdef HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_hu_mmx2, 1,1 - movq m0, [r0+0*FDEC_STRIDEB-4*2] - punpckhwd m0, [r0+1*FDEC_STRIDEB-4*2] - movq m1, [r0+2*FDEC_STRIDEB-4*2] - punpckhwd m1, [r0+3*FDEC_STRIDEB-4*2] + movq m0, [r0+0*FDEC_STRIDEB-8] + punpckhwd m0, [r0+1*FDEC_STRIDEB-8] + movq m1, [r0+2*FDEC_STRIDEB-8] + punpckhwd m1, [r0+3*FDEC_STRIDEB-8] punpckhdq m0, m1 pshufw m1, m1, q3333 movq [r0+3*FDEC_STRIDEB], m1 - movd [r0+2*FDEC_STRIDEB+4], m1 - mova m2, m0 - psrlq m2, 16 - pavgw m2, m0 - - pshufw m1, m0, q3321 - pshufw m5, m0, q3332 - PRED8x8_LOWPASS w, m3, m0, m5, m1, m7 - movq m6, m2 - punpcklwd m6, m3 - mova [r0+0*FDEC_STRIDEB], m6 + pshufw m3, m0, q3321 + pshufw m4, m0, q3332 + pavgw m2, m0, m3 + PRED8x8_LOWPASS m3, m0, m4, m3 + punpcklwd m4, m2, m3 + mova [r0+0*FDEC_STRIDEB], m4 psrlq m2, 16 psrlq m3, 16 punpcklwd m2, m3 mova [r0+1*FDEC_STRIDEB], m2 - psrlq m2, 32 - movd [r0+2*FDEC_STRIDEB+0], m2 + punpckhdq m2, m1 + mova [r0+2*FDEC_STRIDEB], m2 RET %else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_hu_mmx2, 1,1 - movq mm0, [r0+0*FDEC_STRIDE-8] - punpckhbw mm0, [r0+1*FDEC_STRIDE-8] - movq mm1, [r0+2*FDEC_STRIDE-8] - punpckhbw mm1, [r0+3*FDEC_STRIDE-8] - punpckhwd mm0, mm1 - movq mm1, mm0 - punpckhbw mm1, mm1 - pshufw mm1, mm1, q3333 - punpckhdq mm0, mm1 - movq mm2, mm0 - movq mm3, mm0 - movq mm7, mm0 - psrlq mm2, 16 - psrlq mm3, 8 - pavgb mm7, mm3 - PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5 - punpcklbw mm7, mm4 -%assign Y 0 - movd [r0+Y*FDEC_STRIDE], mm7 -%rep 2 -%assign Y (Y+1) - psrlq mm7, 16 - movd [r0+Y*FDEC_STRIDE], mm7 -%endrep - movd [r0+3*FDEC_STRIDE], mm1 + movd m1, [r0+0*FDEC_STRIDEB-4] + punpcklbw m1, [r0+1*FDEC_STRIDEB-4] + movd m0, [r0+2*FDEC_STRIDEB-4] + punpcklbw m0, [r0+3*FDEC_STRIDEB-4] + punpckhwd m1, m0 + movq m0, m1 + punpckhbw m1, m1 + pshufw m1, m1, q3333 + punpckhdq m0, m1 + movq m2, m0 + movq m3, m0 + movq m5, m0 + psrlq m3, 8 + psrlq m2, 16 + pavgb m5, m3 + PRED8x8_LOWPASS m3, m0, m2, m3, m4 + movd [r0+3*FDEC_STRIDEB], m1 + punpcklbw m5, m3 + movd [r0+0*FDEC_STRIDEB], m5 + psrlq m5, 16 + movd [r0+1*FDEC_STRIDEB], m5 + psrlq m5, 16 + movd [r0+2*FDEC_STRIDEB], m5 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_4x4_vl( pixel *src ) ;----------------------------------------------------------------------------- -%macro PREDICT_4x4_V1 3 -cglobal predict_4x4_vl, 1,1,6 +%macro PREDICT_4x4_V1 1 +cglobal predict_4x4_vl, 1,1 movu m1, [r0-FDEC_STRIDEB] - psrl%1 m3, m1, %2 - psrl%1 m2, m1, %2*2 - pavg%3 m4, m3, m1 - PRED8x8_LOWPASS %3, m0, m1, m2, m3, m5 + PSRLPIX m3, m1, 1 + PSRLPIX m2, m1, 2 + pavg%1 m4, m3, m1 + PRED8x8_LOWPASS m0, m1, m2, m3, m5 movh [r0+0*FDEC_STRIDEB], m4 
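; vl: even rows are the 2-tap averages (t[n]+t[n+1]+1)>>1, odd rows the
; 3-tap lowpass values, with rows 2/3 advanced by one pixel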
movh [r0+1*FDEC_STRIDEB], m0 - psrl%1 m4, %2 - psrl%1 m0, %2 + PSRLPIX m4, m4, 1 + PSRLPIX m0, m0, 1 movh [r0+2*FDEC_STRIDEB], m4 movh [r0+3*FDEC_STRIDEB], m0 RET @@ -510,25 +520,23 @@ cglobal predict_4x4_vl, 1,1,6 %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_4x4_V1 dq, 2, w -%ifdef ARCH_X86_64 +PREDICT_4x4_V1 w INIT_XMM avx -PREDICT_4x4_V1 dq, 2, w -%endif +PREDICT_4x4_V1 w INIT_MMX mmx2 cglobal predict_4x4_vl, 1,4 mova m1, [r0-FDEC_STRIDEB+0] mova m2, [r0-FDEC_STRIDEB+8] - mova m3, m2 - PALIGNR m2, m1, 4, m6 - PALIGNR m3, m1, 2, m5 - mova m4, m3 - pavgw m4, m1 - mova [r0+0*FDEC_STRIDEB], m4 - psrlq m4, 16 - mova [r0+2*FDEC_STRIDEB], m4 - PRED8x8_LOWPASS w, m0, m1, m2, m3, m6 + mova m0, m2 + PALIGNR m2, m1, 4, m4 + PALIGNR m0, m1, 2, m4 + mova m3, m0 + pavgw m3, m1 + mova [r0+0*FDEC_STRIDEB], m3 + psrlq m3, 16 + mova [r0+2*FDEC_STRIDEB], m3 + PRED8x8_LOWPASS m0, m1, m2, m0 mova [r0+1*FDEC_STRIDEB], m0 psrlq m0, 16 mova [r0+3*FDEC_STRIDEB], m0 @@ -544,9 +552,9 @@ cglobal predict_4x4_vl, 1,4 mov [r0+2*FDEC_STRIDEB+6], r1w mov [r0+3*FDEC_STRIDEB+6], r3w RET -%else +%else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 -PREDICT_4x4_V1 q, 8, b +PREDICT_4x4_V1 b %endif ;----------------------------------------------------------------------------- @@ -572,31 +580,31 @@ cglobal predict_4x4_dc_mmx2, 1,1 mova [r0+3*FDEC_STRIDEB], m0 RET -%else +%else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_dc_mmx2, 1,4 pxor mm7, mm7 - movd mm0, [r0-FDEC_STRIDE] + movd mm0, [r0-FDEC_STRIDEB] psadbw mm0, mm7 movd r3d, mm0 movzx r1d, byte [r0-1] -%assign n 1 +%assign Y 1 %rep 3 - movzx r2d, byte [r0+FDEC_STRIDE*n-1] + movzx r2d, byte [r0+FDEC_STRIDEB*Y-1] add r1d, r2d -%assign n n+1 +%assign Y Y+1 %endrep lea r1d, [r1+r3+4] shr r1d, 3 imul r1d, 0x01010101 - mov [r0+FDEC_STRIDE*0], r1d - mov [r0+FDEC_STRIDE*1], r1d - mov [r0+FDEC_STRIDE*2], r1d - mov [r0+FDEC_STRIDE*3], r1d + mov [r0+FDEC_STRIDEB*0], r1d + mov [r0+FDEC_STRIDEB*1], r1d + mov [r0+FDEC_STRIDEB*2], r1d + mov [r0+FDEC_STRIDEB*3], r1d RET %endif ; HIGH_BIT_DEPTH -%macro PREDICT_FILTER 5 +%macro PREDICT_FILTER 4 ;----------------------------------------------------------------------------- ;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters ) ;----------------------------------------------------------------------------- @@ -629,12 +637,10 @@ cglobal predict_8x8_filter, 4,6,6 punpckh%3%4 m3, m1 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL] mova m1, [src-1*FDEC_STRIDEB] - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0 - PALIGNR m1, m2, 1*SIZEOF_PIXEL, m2 - PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5 - mova [t1+8*SIZEOF_PIXEL], m2 + PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0 + PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2 + PRED8x8_LOWPASS m3, m1, m4, m3, m5 + mova [t1+8*SIZEOF_PIXEL], m3 movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL] movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL] lea t4d, [t4*3+2] @@ -661,7 +667,7 @@ INIT_XMM cpuname psrldq m1, m3, 15 PALIGNR m2, m3, m0, 15, m0 PALIGNR m1, m3, 1, m5 - PRED8x8_LOWPASS %1, m0, m2, m1, m3, m5 + PRED8x8_LOWPASS m0, m2, m1, m3, m5 mova [t1+16*SIZEOF_PIXEL], m0 psrldq m0, 15 movd [t1+32*SIZEOF_PIXEL], m0 @@ -682,23 +688,21 @@ INIT_XMM cpuname .do_top: PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0 PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5 - PRED8x8_LOWPASS %1, m4, m2, m0, m3, m5 + PRED8x8_LOWPASS m4, m2, m0, m3, m5 mova [t1+16*SIZEOF_PIXEL], m4 test r3b, 4 je .done - mova m2, m1 - mova m4, m1 - psrl%4 m5, m1, 7*%5 - PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3 - PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4 - 
PRED8x8_LOWPASS %1, m0, m2, m5, m1, m4 + PSRLPIX m5, m1, 7 + PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3 + PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4 + PRED8x8_LOWPASS m0, m2, m5, m1, m4 mova [t1+24*SIZEOF_PIXEL], m0 - psrl%4 m0, 7*%5 + PSRLPIX m0, m0, 7 movd [t1+32*SIZEOF_PIXEL], m0 .done: REP_RET .fix_lt_2: - psll%4 m0, m3, 7*%5 + PSLLPIX m0, m3, 7 test r2b, 4 jne .do_top .fix_tr_1: @@ -710,16 +714,16 @@ INIT_XMM cpuname %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_FILTER w, d, q, dq, 2 +PREDICT_FILTER w, d, q, dq INIT_XMM ssse3 -PREDICT_FILTER w, d, q, dq, 2 +PREDICT_FILTER w, d, q, dq INIT_XMM avx -PREDICT_FILTER w, d, q, dq, 2 +PREDICT_FILTER w, d, q, dq %else INIT_MMX mmx2 -PREDICT_FILTER b, w, d, q , 8 +PREDICT_FILTER b, w, d, q INIT_MMX ssse3 -PREDICT_FILTER b, w, d, q , 8 +PREDICT_FILTER b, w, d, q %endif ;----------------------------------------------------------------------------- @@ -749,12 +753,12 @@ cglobal predict_8x8_h, 2,2 add r0, 4*FDEC_STRIDEB punpckl%1 m2, m1, m1 punpckh%1 m1, m1 -%assign n 0 +%assign Y 0 %rep 8 -%assign i 1+n/4 - SPLAT%2 m0, m %+ i, (3-n)&3 - mova [r0+(n-4)*FDEC_STRIDEB], m0 -%assign n n+1 +%assign i 1+Y/4 + SPLAT%2 m0, m %+ i, (3-Y)&3 + mova [r0+(Y-4)*FDEC_STRIDEB], m0 +%assign Y Y+1 %endrep RET %endmacro @@ -782,7 +786,7 @@ cglobal predict_8x8_dc_sse2, 2,2 STORE8x8 m0, m0 REP_RET -%else +%else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_8x8_dc_mmx2, 2,2 pxor mm0, mm0 @@ -817,7 +821,7 @@ INIT_XMM PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu -%else +%else ; !HIGH_BIT_DEPTH %macro PREDICT_8x8_DC 2 cglobal %1, 2,2 pxor mm0, mm0 @@ -836,113 +840,132 @@ PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7 ; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe ; size on the 8-bit mmx functions below if we know sse2 is available. 
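All of the variants below lean on PRED8x8_LOWPASS, i.e. the 3-tap filter (l + 2*c + r + 2) >> 2 noted above. As a reference only (not part of the patch), here is a scalar C sketch of that filter and of the two-pavgb route the 8-bit code takes to reach the same value; pix_t stands in for x264's pixel type:

    #include <stdint.h>

    typedef uint16_t pix_t; /* stand-in for x264's pixel (uint8_t or uint16_t) */

    /* (l + 2*c + r + 2) >> 2 -- the filter PRED8x8_LOWPASS implements */
    static inline pix_t lowpass( pix_t l, pix_t c, pix_t r )
    {
        return (pix_t)((l + 2*c + r + 2) >> 2);
    }

    /* 8-bit SIMD route: pavgb rounds up, so subtract (l^r)&1 to get the
     * floored (l+r)>>1, then a second average with c gives the same result. */
    static inline uint8_t lowpass_pavg( uint8_t l, uint8_t c, uint8_t r )
    {
        uint8_t half = (uint8_t)(((l + r + 1) >> 1) - ((l ^ r) & 1)); /* (l+r)>>1 */
        return (uint8_t)((c + half + 1) >> 1);
    }

The pxor/pand/psubusb triplet in the macro is the vector form of that (l^r)&1 rounding correction.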
-%macro PREDICT_8x8 3 +%macro PREDICT_8x8_DDLR 0 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl, 2,2,8 - mova m5, [r1+16*SIZEOF_PIXEL] - movu m2, [r1+17*SIZEOF_PIXEL] - movu m3, [r1+23*SIZEOF_PIXEL] - movu m4, [r1+25*SIZEOF_PIXEL] - psll%2 m1, m5, %3 +cglobal predict_8x8_ddl, 2,2,7 + mova m0, [r1+16*SIZEOF_PIXEL] + mova m1, [r1+24*SIZEOF_PIXEL] +%if cpuflag(cache64) + movd m5, [r1+32*SIZEOF_PIXEL] + palignr m3, m1, m0, 1*SIZEOF_PIXEL + palignr m5, m5, m1, 1*SIZEOF_PIXEL + palignr m4, m1, m0, 7*SIZEOF_PIXEL +%else + movu m3, [r1+17*SIZEOF_PIXEL] + movu m4, [r1+23*SIZEOF_PIXEL] + movu m5, [r1+25*SIZEOF_PIXEL] +%endif + PSLLPIX m2, m0, 1 add r0, FDEC_STRIDEB*4 - PRED8x8_LOWPASS %1, m0, m1, m2, m5, m7 -%assign %%bak avx_enabled -%assign avx_enabled 0 - PRED8x8_LOWPASS %1, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6 -%assign avx_enabled %%bak -%assign Y 3 + PRED8x8_LOWPASS m0, m2, m3, m0, m6 + PRED8x8_LOWPASS m1, m4, m5, m1, m6 + mova [r0+3*FDEC_STRIDEB], m1 +%assign Y 2 %rep 6 + PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 + PSLLPIX m0, m0, 1 mova [r0+Y*FDEC_STRIDEB], m1 - psll%2 m1, %3 - psrl%2 m2, m0, 7*%3 - psll%2 m0, %3 - por m1, m2 %assign Y (Y-1) %endrep - mova [r0+Y*FDEC_STRIDEB], m1 - psll%2 m1, %3 - psrl%2 m0, 7*%3 - por m1, m0 -%assign Y (Y-1) + PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 mova [r0+Y*FDEC_STRIDEB], m1 RET ;----------------------------------------------------------------------------- ; void predict_8x8_ddr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -%if avx_enabled == 0 cglobal predict_8x8_ddr, 2,2,7 - movu m1, [r1+ 7*SIZEOF_PIXEL] - movu m2, [r1+ 9*SIZEOF_PIXEL] - movu m3, [r1+15*SIZEOF_PIXEL] - movu m4, [r1+17*SIZEOF_PIXEL] add r0, FDEC_STRIDEB*4 - PRED8x8_LOWPASS %1, m0, m1, m2, [r1+ 8*SIZEOF_PIXEL], m5 - PRED8x8_LOWPASS %1, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6 -%assign Y 3 + mova m0, [r1+ 8*SIZEOF_PIXEL] + mova m1, [r1+16*SIZEOF_PIXEL] + ; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit + movu m2, [r1+ 7*SIZEOF_PIXEL] + movu m5, [r1+17*SIZEOF_PIXEL] +%if cpuflag(cache64) + palignr m3, m1, m0, 1*SIZEOF_PIXEL + palignr m4, m1, m0, 7*SIZEOF_PIXEL +%else + movu m3, [r1+ 9*SIZEOF_PIXEL] + movu m4, [r1+15*SIZEOF_PIXEL] +%endif + PRED8x8_LOWPASS m0, m2, m3, m0, m6 + PRED8x8_LOWPASS m1, m4, m5, m1, m6 + mova [r0+3*FDEC_STRIDEB], m0 +%assign Y -4 %rep 6 - mova [r0+Y*FDEC_STRIDEB], m0 - psrl%2 m0, %3 - psll%2 m2, m1, 7*%3 - psrl%2 m1, %3 - por m0, m2 -%assign Y (Y-1) + PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 + PSLLPIX m0, m0, 1 + mova [r0+Y*FDEC_STRIDEB], m1 +%assign Y (Y+1) %endrep - mova [r0+Y*FDEC_STRIDEB], m0 - psrl%2 m0, %3 - psll%2 m1, 7*%3 - por m0, m1 -%assign Y (Y-1) - mova [r0+Y*FDEC_STRIDEB], m0 + PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 + mova [r0+Y*FDEC_STRIDEB], m1 RET -%endif -%endmacro ; PREDICT_8x8 +%endmacro ; PREDICT_8x8_DDLR %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_8x8 w, dq, 2 -INIT_XMM avx -PREDICT_8x8 w, dq, 2 +PREDICT_8x8_DDLR +INIT_XMM ssse3 +PREDICT_8x8_DDLR +INIT_XMM ssse3, cache64 +PREDICT_8x8_DDLR %elifndef ARCH_X86_64 INIT_MMX mmx2 -PREDICT_8x8 b, q , 8 +PREDICT_8x8_DDLR %endif ;----------------------------------------------------------------------------- ; void predict_8x8_hu( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -%macro 
PREDICT_8x8_HU 5 +%macro PREDICT_8x8_HU 2 cglobal predict_8x8_hu, 2,2,8 - movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 add r0, 4*FDEC_STRIDEB - pshuf%3 m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 - psll%2 m1, 7*%5 ; l7 .. .. .. .. .. .. .. +%ifdef HIGH_BIT_DEPTH +%if cpuflag(ssse3) + movu m5, [r1+7*SIZEOF_PIXEL] + pshufb m5, [pw_reverse] +%else + movq m6, [r1+7*SIZEOF_PIXEL] + movq m5, [r1+11*SIZEOF_PIXEL] + pshuflw m6, m6, q0123 + pshuflw m5, m5, q0123 + movlhps m5, m6 +%endif ; cpuflag + psrldq m2, m5, 2 + pshufd m3, m5, q0321 + pshufhw m2, m2, q2210 + pshufhw m3, m3, q1110 + pavgw m4, m5, m2 +%else ; !HIGH_BIT_DEPTH + movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 + pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 + psllq m1, 56 ; l7 .. .. .. .. .. .. .. mova m2, m0 - psll%3 m0, 8*SIZEOF_PIXEL - psrl%3 m2, 8*SIZEOF_PIXEL - por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 + psllw m0, 8 + psrlw m2, 8 + por m2, m0 + mova m3, m2 mova m4, m2 - mova m5, m2 - psrl%2 m3, m2, 2*%5 - psrl%2 m2, %5 + mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0 + psrlq m3, 16 + psrlq m2, 8 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1 - punpckh%4 m1, m1 + punpckhbw m1, m1 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2 - pavg%1 m4, m2 - PRED8x8_LOWPASS %1, m1, m3, m5, m2, m6 - punpckh%4 m5, m4, m1 ; p8 p7 p6 p5 - punpckl%4 m4, m1 ; p4 p3 p2 p1 - mova m6, m5 - mova m7, m5 - mova m0, m5 - PALIGNR m5, m4, 2*SIZEOF_PIXEL, m1 - pshuf%3 m1, m6, q3321 - PALIGNR m6, m4, 4*SIZEOF_PIXEL, m2 - pshuf%3 m2, m7, q3332 - PALIGNR m7, m4, 6*SIZEOF_PIXEL, m3 - pshuf%3 m3, m0, q3333 + pavgb m4, m2 +%endif ; !HIGH_BIT_DEPTH + PRED8x8_LOWPASS m2, m3, m5, m2, m6 + punpckh%2 m0, m4, m2 ; p8 p7 p6 p5 + punpckl%2 m4, m2 ; p4 p3 p2 p1 + PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3 + pshuf%1 m1, m0, q3321 + PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3 + pshuf%1 m2, m0, q3332 + PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3 + pshuf%1 m3, m0, q3333 mova [r0-4*FDEC_STRIDEB], m4 mova [r0-3*FDEC_STRIDEB], m5 mova [r0-2*FDEC_STRIDEB], m6 @@ -956,62 +979,79 @@ cglobal predict_8x8_hu, 2,2,8 %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_8x8_HU w, dq, d, wd, 2 +PREDICT_8x8_HU d, wd INIT_XMM ssse3 -PREDICT_8x8_HU w, dq, d, wd, 2 +PREDICT_8x8_HU d, wd INIT_XMM avx -PREDICT_8x8_HU w, dq, d, wd, 2 +PREDICT_8x8_HU d, wd %elifndef ARCH_X86_64 INIT_MMX mmx2 -PREDICT_8x8_HU b, q , w, bw, 8 +PREDICT_8x8_HU w, bw %endif ;----------------------------------------------------------------------------- ; void predict_8x8_vr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -%macro PREDICT_8x8_VR 3 -cglobal predict_8x8_vr, 2,3,7 +%macro PREDICT_8x8_VR 1 +cglobal predict_8x8_vr, 2,3 mova m2, [r1+16*SIZEOF_PIXEL] +%ifidn cpuname, ssse3 + mova m0, [r1+8*SIZEOF_PIXEL] + palignr m3, m2, m0, 7*SIZEOF_PIXEL + palignr m1, m2, m0, 6*SIZEOF_PIXEL +%else movu m3, [r1+15*SIZEOF_PIXEL] movu m1, [r1+14*SIZEOF_PIXEL] +%endif pavg%1 m4, m3, m2 add r0, FDEC_STRIDEB*4 - PRED8x8_LOWPASS %1, m0, m1, m2, m3, m5 + PRED8x8_LOWPASS m3, m1, m2, m3, m5 mova [r0-4*FDEC_STRIDEB], m4 - mova [r0-3*FDEC_STRIDEB], m0 - mova m5, m0 - mova m6, m4 + mova [r0-3*FDEC_STRIDEB], m3 mova m1, [r1+8*SIZEOF_PIXEL] - mova m2, m1 - psll%2 m2, %3 - mova m3, m1 - psll%2 m3, 2*%3 - PRED8x8_LOWPASS %1, m0, m1, m3, m2, m4 + PSLLPIX m0, m1, 1 + PSLLPIX m2, m1, 2 + PRED8x8_LOWPASS m0, m1, m2, m0, m6 %assign Y -2 %rep 5 - %assign i (5 + ((Y+3)&1)) - PALIGNR m %+ i, m0, 7*SIZEOF_PIXEL, m2 - mova [r0+Y*FDEC_STRIDEB], m %+ i - psll%2 m0, %3 + PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5 + mova [r0+Y*FDEC_STRIDEB], m4 + 
PSLLPIX m0, m0, 1 + SWAP 3, 4 %assign Y (Y+1) %endrep - PALIGNR m5, m0, 7*SIZEOF_PIXEL, m0 - mova [r0+Y*FDEC_STRIDEB], m5 + PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0 + mova [r0+Y*FDEC_STRIDEB], m4 RET %endmacro %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_8x8_VR w, dq, 2 +PREDICT_8x8_VR w INIT_XMM ssse3 -PREDICT_8x8_VR w, dq, 2 +PREDICT_8x8_VR w INIT_XMM avx -PREDICT_8x8_VR w, dq, 2 +PREDICT_8x8_VR w %elifndef ARCH_X86_64 INIT_MMX mmx2 -PREDICT_8x8_VR b, q , 8 +PREDICT_8x8_VR b +%endif + +%macro LOAD_PLANE_ARGS 0 +%ifdef ARCH_X86_64 + movd mm0, r1d + movd mm2, r2d + movd mm4, r3d + pshufw mm0, mm0, 0 + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else + pshufw mm0, r1m, 0 + pshufw mm2, r2m, 0 + pshufw mm4, r3m, 0 %endif +%endmacro ;----------------------------------------------------------------------------- ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) @@ -1216,107 +1256,122 @@ PREDICT_16x16_P ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddl, 2,2 - movdqa xmm3, [r1+16] - movdqu xmm2, [r1+17] - pslldq xmm1, xmm3, 1 - add r0, FDEC_STRIDE*4 - PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4 + mova m0, [r1+16] +%ifidn cpuname, ssse3 + movd m2, [r1+32] + palignr m2, m0, 1 +%else + movu m2, [r1+17] +%endif + pslldq m1, m0, 1 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS m0, m1, m2, m0, m3 %assign Y -4 %rep 8 - psrldq xmm0, 1 - movq [r0+Y*FDEC_STRIDE], xmm0 + psrldq m0, 1 + movq [r0+Y*FDEC_STRIDE], m0 %assign Y (Y+1) %endrep RET +%ifnidn cpuname, ssse3 ;----------------------------------------------------------------------------- ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddr, 2,2 - movdqu xmm3, [r1+8] - movdqu xmm1, [r1+7] - psrldq xmm2, xmm3, 1 - add r0, FDEC_STRIDE*4 - PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4 + movu m0, [r1+8] + movu m1, [r1+7] + psrldq m2, m0, 1 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS m0, m1, m2, m0, m3 - psrldq xmm1, xmm0, 1 + psrldq m1, m0, 1 %assign Y 3 %rep 3 - movq [r0+Y*FDEC_STRIDE], xmm0 - movq [r0+(Y-1)*FDEC_STRIDE], xmm1 - psrldq xmm0, 2 - psrldq xmm1, 2 + movq [r0+Y*FDEC_STRIDE], m0 + movq [r0+(Y-1)*FDEC_STRIDE], m1 + psrldq m0, 2 + psrldq m1, 2 %assign Y (Y-2) %endrep - movq [r0-3*FDEC_STRIDE], xmm0 - movq [r0-4*FDEC_STRIDE], xmm1 + movq [r0-3*FDEC_STRIDE], m0 + movq [r0-4*FDEC_STRIDE], m1 RET ;----------------------------------------------------------------------------- ; void predict_8x8_vl( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_vl, 2,2 - movdqa xmm4, [r1+16] - pslldq xmm1, xmm4, 1 - psrldq xmm2, xmm4, 1 - pavgb xmm3, xmm4, xmm2 - add r0, FDEC_STRIDE*4 - PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5 -; xmm0: (t0 + 2*t1 + t2 + 2) >> 2 -; xmm3: (t0 + t1 + 1) >> 1 + mova m0, [r1+16] + pslldq m1, m0, 1 + psrldq m2, m0, 1 + pavgb m3, m0, m2 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS m0, m1, m2, m0, m5 +; m0: (t0 + 2*t1 + t2 + 2) >> 2 +; m3: (t0 + t1 + 1) >> 1 %assign Y -4 %rep 3 - psrldq xmm0, 1 - movq [r0+ Y *FDEC_STRIDE], xmm3 - movq [r0+(Y+1)*FDEC_STRIDE], xmm0 - psrldq xmm3, 1 + psrldq m0, 1 + movq [r0+ Y *FDEC_STRIDE], m3 + movq [r0+(Y+1)*FDEC_STRIDE], m0 + psrldq m3, 1 %assign Y (Y+2) %endrep - psrldq xmm0, 1 - movq [r0+ Y *FDEC_STRIDE], xmm3 - movq [r0+(Y+1)*FDEC_STRIDE], xmm0 - + psrldq m0, 1 + movq [r0+ Y *FDEC_STRIDE], m3 + movq 
[r0+(Y+1)*FDEC_STRIDE], m0 RET +%endif ; !ssse3 ;----------------------------------------------------------------------------- ; void predict_8x8_vr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_vr, 2,2,7 - movdqu xmm0, [r1+8] - movdqa xmm6, [pw_ff00] +cglobal predict_8x8_vr, 2,2 + movu m2, [r1+8] add r0, 4*FDEC_STRIDE - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - pslldq xmm1, xmm0, 2 - pslldq xmm0, 1 - pavgb xmm2, xmm0 - PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5 - pandn xmm6, xmm4 - movdqa xmm5, xmm4 - psrlw xmm4, 8 - packuswb xmm6, xmm4 - movhlps xmm4, xmm6 - movhps [r0-3*FDEC_STRIDE], xmm5 - movhps [r0-4*FDEC_STRIDE], xmm2 - psrldq xmm5, 4 - movss xmm5, xmm6 - psrldq xmm2, 4 - movss xmm2, xmm4 -%assign Y 3 -%rep 3 - psrldq xmm5, 1 - psrldq xmm2, 1 - movq [r0+Y*FDEC_STRIDE], xmm5 - movq [r0+(Y-1)*FDEC_STRIDE], xmm2 -%assign Y (Y-2) -%endrep + pslldq m1, m2, 2 + pslldq m0, m2, 1 + pavgb m3, m2, m0 + PRED8x8_LOWPASS m0, m2, m1, m0, m4 + movhps [r0-4*FDEC_STRIDE], m3 + movhps [r0-3*FDEC_STRIDE], m0 +%if cpuflag(ssse3) + movhlps m3, m3 + pshufb m0, [shuf_vr] + palignr m3, m0, 13 +%else + mova m2, m0 + mova m1, [pw_00ff] + pand m1, m0 + psrlw m0, 8 + packuswb m1, m0 + pslldq m1, 4 + movhlps m3, m1 + shufps m1, m2, q3210 + psrldq m3, 5 + psrldq m1, 5 + SWAP 0, 1 +%endif + movq [r0+3*FDEC_STRIDE], m0 + movq [r0+2*FDEC_STRIDE], m3 + psrldq m0, 1 + psrldq m3, 1 + movq [r0+1*FDEC_STRIDE], m0 + movq [r0+0*FDEC_STRIDE], m3 + psrldq m0, 1 + psrldq m3, 1 + movq [r0-1*FDEC_STRIDE], m0 + movq [r0-2*FDEC_STRIDE], m3 RET %endmacro ; PREDICT_8x8 INIT_XMM sse2 PREDICT_8x8 +INIT_XMM ssse3 +PREDICT_8x8 INIT_XMM avx PREDICT_8x8 @@ -1325,117 +1380,96 @@ PREDICT_8x8 ;----------------------------------------------------------------------------- ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -%macro PREDICT_8x8_HD 4 -cglobal predict_8x8_hd, 2,2,8 +%macro PREDICT_8x8_HD 2 +cglobal predict_8x8_hd, 2,2 add r0, 4*FDEC_STRIDEB - mova m0, [r1] ; l7 .. .. .. .. .. .. .. - mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 + mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 + movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 +%ifidn cpuname, ssse3 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0 - mova m3, m1 ; lt l0 l1 l2 l3 l4 l5 l6 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0 - PALIGNR m2, m1, 7*SIZEOF_PIXEL, m5 ; t6 t5 t4 t3 t2 t1 t0 lt - PALIGNR m1, m0, 7*SIZEOF_PIXEL, m6 ; l0 l1 l2 l3 l4 l5 l6 l7 - PALIGNR m4, m3, 1*SIZEOF_PIXEL, m7 ; t0 lt l0 l1 l2 l3 l4 l5 - mova m5, m3 - pavg%1 m3, m1 - PRED8x8_LOWPASS %1, m0, m4, m1, m5, m7 - psrl%2 m4, m2, 2*%4 ; .. .. t6 t5 t4 t3 t2 t1 - psrl%2 m1, m2, %4 ; .. t6 t5 t4 t3 t2 t1 t0 - PRED8x8_LOWPASS %1, m6, m4, m2, m1, m5 + palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt + palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5 +%else + movu m2, [r1+15*SIZEOF_PIXEL] + movu m4, [r1+ 9*SIZEOF_PIXEL] +%endif ; cpuflag + pavg%1 m3, m0, m1 + PRED8x8_LOWPASS m0, m4, m1, m0, m5 + PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1 + PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0 + PRED8x8_LOWPASS m1, m4, m2, m1, m5 ; .. 
p11 p10 p9 - punpckh%3 m7, m3, m0 ; p8 p7 p6 p5 - punpckl%3 m3, m0 ; p4 p3 p2 p1 - mova m1, m7 - mova m0, m7 - mova m4, m7 + punpckh%2 m2, m3, m0 ; p8 p7 p6 p5 + punpckl%2 m3, m0 ; p4 p3 p2 p1 mova [r0+3*FDEC_STRIDEB], m3 - PALIGNR m7, m3, 2*SIZEOF_PIXEL, m5 - mova [r0+2*FDEC_STRIDEB], m7 - PALIGNR m1, m3, 4*SIZEOF_PIXEL, m5 - mova [r0+1*FDEC_STRIDEB], m1 - PALIGNR m0, m3, 6*SIZEOF_PIXEL, m3 + PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5 + mova [r0+2*FDEC_STRIDEB], m0 + PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5 + mova [r0+1*FDEC_STRIDEB], m0 + PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3 mova [r0+0*FDEC_STRIDEB], m0 - mova m2, m6 - mova m3, m6 - mova [r0-1*FDEC_STRIDEB], m4 - PALIGNR m6, m4, 2*SIZEOF_PIXEL, m5 - mova [r0-2*FDEC_STRIDEB], m6 - PALIGNR m2, m4, 4*SIZEOF_PIXEL, m5 - mova [r0-3*FDEC_STRIDEB], m2 - PALIGNR m3, m4, 6*SIZEOF_PIXEL, m4 - mova [r0-4*FDEC_STRIDEB], m3 + mova [r0-1*FDEC_STRIDEB], m2 + PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5 + mova [r0-2*FDEC_STRIDEB], m0 + PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5 + mova [r0-3*FDEC_STRIDEB], m0 + PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2 + mova [r0-4*FDEC_STRIDEB], m1 RET %endmacro %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 -PREDICT_8x8_HD w, dq, wd, 2 +PREDICT_8x8_HD w, wd INIT_XMM ssse3 -PREDICT_8x8_HD w, dq, wd, 2 +PREDICT_8x8_HD w, wd INIT_XMM avx -PREDICT_8x8_HD w, dq, wd, 2 +PREDICT_8x8_HD w, wd %else INIT_MMX mmx2 -PREDICT_8x8_HD b, q , bw, 8 +PREDICT_8x8_HD b, bw ;----------------------------------------------------------------------------- ; void predict_8x8_hd( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HD 0 cglobal predict_8x8_hd, 2,2 - add r0, 4*FDEC_STRIDE - movdqa xmm0, [r1] - movdqa xmm1, [r1+16] - movdqa xmm2, xmm1 - movdqa xmm3, xmm1 - PALIGNR xmm1, xmm0, 7, xmm4 - PALIGNR xmm2, xmm0, 9, xmm5 - PALIGNR xmm3, xmm0, 8, xmm0 - pavgb xmm4, xmm1, xmm3 - PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5 - punpcklbw xmm4, xmm0 - movhlps xmm0, xmm4 + add r0, 4*FDEC_STRIDE + movu m1, [r1+7] + movu m3, [r1+8] + movu m2, [r1+9] + pavgb m4, m1, m3 + PRED8x8_LOWPASS m0, m1, m2, m3, m5 + punpcklbw m4, m0 + movhlps m0, m4 %assign Y 3 %rep 3 - movq [r0+(Y)*FDEC_STRIDE], xmm4 - movq [r0+(Y-4)*FDEC_STRIDE], xmm0 - psrldq xmm4, 2 - psrldq xmm0, 2 + movq [r0+(Y)*FDEC_STRIDE], m4 + movq [r0+(Y-4)*FDEC_STRIDE], m0 + psrldq m4, 2 + psrldq m0, 2 %assign Y (Y-1) %endrep - movq [r0+(Y)*FDEC_STRIDE], xmm4 - movq [r0+(Y-4)*FDEC_STRIDE], xmm0 + movq [r0+(Y)*FDEC_STRIDE], m4 + movq [r0+(Y-4)*FDEC_STRIDE], m0 RET %endmacro INIT_XMM sse2 PREDICT_8x8_HD -INIT_XMM ssse3 -PREDICT_8x8_HD INIT_XMM avx PREDICT_8x8_HD %endif ; HIGH_BIT_DEPTH +%ifndef HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_8x8_hu( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -%macro PREDICT_8x8_HU 0 -cglobal predict_8x8_hu, 2,2 +INIT_MMX +cglobal predict_8x8_hu_sse2, 2,2 add r0, 4*FDEC_STRIDE -%if cpuflag(ssse3) - movq mm5, [r1+7] - movq mm6, [pb_reverse] - movq mm1, mm5 - movq mm2, mm5 - movq mm3, mm5 - pshufb mm5, mm6 - psrlq mm6, 8 - pshufb mm2, mm6 - psrlq mm6, 8 - pshufb mm3, mm6 - movq mm4, mm5 -%else movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 movq mm2, mm0 @@ -1451,9 +1485,8 @@ cglobal predict_8x8_hu, 2,2 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 punpckhbw mm1, mm1 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 -%endif pavgb mm4, mm2 - PRED8x8_LOWPASS b, mm1, 
mm3, mm5, mm2, mm6 + PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 movq2dq xmm0, mm4 movq2dq xmm1, mm1 @@ -1474,14 +1507,29 @@ cglobal predict_8x8_hu, 2,2 movq [r0+2*FDEC_STRIDE], mm6 movq [r0+3*FDEC_STRIDE], mm7 RET -%endmacro -%ifndef HIGH_BIT_DEPTH -INIT_MMX sse2 -PREDICT_8x8_HU -INIT_MMX ssse3 -PREDICT_8x8_HU -%endif +INIT_XMM +cglobal predict_8x8_hu_ssse3, 2,2 + add r0, 4*FDEC_STRIDE + movq m3, [r1+7] + pshufb m3, [shuf_hu] + psrldq m1, m3, 1 + psrldq m2, m3, 2 + pavgb m0, m1, m3 + PRED8x8_LOWPASS m1, m3, m2, m1, m4 + punpcklbw m0, m1 +%assign Y -4 +%rep 3 + movq [r0+ Y *FDEC_STRIDE], m0 + movhps [r0+(Y+4)*FDEC_STRIDE], m0 + psrldq m0, 2 + pshufhw m0, m0, q2210 +%assign Y (Y+1) +%endrep + movq [r0+ Y *FDEC_STRIDE], m0 + movhps [r0+(Y+4)*FDEC_STRIDE], m0 + RET +%endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_8x8c_v( uint8_t *src ) @@ -1508,14 +1556,14 @@ INIT_MMX cglobal predict_8x8c_v_mmx, 1,1 mova m0, [r0 - FDEC_STRIDEB] mova m1, [r0 - FDEC_STRIDEB + 8] -%assign n 0 +%assign Y 0 %rep 8 - mova [r0 + (n&1)*FDEC_STRIDEB], m0 - mova [r0 + (n&1)*FDEC_STRIDEB + 8], m1 -%if (n&1) && (n!=7) + mova [r0 + (Y&1)*FDEC_STRIDEB], m0 + mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1 +%if (Y&1) && (Y!=7) add r0, FDEC_STRIDEB*2 %endif -%assign n n+1 +%assign Y Y+1 %endrep RET @@ -1529,16 +1577,16 @@ cglobal predict_8x8c_v_mmx, 1,1 INIT_XMM sse2 cglobal predict_8x8c_h, 1,1 add r0, FDEC_STRIDEB*4 -%assign n -4 +%assign Y -4 %rep 8 - movd m0, [r0+FDEC_STRIDEB*n-SIZEOF_PIXEL*2] + movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2] SPLATW m0, m0, 1 - mova [r0+FDEC_STRIDEB*n], m0 -%assign n n+1 + mova [r0+FDEC_STRIDEB*Y], m0 +%assign Y Y+1 %endrep RET -%else +%else ; !HIGH_BIT_DEPTH %macro PREDICT_8x8C_H 0 cglobal predict_8x8c_h, 1,1 @@ -1546,11 +1594,11 @@ cglobal predict_8x8c_h, 1,1 mova m1, [pb_3] %endif add r0, FDEC_STRIDE*4 -%assign n -4 +%assign Y -4 %rep 8 - SPLATB_LOAD m0, r0+FDEC_STRIDE*n-1, m1 - mova [r0+FDEC_STRIDE*n], m0 -%assign n n+1 + SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1 + mova [r0+FDEC_STRIDE*Y], m0 +%assign Y Y+1 %endrep RET %endmacro @@ -1573,7 +1621,7 @@ cglobal predict_8x8c_dc, 1,3 movq m1, [r0-FDEC_STRIDEB+8] HADDW m0, m2 HADDW m1, m2 -%else +%else ; !HIGH_BIT_DEPTH movd m0, [r0-FDEC_STRIDEB+0] movd m1, [r0-FDEC_STRIDEB+4] psadbw m0, m7 ; s0 @@ -1613,37 +1661,37 @@ cglobal predict_8x8c_dc, 1,3 punpcklwd xmm0, xmm0 pshufd xmm1, xmm0, q3322 punpckldq xmm0, xmm0 -%assign n 0 +%assign Y 0 %rep 8 -%assign i (0 + (n/4)) - movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i -%assign n n+1 +%assign i (0 + (Y/4)) + movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i +%assign Y Y+1 %endrep -%else +%else ; !sse2 pshufw m1, m0, q0000 pshufw m2, m0, q1111 pshufw m3, m0, q2222 pshufw m4, m0, q3333 -%assign n 0 +%assign Y 0 %rep 8 -%assign i (1 + (n/4)*2) -%assign j (2 + (n/4)*2) - movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i - movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j -%assign n n+1 +%assign i (1 + (Y/4)*2) +%assign j (2 + (Y/4)*2) + movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i + movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j +%assign Y Y+1 %endrep %endif -%else +%else ; !HIGH_BIT_DEPTH packuswb m0, m0 punpcklbw m0, m0 movq m1, m0 punpcklbw m0, m0 punpckhbw m1, m1 -%assign n 0 +%assign Y 0 %rep 8 -%assign i (0 + (n/4)) - movq [r0+FDEC_STRIDEB*(n-4)], m %+ i -%assign n n+1 +%assign i (0 + (Y/4)) + movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i +%assign Y Y+1 %endrep %endif RET @@ -1657,7 +1705,6 @@ PREDICT_8x8C_DC %endif %ifdef HIGH_BIT_DEPTH - INIT_XMM cglobal predict_8x8c_dc_top_sse2, 1,1 pxor m2, 
m2 @@ -1671,9 +1718,7 @@ cglobal predict_8x8c_dc_top_sse2, 1,1 pavgw m0, m2 STORE8x8 m0, m0 RET - -%else - +%else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_8x8c_dc_top_mmx2, 1,1 movq mm0, [r0 - FDEC_STRIDE] @@ -1692,7 +1737,6 @@ cglobal predict_8x8c_dc_top_mmx2, 1,1 packuswb mm0, mm1 ; dc0,dc1 (b) STORE8x8 mm0, mm0 RET - %endif ;----------------------------------------------------------------------------- @@ -1713,7 +1757,7 @@ cglobal predict_16x16_v_sse2, 2,2 mova m1, [r0 - FDEC_STRIDEB+16] STORE16x16_SSE2 m0, m1 REP_RET -%else +%else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_16x16_v_mmx, 1,2 movq m0, [r0 - FDEC_STRIDE + 0] @@ -1735,32 +1779,32 @@ cglobal predict_16x16_h, 1,2 mov r1, 12*FDEC_STRIDEB %ifdef HIGH_BIT_DEPTH .vloop: -%assign n 0 +%assign Y 0 %rep 4 - movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL] + movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL] SPLATW m0, m0, 1 - mova [r0+r1+n*FDEC_STRIDEB+ 0], m0 - mova [r0+r1+n*FDEC_STRIDEB+16], m0 + mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0 + mova [r0+r1+Y*FDEC_STRIDEB+16], m0 %if mmsize==8 - mova [r0+r1+n*FDEC_STRIDEB+ 8], m0 - mova [r0+r1+n*FDEC_STRIDEB+24], m0 + mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0 + mova [r0+r1+Y*FDEC_STRIDEB+24], m0 %endif -%assign n n+1 +%assign Y Y+1 %endrep -%else +%else ; !HIGH_BIT_DEPTH %if cpuflag(ssse3) mova m1, [pb_3] %endif .vloop: -%assign n 0 +%assign Y 0 %rep 4 - SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*n-1, m1 - mova [r0+r1+FDEC_STRIDE*n], m0 + SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1 + mova [r0+r1+FDEC_STRIDE*Y], m0 %if mmsize==8 - mova [r0+r1+FDEC_STRIDE*n+8], m0 + mova [r0+r1+FDEC_STRIDE*Y+8], m0 %endif -%assign n n+1 +%assign Y Y+1 %endrep %endif ; HIGH_BIT_DEPTH sub r1, 4*FDEC_STRIDEB @@ -1794,7 +1838,7 @@ PREDICT_16x16_H psrlw m0, %2 SPLATW m0, m0 STORE16x16 m0, m0, m0, m0 -%else +%else ; !HIGH_BIT_DEPTH pxor m0, m0 pxor m1, m1 psadbw m0, [r0 - FDEC_STRIDE] @@ -1830,7 +1874,7 @@ cglobal predict_16x16_dc_left_core_mmx2, 1,2 SPLATW m0, m0 STORE16x16 m0, m0, m0, m0 REP_RET -%else +%else ; !HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core_mmx2, 1,1 movd m0, r1m pshufw m0, m0, 0 @@ -1852,7 +1896,7 @@ cglobal predict_16x16_dc_left_core_mmx2, 1,1 psrlw m0, %2 SPLATW m0, m0 STORE16x16_SSE2 m0, m0 -%else +%else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] movhlps m1, m0 @@ -1882,7 +1926,7 @@ cglobal predict_16x16_dc_left_core_sse2, 1,2 SPLATW m0, m0 STORE16x16_SSE2 m0, m0 REP_RET -%else +%else ; !HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core_sse2, 1,1 movd m0, r1m SPLATW m0, m0 diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 59cce04b..a4f75adc 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -69,15 +69,19 @@ void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] ); + void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] ); + void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] ); + void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] ); + void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] ); void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] ); void 
x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] ); - void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[36] ); + void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] ); void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] ); void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] ); @@ -99,6 +103,7 @@ void x264_predict_4x4_vr_mmx2( uint8_t *src ); void x264_predict_4x4_vr_sse2( uint16_t *src ); void x264_predict_4x4_vr_ssse3( pixel *src ); + void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src ); void x264_predict_4x4_vr_avx( uint16_t *src ); void x264_predict_4x4_hd_mmx2( pixel *src ); void x264_predict_4x4_hd_sse2( uint16_t *src ); @@ -466,13 +471,19 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ *predict_8x8_filter = x264_predict_8x8_filter_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) + { + pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64; + pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64; + } if( !(cpu&X264_CPU_AVX) ) return; - pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx; @@ -501,7 +512,8 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; - pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( !(cpu&X264_CPU_AVX) ) @@ -532,21 +544,26 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2; + if( !(cpu&X264_CPU_SSSE3) ) + return; + pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3; + pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; + pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx; -#if ARCH_X86_64 pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx; -#endif pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx; #else pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2; -#endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) + pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64; +#endif // HIGH_BIT_DEPTH } diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index fcfa16f8..07581d22 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -322,6 +322,22 @@ %endif %endmacro +; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes +; values shifted in are undefined +; faster if dst==src +%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift +%define 
PSRLPIX PSXLPIX r, 1, ;dst, src, shift +%macro PSXLPIX 5 + %if mmsize == 8 + %if %5&1 + ps%1lq %3, %4, %5*8 + %else + pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff + %endif + %else + ps%1ldq %3, %4, %5*2 + %endif +%endmacro %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from %ifnum %5 diff --git a/encoder/analyse.c b/encoder/analyse.c index 2352c3e2..f4652e74 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -834,7 +834,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ /* 8x8 prediction selection */ if( flags & X264_ANALYSE_I8x8 ) { - ALIGNED_ARRAY_16( pixel, edge,[36] ); + ALIGNED_ARRAY_32( pixel, edge,[36] ); x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); @@ -1189,7 +1189,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) } else if( h->mb.i_type == I_8x8 ) { - ALIGNED_ARRAY_16( pixel, edge,[3],[48] ); + ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap pixel4 pels_h[3][2] = {{0}}; pixel pels_v[3][7] = {{0}}; uint16_t nnz[3][2] = {{0}}; //shut up gcc diff --git a/encoder/macroblock.c b/encoder/macroblock.c index adee51de..a8768c57 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -171,7 +171,7 @@ void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); - ALIGNED_ARRAY_16( pixel, edge_buf,[36] ); + ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( !edge ) { diff --git a/encoder/rdo.c b/encoder/rdo.c index 97b68f4a..f994fa02 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -273,7 +273,7 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) return (i_ssd<<8) + i_bits; } -static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[3][48] ) +static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[4][32] ) { uint64_t i_ssd, i_bits; int plane_count = CHROMA444 ? 3 : 1; diff --git a/tools/checkasm.c b/tools/checkasm.c index f6d3bc93..b25d745a 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1784,8 +1784,8 @@ static int check_quant( int cpu_ref, int cpu_new ) static int check_intra( int cpu_ref, int cpu_new ) { int ret = 0, ok = 1, used_asm = 0; - ALIGNED_16( pixel edge[36] ); - ALIGNED_16( pixel edge2[36] ); + ALIGNED_ARRAY_32( pixel, edge,[36] ); + ALIGNED_ARRAY_32( pixel, edge2,[36] ); ALIGNED_16( pixel fdec[FDEC_STRIDE*20] ); struct { @@ -1871,8 +1871,8 @@ static int check_intra( int cpu_ref, int cpu_new ) if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) ) continue; int neighbor = (i&24)>>1; - memset( edge, 0, sizeof(edge) ); - memset( edge2, 0, sizeof(edge2) ); + memset( edge, 0, 36*sizeof(pixel) ); + memset( edge2, 0, 36*sizeof(pixel) ); call_c( ip_c.predict_8x8_filter, pbuf1+48, edge, neighbor, i&7 ); call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 ); if( !(neighbor&MB_TOPLEFT) ) -- 2.40.0
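For reference, the ALIGNED_ARRAY_EMU pattern that backs the new ALIGNED_ARRAY_32 over-allocates a byte buffer and rounds its address up to the next 32-byte boundary, which is what lets the ssse3_cache64 predictors treat their loads from edge[] as never cacheline-split. A hand-expanded C sketch of the same trick (illustrative names, not taken from the patch):

    #include <stdint.h>

    void edge_example( void )
    {
        /* hand expansion of ALIGNED_ARRAY_32( uint16_t, edge,[36] ):
         * over-allocate by 31 bytes, then round the address up so the
         * array starts on a 32-byte boundary. */
        uint8_t  edge_u[sizeof(uint16_t [36]) + 31];
        uint16_t *edge = (void*)((intptr_t)(edge_u + 31) & ~31);

        edge[0] = 0; /* used exactly like a normal, 32-byte-aligned array */
    }

ALIGNED_ARRAY_8 and _16 still prefer the compiler attribute where the platform supports it, while the new 32-byte variant always goes through this emulation.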