From abde94f64a2232f2ef6fb423d6138633442ef87a Mon Sep 17 00:00:00 2001 From: Oskar Arvidsson Date: Sat, 30 Oct 2010 20:16:33 +0200 Subject: [PATCH] x86 asm for high-bit-depth pixel metrics Overall speed change from these 6 asm patches: ~4.4x. But there's still tons more asm to do -- patches welcome! Breakdown from this patch: ~13x faster SAD than C. ~11.5x faster SATD than C (only MMX done). ~18.5x faster SA8D than C. ~19.2x faster hadamard_ac than C. ~8.3x faster SSD than C. ~12.4x faster VAR than C. ~3-4.2x faster intra SAD than C. ~7.9x faster intra SATD than C. --- Makefile | 7 +- common/pixel.c | 150 +++++++-- common/x86/pixel-a.asm | 698 ++++++++++++++++++++++++++++++++--------- common/x86/pixel.h | 59 ++-- common/x86/sad16-a.asm | 432 +++++++++++++++++++++++++ common/x86/x86util.asm | 26 ++ 6 files changed, 1175 insertions(+), 197 deletions(-) create mode 100644 common/x86/sad16-a.asm diff --git a/Makefile b/Makefile index ebf1f621..8957ee10 100644 --- a/Makefile +++ b/Makefile @@ -59,8 +59,13 @@ endif # MMX/SSE optims ifneq ($(AS),) X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \ - mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \ + mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm \ cpu-a.asm dct-32.asm bitstream-a.asm +ifneq ($(findstring X264_HIGH_BIT_DEPTH, $(CONFIG)),) +X86SRC0 += sad16-a.asm +else +X86SRC0 += sad-a.asm +endif X86SRC = $(X86SRC0:%=common/x86/%) ifeq ($(ARCH),X86) diff --git a/common/pixel.c b/common/pixel.c index eb442dc1..c932583d 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -482,51 +482,72 @@ SATD_X_DECL6( cpu )\ SATD_X( 4x4, cpu ) SATD_X_DECL7() -#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX SATD_X_DECL7( _mmxext ) +#if !X264_HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) SATD_X_DECL7( _sse4 ) +#endif // !X264_HIGH_BIT_DEPTH #endif +#if !X264_HIGH_BIT_DEPTH #if HAVE_ARMV6 SATD_X_DECL7( _neon ) #endif #endif // !X264_HIGH_BIT_DEPTH -#define INTRA_MBCMP_8x8( mbcmp )\ -void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\ +#define INTRA_MBCMP_8x8( mbcmp, cpu )\ +void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[33], int res[3] )\ {\ pixel pix[8*FDEC_STRIDE];\ x264_predict_8x8_v_c( pix, edge );\ - res[0] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_8x8_h_c( pix, edge );\ - res[1] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_8x8_dc_c( pix, edge );\ - res[2] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ } -INTRA_MBCMP_8x8(sad) -INTRA_MBCMP_8x8(sa8d) +INTRA_MBCMP_8x8( sad, ) +INTRA_MBCMP_8x8(sa8d, ) +#if X264_HIGH_BIT_DEPTH && HAVE_MMX +INTRA_MBCMP_8x8( sad, _mmxext) +INTRA_MBCMP_8x8( sad, _sse2 ) +INTRA_MBCMP_8x8( sad, _ssse3 ) +INTRA_MBCMP_8x8(sa8d, _sse2 ) +#endif -#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\ -void x264_intra_##mbcmp##_x3_##size##x##size##chroma( pixel *fenc, pixel *fdec, int res[3] )\ +#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu )\ +void x264_intra_##mbcmp##_x3_##size##x##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ {\ x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\ - res[0] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[0] = 
x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\ - res[1] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[1] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\ - res[2] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ + res[2] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ } -INTRA_MBCMP(sad, 4, v, h, dc, ) -INTRA_MBCMP(satd, 4, v, h, dc, ) -INTRA_MBCMP(sad, 8, dc, h, v, c ) -INTRA_MBCMP(satd, 8, dc, h, v, c ) -INTRA_MBCMP(sad, 16, v, h, dc, ) -INTRA_MBCMP(satd, 16, v, h, dc, ) +INTRA_MBCMP( sad, 4, v, h, dc, , ) +INTRA_MBCMP(satd, 4, v, h, dc, , ) +INTRA_MBCMP( sad, 8, dc, h, v, c, ) +INTRA_MBCMP(satd, 8, dc, h, v, c, ) +INTRA_MBCMP( sad, 16, v, h, dc, , ) +INTRA_MBCMP(satd, 16, v, h, dc, , ) + +#if X264_HIGH_BIT_DEPTH && HAVE_MMX +INTRA_MBCMP( sad, 4, v, h, dc, , _mmxext) +INTRA_MBCMP(satd, 4, v, h, dc, , _mmxext) +INTRA_MBCMP( sad, 8, dc, h, v, c, _mmxext) +INTRA_MBCMP(satd, 8, dc, h, v, c, _mmxext) +INTRA_MBCMP( sad, 16, v, h, dc, , _mmxext) +INTRA_MBCMP(satd, 16, v, h, dc, , _mmxext) +INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 ) +INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 ) +INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 ) +INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 ) +#endif /**************************************************************************** * structural similarity metric @@ -719,7 +740,94 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16; -#if !X264_HIGH_BIT_DEPTH +#if X264_HIGH_BIT_DEPTH +#if HAVE_MMX + if( cpu&X264_CPU_MMXEXT ) + { + INIT7( sad, _mmxext ); + INIT7( sad_x3, _mmxext ); + INIT7( sad_x4, _mmxext ); + INIT7( satd, _mmxext ); + INIT7( satd_x3, _mmxext ); + INIT7( satd_x4, _mmxext ); + INIT4( hadamard_ac, _mmxext ); + INIT7( ssd, _mmxext ); + INIT_ADS( _mmxext ); + + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext; + pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext; + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; + } + if( cpu&X264_CPU_SSE2 ) + { + INIT4_NAME( sad_aligned, sad, _sse2_aligned ); + INIT5( ssd, _sse2 ); + + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; +#if ARCH_X86_64 + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; +#endif + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; + pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; + } + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) + { + INIT5( sad, _sse2 ); + INIT2( sad_x3, _sse2 ); + INIT2( sad_x4, _sse2 ); + INIT_ADS( _sse2 ); + + if( !(cpu&X264_CPU_STACK_MOD4) ) + { + INIT4( hadamard_ac, _sse2 ); + } + + pixf->intra_sad_x3_8x8 = 
x264_intra_sad_x3_8x8_sse2; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; + } + if( cpu&X264_CPU_SSE2_IS_FAST ) + { + pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; + pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2; + pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2; + pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2; + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2; + pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2; + } + if( cpu&X264_CPU_SSSE3 ) + { + INIT7( sad, _ssse3 ); + INIT7( sad_x3, _ssse3 ); + INIT7( sad_x4, _ssse3 ); + INIT_ADS( _ssse3 ); + + if( !(cpu&X264_CPU_STACK_MOD4) ) + { + INIT4( hadamard_ac, _ssse3 ); + } + + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; + } +#endif // HAVE_MMX +#else // !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { @@ -947,7 +1055,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } } #endif -#endif // !X264_HIGH_BIT_DEPTH +#endif // X264_HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 35a454d0..420b04b3 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -8,6 +8,7 @@ ;* Laurent Aimar ;* Alex Izvorski ;* Fiona Glaser +;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -46,7 +47,7 @@ mask_1100: times 2 dd 0, -1 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pd_f0: times 4 dd 0xffff0000 -pq_0f: times 2 dd 0xffffffff, 0 +sq_0f: times 1 dq 0xffffffff SECTION .text @@ -55,36 +56,95 @@ cextern pw_00ff cextern hsub_mul -%macro HADDD 2 ; sum junk -%if mmsize == 16 - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%else - pshufw %2, %1, 0xE - paddd %1, %2 -%endif -%endmacro +;============================================================================= +; SSD +;============================================================================= -%macro HADDW 2 - pmaddwd %1, [pw_1] - HADDD %1, %2 +%ifdef X264_HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int ) +;----------------------------------------------------------------------------- +%macro SSD_ONE 3 +cglobal pixel_ssd_%1x%2_%3, 4,5,6*(mmsize/16) + mov r4, %1*%2/mmsize + pxor m0, m0 +.loop + mova m1, [r0] +%if %1 <= mmsize/2 + mova m3, [r0+r1*2] + %define offset r3*2 + %define num_rows 2 +%else + mova m3, [r0+mmsize] + %define offset mmsize + %define num_rows 1 +%endif + psubw m1, [r2] + psubw m3, [r2+offset] + pmaddwd m1, m1 + pmaddwd m3, m3 + dec r4 + lea r0, [r0+r1*2*num_rows] + lea r2, [r2+r3*2*num_rows] + paddd m0, m1 + paddd m0, m3 + jg .loop + HADDD m0, m5 + movd eax, m0 + RET %endmacro -%macro HADDUW 2 - mova %2, %1 - pslld %1, 16 - psrld %2, 16 - psrld %1, 16 - paddd %1, %2 - HADDD %1, %2 +%macro SSD_16_MMX 2 +cglobal pixel_ssd_%1x%2_mmxext, 4,5 + mov r4, %1*%2/mmsize/2 + pxor m0, m0 +.loop + mova m1, [r0] + mova m2, [r2] + mova m3, [r0+mmsize] + mova m4, [r2+mmsize] + mova m5, [r0+mmsize*2] + mova m6, [r2+mmsize*2] + mova m7, 
[r0+mmsize*3] + psubw m1, m2 + psubw m3, m4 + mova m2, [r2+mmsize*3] + psubw m5, m6 + pmaddwd m1, m1 + psubw m7, m2 + pmaddwd m3, m3 + pmaddwd m5, m5 + dec r4 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + pmaddwd m7, m7 + paddd m1, m3 + paddd m5, m7 + paddd m0, m1 + paddd m0, m5 + jg .loop + HADDD m0, m7 + movd eax, m0 + RET %endmacro -;============================================================================= -; SSD -;============================================================================= - +INIT_MMX +SSD_ONE 4, 4, mmxext +SSD_ONE 4, 8, mmxext +SSD_ONE 8, 4, mmxext +SSD_ONE 8, 8, mmxext +SSD_ONE 8, 16, mmxext +SSD_16_MMX 16, 8 +SSD_16_MMX 16, 16 +INIT_XMM +SSD_ONE 8, 4, sse2 +SSD_ONE 8, 8, sse2 +SSD_ONE 8, 16, sse2 +SSD_ONE 16, 8, sse2 +SSD_ONE 16, 16, sse2 +%endif ; X264_HIGH_BIT_DEPTH + +%ifndef X264_HIGH_BIT_DEPTH %macro SSD_LOAD_FULL 5 mova m1, [t0+%1] mova m2, [t2+%2] @@ -310,9 +370,89 @@ INIT_MMX SSD 4, 4, ssse3 SSD 4, 8, ssse3 %assign function_align 16 +%endif ; !X264_HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2, +; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) +; +; The maximum width this function can handle without risk of overflow is given +; in the following equation: +; +; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2 +; +; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane +; distortion levels it will take much more than that though. +;----------------------------------------------------------------------------- +%ifdef X264_HIGH_BIT_DEPTH +%macro SSD_NV12 1-2 0 +cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16) + shl r4d, 2 + FIX_STRIDES r1, r3 + add r0, r4 + add r2, r4 + xor r6, r6 + pxor m4, m4 + pxor m5, m5 + mova m6, [sq_0f] +.loopy: + mov r6, r4 + neg r6 + pxor m2, m2 + pxor m3, m3 +.loopx: + mova m0, [r0+r6] + mova m1, [r0+r6+mmsize] + psubw m0, [r2+r6] + psubw m1, [r2+r6+mmsize] +%if mmsize == 8 + pshufw m0, m0, 11011000b + pshufw m1, m1, 11011000b +%else + pshuflw m0, m0, 11011000b + pshuflw m1, m1, 11011000b + pshufhw m0, m0, 11011000b + pshufhw m1, m1, 11011000b +%endif + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + add r6, 2*mmsize + jl .loopx +%if mmsize == 8 + SBUTTERFLY dq, 2, 3, 1 +%else + mova m1, m2 + shufps m2, m3, 10001000b + shufps m3, m1, 11011101b +%endif + HADDD m2, m1 + HADDD m3, m1 + pand m2, m6 + pand m3, m6 + paddq m4, m2 + paddq m5, m3 + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + mov r3, r6m + mov r4, r7m + movq [r3], m4 + movq [r4], m5 + RET +%endmacro ; SSD_NV12 +%endif ; X264_HIGH_BIT_DEPTH +%ifndef X264_HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, +; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) +; +; This implementation can potentially overflow on image widths >= 11008 (or +; 6604 if interlaced), since it is called on blocks of height up to 12 (resp +; 20). At sane distortion levels it will take much more than that though. 
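;
; (Numeric check of the high-bit-depth bound quoted above, assuming mmsize
; in that formula is measured in bits, i.e. 64 for MMX and 128 for XMM --
; with the byte-sized mmsize used elsewhere the quoted numbers do not come
; out:
;
;   (2^32 - 1) / (2^10 - 1)^2 = 4294967295 / 1046529 ~= 4104
;   MMX:  2 *  64/32 * 4104 ~= 16416
;   XMM:  2 * 128/32 * 4104 ~= 32832
;
; i.e. each dword accumulator lane can absorb roughly 4104 worst-case
; 10-bit squared differences before it can wrap.)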
;----------------------------------------------------------------------------- %macro SSD_NV12 1-2 0 cglobal pixel_ssd_nv12_core_%1, 6,7 @@ -346,7 +486,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7 jg .loopy mov r3, r6m mov r4, r7m - mova m5, [pq_0f] + mova m5, [sq_0f] HADDD m3, m0 HADDD m4, m0 pand m3, m5 @@ -355,6 +495,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7 movq [r4], m4 RET %endmacro ; SSD_NV12 +%endif ; !X264_HIGHT_BIT_DEPTH INIT_MMX SSD_NV12 mmxext @@ -368,15 +509,25 @@ SSD_NV12 sse2 %macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared +%ifndef X264_HIGH_BIT_DEPTH %if %1 mova m7, [pw_00ff] %else pxor m7, m7 ; zero %endif +%endif ; !X264_HIGH_BIT_DEPTH %endmacro -%macro VAR_END 0 - HADDW m5, m7 +%macro VAR_END 2 +%ifdef X264_HIGH_BIT_DEPTH +%if mmsize == 8 && %1*%2 == 256 + HADDUW m5, m2 +%else + HADDW m5, m2 +%endif +%else ; !X264_HIGH_BIT_DEPTH + HADDW m5, m2 +%endif ; X264_HIGH_BIT_DEPTH movd eax, m5 HADDD m6, m1 movd edx, m6 @@ -405,19 +556,28 @@ SSD_NV12 sse2 %macro VAR_2ROW 2 mov r2d, %2 .loop: +%ifdef X264_HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+mmsize] + mova m3, [r0+%1] + mova m4, [r0+%1+mmsize] +%else ; !X264_HIGH_BIT_DEPTH mova m0, [r0] mova m1, m0 mova m3, [r0+%1] mova m4, m3 punpcklbw m0, m7 punpckhbw m1, m7 +%endif ; X264_HIGH_BIT_DEPTH %ifidn %1, r1 lea r0, [r0+%1*2] %else add r0, r1 %endif +%ifndef X264_HIGH_BIT_DEPTH punpcklbw m3, m7 punpckhbw m4, m7 +%endif ; !X264_HIGH_BIT_DEPTH dec r2d VAR_CORE jg .loop @@ -428,16 +588,43 @@ SSD_NV12 sse2 ;----------------------------------------------------------------------------- INIT_MMX cglobal pixel_var_16x16_mmxext, 2,3 + FIX_STRIDES r1 VAR_START 0 - VAR_2ROW 8, 16 - VAR_END + VAR_2ROW 8*SIZEOF_PIXEL, 16 + VAR_END 16, 16 cglobal pixel_var_8x8_mmxext, 2,3 + FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 4 - VAR_END + VAR_END 8, 8 INIT_XMM +%ifdef X264_HIGH_BIT_DEPTH +cglobal pixel_var_16x16_sse2, 2,3,8 + FIX_STRIDES r1 + VAR_START 0 + VAR_2ROW r1, 8 + VAR_END 16, 16 + +cglobal pixel_var_8x8_sse2, 2,3,8 + lea r2, [r1*3] + VAR_START 0 + mova m0, [r0] + mova m1, [r0+r1*2] + mova m3, [r0+r1*4] + mova m4, [r0+r2*2] + lea r0, [r0+r1*8] + VAR_CORE + mova m0, [r0] + mova m1, [r0+r1*2] + mova m3, [r0+r1*4] + mova m4, [r0+r2*2] + VAR_CORE + VAR_END 8, 8 +%endif ; X264_HIGH_BIT_DEPTH + +%ifndef X264_HIGH_BIT_DEPTH cglobal pixel_var_16x16_sse2, 2,3,8 VAR_START 1 mov r2d, 8 @@ -449,7 +636,7 @@ cglobal pixel_var_16x16_sse2, 2,3,8 VAR_CORE dec r2d jg .loop - VAR_END + VAR_END 16, 16 cglobal pixel_var_8x8_sse2, 2,4,8 VAR_START 1 @@ -465,7 +652,8 @@ cglobal pixel_var_8x8_sse2, 2,4,8 VAR_CORE dec r2d jg .loop - VAR_END + VAR_END 8, 8 +%endif ; !X264_HIGH_BIT_DEPTH %macro VAR2_END 0 HADDW m5, m7 @@ -480,17 +668,22 @@ cglobal pixel_var_8x8_sse2, 2,4,8 %endmacro ;----------------------------------------------------------------------------- -; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * ) +; int pixel_var2_8x8( pixel *, int, pixel *, int, int * ) ;----------------------------------------------------------------------------- -%ifndef ARCH_X86_64 INIT_MMX cglobal pixel_var2_8x8_mmxext, 5,6 + FIX_STRIDES r1, r3 VAR_START 0 mov r5d, 8 .loop: +%ifdef X264_HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+mmsize] + psubw m0, [r2] + psubw m1, [r2+mmsize] +%else ; !X264_HIGH_BIT_DEPTH movq m0, [r0] movq m1, m0 - movq m4, m0 movq m2, [r2] movq m3, m2 punpcklbw m0, m7 @@ -499,6 +692,7 @@ cglobal pixel_var2_8x8_mmxext, 5,6 punpckhbw m3, m7 psubw m0, m2 psubw m1, m3 +%endif ; X264_HIGH_BIT_DEPTH paddw m5, m0 paddw m5, m1 pmaddwd m0, m0 @@ 
-511,18 +705,24 @@ cglobal pixel_var2_8x8_mmxext, 5,6 jg .loop VAR2_END RET -%endif INIT_XMM cglobal pixel_var2_8x8_sse2, 5,6,8 VAR_START 1 mov r5d, 4 .loop: +%ifdef X264_HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r3*2] +%else ; !X264_HIGH_BIT_DEPTH movq m1, [r0] movhps m1, [r0+r1] movq m3, [r2] movhps m3, [r2+r3] DEINTB 0, 1, 2, 3, 7 +%endif ; X264_HIGH_BIT_DEPTH psubw m0, m2 psubw m1, m3 paddw m5, m0 @@ -531,13 +731,14 @@ cglobal pixel_var2_8x8_sse2, 5,6,8 pmaddwd m1, m1 paddd m6, m0 paddd m6, m1 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] dec r5d jg .loop VAR2_END RET +%ifndef X264_HIGH_BIT_DEPTH cglobal pixel_var2_8x8_ssse3, 5,6,8 pxor m5, m5 ; sum pxor m6, m6 ; sum squared @@ -580,6 +781,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 jg .loop VAR2_END RET +%endif ; !X264_HIGH_BIT_DEPTH ;============================================================================= ; SATD @@ -697,10 +899,11 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 ; out: %1 = satd %macro SATD_4x4_MMX 3 %xdefine %%n n%1 - LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2] - LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2] - LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2] - LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2] + %assign offset %2*SIZEOF_PIXEL + LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] + LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] + LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset] + LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset] %if %3 lea r0, [r0+4*r1] lea r2, [r2+4*r3] @@ -733,17 +936,23 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 %endmacro %macro SATD_START_MMX 0 + FIX_STRIDES r1, r3 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 %endmacro %macro SATD_END_MMX 0 +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 + movd eax, m0 +%else ; !X264_HIGH_BIT_DEPTH pshufw m1, m0, 01001110b paddw m0, m1 pshufw m1, m0, 10110001b paddw m0, m1 movd eax, m0 and eax, 0xffff +%endif ; X264_HIGH_BIT_DEPTH RET %endmacro @@ -777,6 +986,35 @@ pixel_satd_8x4_internal_mmxext: paddw m0, m1 ret +%ifdef X264_HIGH_BIT_DEPTH +%macro SATD_MxN_MMX 3 +cglobal pixel_satd_%1x%2_mmxext, 4,7 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_%1x%3_internal_mmxext + HADDUW m0, m1 + movd r6d, m0 +%rep %2/%3-1 + pxor m0, m0 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_%1x%3_internal_mmxext + movd m2, r4 + HADDUW m0, m1 + movd r4, m0 + add r6, r4 + movd r4, m2 +%endrep + movifnidn eax, r6d + RET +%endmacro + +SATD_MxN_MMX 16, 16, 4 +SATD_MxN_MMX 16, 8, 4 +SATD_MxN_MMX 8, 16, 8 +%endif ; X264_HIGH_BIT_DEPTH + +%ifndef X264_HIGH_BIT_DEPTH cglobal pixel_satd_16x16_mmxext, 4,6 SATD_START_MMX pxor m0, m0 @@ -807,6 +1045,7 @@ cglobal pixel_satd_8x16_mmxext, 4,6 lea r2, [r2+4*r3] call pixel_satd_8x8_internal_mmxext SATD_END_MMX +%endif ; !X264_HIGH_BIT_DEPTH cglobal pixel_satd_8x8_mmxext, 4,6 SATD_START_MMX @@ -1000,7 +1239,31 @@ cglobal pixel_satd_8x4_%1, 4,6,8 SATD_END_SSE2 %1, m6 %endmacro ; SATDS_SSE2 +%macro SA8D_INTER 0 +%ifdef ARCH_X86_64 + %define lh m10 + %define rh m0 +%else + %define lh m0 + %define rh [esp+48] +%endif +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 + paddd lh, rh +%else + paddusw lh, rh +%endif ; X264_HIGH_BIT_DEPTH +%endmacro + %macro SA8D 1 +%ifdef X264_HIGH_BIT_DEPTH + %define vertical 1 +%elifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things + %define vertical 1 +%else + %define vertical 0 +%endif + %ifdef ARCH_X86_64 
;----------------------------------------------------------------------------- ; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int ) @@ -1010,7 +1273,7 @@ cglobal pixel_sa8d_8x8_internal_%1 lea r11, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11 -%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things +%if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 HADAMARD4_V m0, m1, m2, m8, m6 @@ -1033,39 +1296,51 @@ cglobal pixel_sa8d_8x8_internal_%1 ret cglobal pixel_sa8d_8x8_%1, 4,6,12 + FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] -%ifnidn %1, sse2 +%if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal_%1 +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 +%else HADDW m0, m1 +%endif ; X264_HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x16_%1, 4,6,12 + FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] -%ifnidn %1, sse2 +%if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal_%1 ; pix[0] - add r2, 8 - add r0, 8 + add r2, 8*SIZEOF_PIXEL + add r0, 8*SIZEOF_PIXEL +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif mova m10, m0 call pixel_sa8d_8x8_internal_%1 ; pix[8] lea r2, [r2+8*r3] lea r0, [r0+8*r1] - paddusw m10, m0 + SA8D_INTER call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8] - sub r2, 8 - sub r0, 8 - paddusw m10, m0 + sub r2, 8*SIZEOF_PIXEL + sub r0, 8*SIZEOF_PIXEL + SA8D_INTER call pixel_sa8d_8x8_internal_%1 ; pix[8*stride] - paddusw m0, m10 + SA8D_INTER + SWAP m0, m10 +%ifndef X264_HIGH_BIT_DEPTH HADDUW m0, m1 +%endif movd eax, m0 add eax, 1 shr eax, 1 @@ -1077,7 +1352,7 @@ cglobal pixel_sa8d_8x8_internal_%1 %define spill0 [esp+4] %define spill1 [esp+20] %define spill2 [esp+36] -%ifidn %1, sse2 +%if vertical LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 HADAMARD4_2D 0, 1, 2, 3, 4 movdqa spill0, m3 @@ -1124,20 +1399,26 @@ cglobal pixel_sa8d_8x8_internal_%1 %endif ; ifndef mmxext cglobal pixel_sa8d_8x8_%1, 4,7 - mov r6, esp - and esp, ~15 - sub esp, 48 - lea r4, [3*r1] - lea r5, [3*r3] + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 48 + lea r4, [3*r1] + lea r5, [3*r3] call pixel_sa8d_8x8_internal_%1 - HADDW m0, m1 - movd eax, m0 - add eax, 1 - shr eax, 1 - mov esp, r6 +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif ; X264_HIGH_BIT_DEPTH + movd eax, m0 + add eax, 1 + shr eax, 1 + mov esp, r6 RET cglobal pixel_sa8d_16x16_%1, 4,7 + FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 @@ -1147,14 +1428,17 @@ cglobal pixel_sa8d_16x16_%1, 4,7 %ifidn %1, mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] +%endif +%ifdef X264_HIGH_BIT_DEPTH + HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal_%1 mov r0, [r6+20] mov r2, [r6+28] - add r0, 8 - add r2, 8 - paddusw m0, [esp+48] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal_%1 %ifidn %1, mmxext @@ -1162,10 +1446,13 @@ cglobal pixel_sa8d_16x16_%1, 4,7 lea r2, [r2+4*r3] %endif %if mmsize == 16 - paddusw m0, [esp+48] + SA8D_INTER %endif mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal_%1 +%ifdef X264_HIGH_BIT_DEPTH + SA8D_INTER +%else ; !X264_HIGH_BIT_DEPTH paddusw m0, [esp+64-mmsize] %if mmsize == 16 HADDUW m0, m1 @@ -1183,6 +1470,7 @@ cglobal pixel_sa8d_16x16_%1, 4,7 paddd m0, m2 HADDD m0, m1 %endif +%endif ; X264_HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 @@ -1669,6 +1957,12 @@ cglobal intra_satd_x3_8x8c_%1, 0,6 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, 
m7=0 ; out: [tmp]=hadamard4, m0=satd cglobal hadamard_ac_4x4_mmxext +%ifdef X264_HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+r1] + mova m2, [r0+r1*2] + mova m3, [r0+r2] +%else ; !X264_HIGH_BIT_DEPTH movh m0, [r0] movh m1, [r0+r1] movh m2, [r0+r1*2] @@ -1677,6 +1971,7 @@ cglobal hadamard_ac_4x4_mmxext punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 +%endif ; X264_HIGH_BIT_DEPTH HADAMARD4_2D 0, 1, 2, 3, 4 mova [r3], m0 mova [r3+8], m1 @@ -1703,30 +1998,60 @@ cglobal hadamard_ac_2x2max_mmxext ABS4 m0, m2, m1, m3, m4, m5 HADAMARD 0, max, 0, 2, 4, 5 HADAMARD 0, max, 1, 3, 4, 5 +%ifdef X264_HIGH_BIT_DEPTH + pmaddwd m0, m7 + pmaddwd m1, m7 + paddd m6, m0 + paddd m6, m1 +%else ; !X264_HIGH_BIT_DEPTH paddw m7, m0 paddw m7, m1 +%endif ; X264_HIGH_BIT_DEPTH SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext ret +%macro AC_PREP 2 +%ifdef X264_HIGH_BIT_DEPTH + pmaddwd %1, %2 +%endif +%endmacro + +%macro AC_PADD 3 +%ifdef X264_HIGH_BIT_DEPTH + AC_PREP %2, %3 + paddd %1, %2 +%else + paddw %1, %2 +%endif ; X264_HIGH_BIT_DEPTH +%endmacro + cglobal hadamard_ac_8x8_mmxext mova m6, [mask_ac4] +%ifdef X264_HIGH_BIT_DEPTH + mova m7, [pw_1] +%else pxor m7, m7 +%endif ; X264_HIGH_BIT_DEPTH call hadamard_ac_4x4_mmxext - add r0, 4 + add r0, 4*SIZEOF_PIXEL add r3, 32 mova m5, m0 + AC_PREP m5, m7 call hadamard_ac_4x4_mmxext lea r0, [r0+4*r1] add r3, 64 - paddw m5, m0 + AC_PADD m5, m0, m7 call hadamard_ac_4x4_mmxext - sub r0, 4 + sub r0, 4*SIZEOF_PIXEL sub r3, 32 - paddw m5, m0 + AC_PADD m5, m0, m7 call hadamard_ac_4x4_mmxext - paddw m5, m0 + AC_PADD m5, m0, m7 sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd +%ifdef X264_HIGH_BIT_DEPTH + pxor m6, m6 +%endif %rep 3 call hadamard_ac_2x2max_mmxext %endrep @@ -1738,20 +2063,77 @@ cglobal hadamard_ac_8x8_mmxext HADAMARD 0, sumsub, 0, 2, 4, 5 ABS4 m1, m3, m0, m2, m4, m5 HADAMARD 0, max, 1, 3, 4, 5 +%ifdef X264_HIGH_BIT_DEPTH + pand m0, [mask_ac4] + pmaddwd m1, m7 + pmaddwd m0, m7 + pmaddwd m2, m7 + paddd m6, m1 + paddd m0, m2 + paddd m6, m6 + paddd m0, m6 + SWAP m0, m6 +%else ; !X264_HIGH_BIT_DEPTH pand m6, m0 paddw m7, m1 paddw m6, m2 paddw m7, m7 paddw m6, m7 +%endif ; X264_HIGH_BIT_DEPTH mova [rsp+gprsize], m6 ; save sa8d SWAP m0, m6 SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext ret +%macro HADAMARD_AC_WXH_SUM_MMXEXT 2 + mova m1, [rsp+1*mmsize] +%ifdef X264_HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddd m0, [rsp+2*mmsize] + paddd m1, [rsp+3*mmsize] +%endif +%if %1*%2 == 256 + mova m2, [rsp+4*mmsize] + paddd m1, [rsp+5*mmsize] + paddd m2, [rsp+6*mmsize] + mova m3, m0 + paddd m1, [rsp+7*mmsize] + paddd m0, m2 +%endif + psrld m0, 1 + HADDD m0, m2 + psrld m1, 1 + HADDD m1, m3 +%else ; !X264_HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddusw m0, [rsp+2*mmsize] + paddusw m1, [rsp+3*mmsize] +%endif +%if %1*%2 == 256 + mova m2, [rsp+4*mmsize] + paddusw m1, [rsp+5*mmsize] + paddusw m2, [rsp+6*mmsize] + mova m3, m0 + paddusw m1, [rsp+7*mmsize] + pxor m3, m2 + pand m3, [pw_1] + pavgw m0, m2 + psubusw m0, m3 + HADDUW m0, m2 +%else + psrlw m0, 1 + HADDW m0, m2 +%endif + psrlw m1, 1 + HADDW m1, m3 +%endif ; X264_HIGH_BIT_DEPTH +%endmacro + %macro HADAMARD_AC_WXH_MMX 2 cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 + FIX_STRIDES r1 sub rsp, 16+128+pad lea r2, [r1*3] lea r3, [rsp+16] @@ -1765,7 +2147,7 @@ cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4 %if %1==16 neg ysub sub rsp, 16 - lea r0, [r0+ysub*4+8] + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8_mmxext %if %2==16 @@ -1774,28 +2156,7 @@ cglobal pixel_hadamard_ac_%1x%2_mmxext, 
2,4 call hadamard_ac_8x8_mmxext %endif %endif - mova m1, [rsp+0x08] -%if %1*%2 >= 128 - paddusw m0, [rsp+0x10] - paddusw m1, [rsp+0x18] -%endif -%if %1*%2 == 256 - mova m2, [rsp+0x20] - paddusw m1, [rsp+0x28] - paddusw m2, [rsp+0x30] - mova m3, m0 - paddusw m1, [rsp+0x38] - pxor m3, m2 - pand m3, [pw_1] - pavgw m0, m2 - psubusw m0, m3 - HADDUW m0, m2 -%else - psrlw m0, 1 - HADDW m0, m2 -%endif - psrlw m1, 1 - HADDW m1, m3 + HADAMARD_AC_WXH_SUM_MMXEXT %1, %2 movd edx, m0 movd eax, m1 shr edx, 1 @@ -1813,6 +2174,15 @@ HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 %macro LOAD_INC_8x4W_SSE2 5 +%ifdef X264_HIGH_BIT_DEPTH + movu m%1, [r0] + movu m%2, [r0+r1] + movu m%3, [r0+r1*2] + movu m%4, [r0+r2] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif +%else ; !X264_HIGH_BIT_DEPTH movh m%1, [r0] movh m%2, [r0+r1] movh m%3, [r0+r1*2] @@ -1824,6 +2194,7 @@ HADAMARD_AC_WXH_MMX 8, 8 punpcklbw m%2, m%5 punpcklbw m%3, m%5 punpcklbw m%4, m%5 +%endif ; X264_HIGH_BIT_DEPTH %endmacro %macro LOAD_INC_8x4W_SSSE3 5 @@ -1848,15 +2219,19 @@ cglobal hadamard_ac_8x8_%1 %define spill1 [rsp+gprsize+16] %define spill2 [rsp+gprsize+32] %endif -%ifnidn %1, sse2 - ;LOAD_INC loads sumsubs - mova m7, [hmul_8p] -%else +%ifdef X264_HIGH_BIT_DEPTH + %define vertical 1 +%elifidn %1, sse2 + %define vertical 1 ;LOAD_INC only unpacks to words pxor m7, m7 +%else + %define vertical 0 + ;LOAD_INC loads sumsubs + mova m7, [hmul_8p] %endif LOAD_INC_8x4W 0, 1, 2, 3, 7 -%ifidn %1, sse2 +%if vertical HADAMARD4_2D_SSE 0, 1, 2, 3, 4 %else HADAMARD4_V m0, m1, m2, m3, m4 @@ -1864,13 +2239,13 @@ cglobal hadamard_ac_8x8_%1 mova spill0, m1 SWAP 1, 7 LOAD_INC_8x4W 4, 5, 6, 7, 1 -%ifidn %1, sse2 +%if vertical HADAMARD4_2D_SSE 4, 5, 6, 7, 1 %else HADAMARD4_V m4, m5, m6, m7, m1 %endif -%ifnidn %1, sse2 +%if vertical == 0 mova m1, spill0 mova spill0, m6 mova spill1, m7 @@ -1892,23 +2267,24 @@ cglobal hadamard_ac_8x8_%1 ABS_MOV m3, m5 paddw m1, m2 SUMSUB_BA m0, m4; m2 -%ifnidn %1, sse2 - pand m1, [mask_ac4b] -%else +%if vertical pand m1, [mask_ac4] +%else + pand m1, [mask_ac4b] %endif + AC_PREP m1, [pw_1] ABS_MOV m2, spill0 - paddw m1, m3 + AC_PADD m1, m3, [pw_1] ABS_MOV m3, spill1 - paddw m1, m2 + AC_PADD m1, m2, [pw_1] ABS_MOV m2, spill2 - paddw m1, m3 + AC_PADD m1, m3, [pw_1] ABS_MOV m3, m6 - paddw m1, m2 + AC_PADD m1, m2, [pw_1] ABS_MOV m2, m7 - paddw m1, m3 + AC_PADD m1, m3, [pw_1] mova m3, m7 - paddw m1, m2 + AC_PADD m1, m2, [pw_1] mova m2, m6 psubw m7, spill2 paddw m3, spill2 @@ -1918,30 +2294,31 @@ cglobal hadamard_ac_8x8_%1 paddw m2, spill1 psubw m5, spill0 paddw m1, spill0 -%ifnidn %1, sse2 + %assign %%x 2 +%if vertical + %assign %%x 4 +%endif mova spill1, m4 - HADAMARD 2, amax, 3, 7, 4 - HADAMARD 2, amax, 2, 6, 7, 4 - mova m4, spill1 - HADAMARD 2, amax, 1, 5, 6, 7 - HADAMARD 2, sumsub, 0, 4, 5, 6 + HADAMARD %%x, amax, 3, 7, 4 + HADAMARD %%x, amax, 2, 6, 7, 4 + mova m4, spill1 + HADAMARD %%x, amax, 1, 5, 6, 7 + HADAMARD %%x, sumsub, 0, 4, 5, 6 + AC_PREP m2, [pw_1] + AC_PADD m2, m3, [pw_1] + AC_PADD m2, m1, [pw_1] +%ifdef X264_HIGH_BIT_DEPTH + paddd m2, m2 %else - mova spill1, m4 - HADAMARD 4, amax, 3, 7, 4 - HADAMARD 4, amax, 2, 6, 7, 4 - mova m4, spill1 - HADAMARD 4, amax, 1, 5, 6, 7 - HADAMARD 4, sumsub, 0, 4, 5, 6 -%endif - paddw m2, m3 - paddw m2, m1 - paddw m2, m2 + paddw m2, m2 +%endif ; X264_HIGH_BIT_DEPTH ABS1 m4, m7 pand m0, [mask_ac8] ABS1 m0, m7 - paddw m2, m4 - paddw m0, m2 - mova [rsp+gprsize+16], m0 ; save sa8d + AC_PADD m2, m4, [pw_1] + AC_PADD m2, m0, [pw_1] + mova [rsp+gprsize+16], m2 ; save sa8d + SWAP m0, m2 
SAVE_MM_PERMUTATION hadamard_ac_8x8_%1 ret @@ -1951,11 +2328,45 @@ HADAMARD_AC_WXH_SSE2 16, 8, %1 HADAMARD_AC_WXH_SSE2 8, 8, %1 %endmacro ; HADAMARD_AC_SSE2 +%macro HADAMARD_AC_WXH_SUM_SSE2 2 + mova m1, [rsp+2*mmsize] +%ifdef X264_HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddd m0, [rsp+3*mmsize] + paddd m1, [rsp+4*mmsize] +%endif +%if %1*%2 == 256 + paddd m0, [rsp+5*mmsize] + paddd m1, [rsp+6*mmsize] + paddd m0, [rsp+7*mmsize] + paddd m1, [rsp+8*mmsize] + psrld m0, 1 +%endif + HADDD m0, m2 + HADDD m1, m3 +%else ; !X264_HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddusw m0, [rsp+3*mmsize] + paddusw m1, [rsp+4*mmsize] +%endif +%if %1*%2 == 256 + paddusw m0, [rsp+5*mmsize] + paddusw m1, [rsp+6*mmsize] + paddusw m0, [rsp+7*mmsize] + paddusw m1, [rsp+8*mmsize] + psrlw m0, 1 +%endif + HADDUW m0, m2 + HADDW m1, m3 +%endif ; X264_HIGH_BIT_DEPTH +%endmacro + ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 3 cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 + FIX_STRIDES r1 sub rsp, 48+pad lea r2, [r1*3] call hadamard_ac_8x8_%3 @@ -1968,7 +2379,7 @@ cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11 %if %1==16 neg ysub sub rsp, 32 - lea r0, [r0+ysub*4+8] + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8_%3 %if %2==16 @@ -1977,20 +2388,7 @@ cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11 call hadamard_ac_8x8_%3 %endif %endif - mova m1, [rsp+0x20] -%if %1*%2 >= 128 - paddusw m0, [rsp+0x30] - paddusw m1, [rsp+0x40] -%endif -%if %1*%2 == 256 - paddusw m0, [rsp+0x50] - paddusw m1, [rsp+0x60] - paddusw m0, [rsp+0x70] - paddusw m1, [rsp+0x80] - psrlw m0, 1 -%endif - HADDW m0, m2 - HADDW m1, m3 + HADAMARD_AC_WXH_SUM_SSE2 %1, %2 movd edx, m0 movd eax, m1 shr edx, 2 - (%1*%2 >> 8) @@ -2025,7 +2423,9 @@ INIT_XMM SA8D sse2 SATDS_SSE2 sse2 INTRA_SA8D_SSE2 sse2 +%ifndef X264_HIGH_BIT_DEPTH INTRA_SATDS_MMX mmxext +%endif HADAMARD_AC_SSE2 sse2 %define ABS1 ABS1_SSSE3 @@ -2034,9 +2434,11 @@ HADAMARD_AC_SSE2 sse2 %define DIFFOP DIFF_SUMSUB_SSSE3 %define JDUP JDUP_CONROE %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +%ifndef X264_HIGH_BIT_DEPTH %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 +%endif SATDS_SSE2 ssse3 SA8D ssse3 HADAMARD_AC_SSE2 ssse3 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index d1051038..dfedd7a5 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -38,21 +38,24 @@ ret x264_pixel_##name##_4x4_##suffix args;\ #define DECL_X1( name, suffix ) \ - DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) + DECL_PIXELS( int, name, suffix, ( pixel *, int, pixel *, int ) ) #define DECL_X4( name, suffix ) \ - DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\ - DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) ) + DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) ) DECL_X1( sad, mmxext ) DECL_X1( sad, sse2 ) DECL_X4( sad, sse2_misalign ) DECL_X1( sad, sse3 ) DECL_X1( sad, sse2_aligned ) +DECL_X1( sad, ssse3 ) DECL_X4( sad, mmxext ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) +DECL_X4( sad, ssse3 ) DECL_X1( ssd, mmx ) +DECL_X1( ssd, mmxext ) DECL_X1( ssd, sse2slow ) DECL_X1( ssd, sse2 ) DECL_X1( ssd, ssse3 ) @@ -73,49 +76,51 @@ DECL_X4( sad, 
cache64_mmxext ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride )) +DECL_PIXELS( uint64_t, var, mmxext, ( pixel *pix, int i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( pixel *pix, int i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride )) -void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_4x4_mmxext ( pixel *, pixel *, int * ); void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_4x4_mmxext ( pixel *, pixel *, int * ); void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_8x8c_mmxext ( pixel *, pixel *, int * ); void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_8x8c_mmxext ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_satd_x3_16x16_mmxext( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_16x16_mmxext ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_8x8_mmxext ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * ); -void x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1, - uint8_t *pixuv2, int stride2, int width, +void x264_pixel_ssd_nv12_core_mmxext( pixel *pixuv1, int stride1, + pixel *pixuv2, int stride2, int width, int height, uint64_t *ssd_u, 
uint64_t *ssd_v ); -void x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1, - uint8_t *pixuv2, int stride2, int width, +void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1, + pixel *pixuv2, int stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); -int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * ); -int x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * ); +int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * ); +int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * ); int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * ); #define DECL_ADS( size, suffix ) \ diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm new file mode 100644 index 00000000..d19d05b3 --- /dev/null +++ b/common/x86/sad16-a.asm @@ -0,0 +1,432 @@ +;***************************************************************************** +;* sad16-a.asm: x86 high depth sad functions +;***************************************************************************** +;* Copyright (C) 2010 x264 project +;* +;* Authors: Oskar Arvidsson +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION .text + +cextern pw_1 + +;============================================================================= +; SAD MMX +;============================================================================= + +%macro SAD_INC_1x16P_MMX 0 + movu m1, [r0+ 0] + movu m2, [r0+ 8] + movu m3, [r0+16] + movu m4, [r0+24] + psubw m1, [r2+ 0] + psubw m2, [r2+ 8] + psubw m3, [r2+16] + psubw m4, [r2+24] + ABS2 m1, m2, m5, m6 + ABS2 m3, m4, m7, m5 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endmacro + +%macro SAD_INC_2x8P_MMX 0 + movu m1, [r0+0] + movu m2, [r0+8] + movu m3, [r0+2*r1+0] + movu m4, [r0+2*r1+8] + psubw m1, [r2+0] + psubw m2, [r2+8] + psubw m3, [r2+2*r3+0] + psubw m4, [r2+2*r3+8] + ABS2 m1, m2, m5, m6 + ABS2 m3, m4, m7, m5 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endmacro + +%macro SAD_INC_2x4P_MMX 0 + movu m1, [r0] + movu m2, [r0+2*r1] + psubw m1, [r2] + psubw m2, [r2+2*r3] + ABS2 m1, m2, m3, m4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m0, m1 + paddw m0, m2 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int ) +;----------------------------------------------------------------------------- +%macro SAD_MMX 4 +cglobal pixel_sad_%1x%2_%4, 4,4 + pxor m0, m0 +%rep %2/%3 + SAD_INC_%3x%1P_MMX +%endrep +%if %1*%2 == 256 + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX +SAD_MMX 16, 16, 1, mmxext +SAD_MMX 16, 8, 1, mmxext +SAD_MMX 8, 16, 2, mmxext +SAD_MMX 8, 8, 2, mmxext +SAD_MMX 8, 4, 2, mmxext +SAD_MMX 4, 8, 2, mmxext +SAD_MMX 4, 4, 2, mmxext +%define ABS1 ABS1_SSSE3 +%define ABS2 ABS2_SSSE3 +SAD_MMX 4, 8, 2, ssse3 +SAD_MMX 4, 4, 2, ssse3 +%undef ABS1 +%undef ABS2 + +;============================================================================= +; SAD XMM +;============================================================================= + +%macro SAD_INC_2x16P_XMM 0 + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+2*r3+ 0] + movu m4, [r2+2*r3+16] + psubw m1, [r0+ 0] + psubw m2, [r0+16] + psubw m3, [r0+2*r1+ 0] + psubw m4, [r0+2*r1+16] + ABS2 m1, m2, m5, m6 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + ABS2 m3, m4, m7, m5 + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endmacro + +%macro SAD_INC_2x8P_XMM 0 + movu m1, [r2] + movu m2, [r2+2*r3] + psubw m1, [r0] + psubw m2, [r0+2*r1] + ABS2 m1, m2, m3, m4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m0, m1 + paddw m0, m2 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int ) +;----------------------------------------------------------------------------- +%macro SAD_XMM 3 +cglobal pixel_sad_%1x%2_%3, 4,4,8 + pxor m0, m0 +%rep %2/2 + SAD_INC_2x%1P_XMM +%endrep + HADDW m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX +SAD_XMM 16, 16, sse2 +SAD_XMM 16, 8, sse2 +SAD_XMM 8, 16, sse2 +SAD_XMM 8, 8, sse2 +SAD_XMM 8, 4, sse2 +%define movdqu movdqa +SAD_XMM 16, 16, sse2_aligned +SAD_XMM 16, 8, sse2_aligned +SAD_XMM 8, 16, sse2_aligned +SAD_XMM 8, 8, sse2_aligned +%undef movdqu +%define ABS1 ABS1_SSSE3 +%define ABS2 ABS2_SSSE3 +SAD_XMM 16, 16, ssse3 +SAD_XMM 16, 8, ssse3 +SAD_XMM 
8, 16, ssse3 +SAD_XMM 8, 8, ssse3 +SAD_XMM 8, 4, ssse3 +%undef ABS1 +%undef ABS2 + +;============================================================================= +; SAD x3/x4 +;============================================================================= + +%macro SAD_X3_INC_P 0 + add r0, 4*FENC_STRIDE + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endmacro + +%macro SAD_X3_ONE_START 0 + mova m3, [r0] + movu m0, [r1] + movu m1, [r2] + movu m2, [r3] + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + ABS2 m0, m1, m4, m5 + ABS1 m2, m6 +%endmacro + +%macro SAD_X3_ONE 2 + mova m6, [r0+%1] + movu m3, [r1+%2] + movu m4, [r2+%2] + movu m5, [r3+%2] + psubw m3, m6 + psubw m4, m6 + psubw m5, m6 + ABS2 m3, m4, m7, m6 + ABS1 m5, m6 + paddw m0, m3 + paddw m1, m4 + paddw m2, m5 +%endmacro + +%macro SAD_X3_END 2 +%if mmsize == 8 && %1*%2 == 256 + HADDUW m0, m3 + HADDUW m1, m4 + HADDUW m2, m5 +%else + HADDW m0, m3 + HADDW m1, m4 + HADDW m2, m5 +%endif +%ifdef UNIX64 + movd [r5+0], m0 + movd [r5+4], m1 + movd [r5+8], m2 +%else + mov r0, r5mp + movd [r0+0], m0 + movd [r0+4], m1 + movd [r0+8], m2 +%endif + RET +%endmacro + +%macro SAD_X4_INC_P 0 + add r0, 4*FENC_STRIDE + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endmacro + +%macro SAD_X4_ONE_START 0 + mova m4, [r0] + movu m0, [r1] + movu m1, [r2] + movu m2, [r3] + movu m3, [r4] + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 + ABS2 m0, m1, m5, m6 + ABS2 m2, m3, m4, m7 +%endmacro + +%macro SAD_X4_ONE 2 + mova m4, [r0+%1] + movu m5, [r1+%2] + movu m6, [r2+%2] +%if num_mmregs > 8 + movu m7, [r3+%2] + movu m8, [r4+%2] + psubw m5, m4 + psubw m6, m4 + psubw m7, m4 + psubw m8, m4 + ABS2 m5, m6, m9, m10 + ABS2 m7, m8, m9, m10 + paddw m0, m5 + paddw m1, m6 + paddw m2, m7 + paddw m3, m8 +%elifidn ABS1, ABS1_SSSE3 + movu m7, [r3+%2] + psubw m5, m4 + psubw m6, m4 + psubw m7, m4 + movu m4, [r4+%2] + pabsw m5, m5 + psubw m4, [r0+%1] + pabsw m6, m6 + pabsw m7, m7 + pabsw m4, m4 + paddw m0, m5 + paddw m1, m6 + paddw m2, m7 + paddw m3, m4 +%else ; num_mmregs == 8 && !ssse3 + psubw m5, m4 + psubw m6, m4 + ABS1 m5, m7 + ABS1 m6, m7 + paddw m0, m5 + paddw m1, m6 + movu m5, [r3+%2] + movu m6, [r4+%2] + psubw m5, m4 + psubw m6, m4 + ABS2 m5, m6, m7, m4 + paddw m2, m5 + paddw m3, m6 +%endif +%endmacro + +%macro SAD_X4_END 2 +%if mmsize == 8 && %1*%2 == 256 + HADDUW m0, m4 + HADDUW m1, m5 + HADDUW m2, m6 + HADDUW m3, m7 +%else + HADDW m0, m4 + HADDW m1, m5 + HADDW m2, m6 + HADDW m3, m7 +%endif + mov r0, r6mp + movd [r0+ 0], m0 + movd [r0+ 4], m1 + movd [r0+ 8], m2 + movd [r0+12], m3 + RET +%endmacro + +%macro SAD_X_2xNP 4 + %assign x %3 +%rep %4 + SAD_X%1_ONE x*mmsize, x*mmsize + SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize + %assign x x+1 +%endrep +%endmacro + +;----------------------------------------------------------------------------- +; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, +; uint16_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 4 +cglobal pixel_sad_x%1_%2x%3_%4, 6,7,XMM_REGS + %assign regnum %1+1 + %xdefine STRIDE r %+ regnum +%ifdef WIN64 + movsxd STRIDE, STRIDE %+ d +%endif + mov r6, %3/2-1 + SAD_X%1_ONE_START + SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE + SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1 +.loop: + SAD_X%1_INC_P + dec r6 + SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2) + jg .loop +%if %1 == 4 + mov r6, r6m +%endif + SAD_X%1_END %2, %3 +%endmacro + +INIT_MMX +%define XMM_REGS 0 +%define ABS1 
ABS1_MMX +%define ABS2 ABS2_MMX +SAD_X 3, 16, 16, mmxext +SAD_X 3, 16, 8, mmxext +SAD_X 3, 8, 16, mmxext +SAD_X 3, 8, 8, mmxext +SAD_X 3, 8, 4, mmxext +SAD_X 3, 4, 8, mmxext +SAD_X 3, 4, 4, mmxext +SAD_X 4, 16, 16, mmxext +SAD_X 4, 16, 8, mmxext +SAD_X 4, 8, 16, mmxext +SAD_X 4, 8, 8, mmxext +SAD_X 4, 8, 4, mmxext +SAD_X 4, 4, 8, mmxext +SAD_X 4, 4, 4, mmxext +%define ABS1 ABS1_SSSE3 +%define ABS2 ABS2_SSSE3 +SAD_X 3, 4, 8, ssse3 +SAD_X 3, 4, 4, ssse3 +SAD_X 4, 4, 8, ssse3 +SAD_X 4, 4, 4, ssse3 +INIT_XMM +%define XMM_REGS 9 +SAD_X 3, 16, 16, ssse3 +SAD_X 3, 16, 8, ssse3 +SAD_X 3, 8, 16, ssse3 +SAD_X 3, 8, 8, ssse3 +SAD_X 3, 8, 4, ssse3 +SAD_X 4, 16, 16, ssse3 +SAD_X 4, 16, 8, ssse3 +SAD_X 4, 8, 16, ssse3 +SAD_X 4, 8, 8, ssse3 +SAD_X 4, 8, 4, ssse3 +%define XMM_REGS 11 +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX +SAD_X 3, 16, 16, sse2 +SAD_X 3, 16, 8, sse2 +SAD_X 3, 8, 16, sse2 +SAD_X 3, 8, 8, sse2 +SAD_X 3, 8, 4, sse2 +SAD_X 4, 16, 16, sse2 +SAD_X 4, 16, 8, sse2 +SAD_X 4, 8, 16, sse2 +SAD_X 4, 8, 8, sse2 +SAD_X 4, 8, 4, sse2 +%undef ABS1 +%undef ABS2 diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index cf827d23..afa87045 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -605,6 +605,32 @@ pminsw %1, %3 %endmacro +%macro HADDD 2 ; sum junk +%if mmsize == 16 + movhlps %2, %1 + paddd %1, %2 + pshuflw %2, %1, 0xE + paddd %1, %2 +%else + pshufw %2, %1, 0xE + paddd %1, %2 +%endif +%endmacro + +%macro HADDW 2 + pmaddwd %1, [pw_1] + HADDD %1, %2 +%endmacro + +%macro HADDUW 2 + mova %2, %1 + pslld %1, 16 + psrld %2, 16 + psrld %1, 16 + paddd %1, %2 + HADDD %1, %2 +%endmacro + %macro FIX_STRIDES 1-* %ifdef X264_HIGH_BIT_DEPTH %rep %0 -- 2.40.0
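
The x86util.asm hunk above moves the HADDD/HADDW/HADDUW horizontal-sum helpers out of pixel-a.asm so that both pixel-a.asm and the new sad16-a.asm can share them. As a rough scalar sketch of what the two word-sum variants compute -- illustrative C only, not part of the patch, and the function names are invented:

#include <stdint.h>

/* Scalar model of the word horizontal sums now living in x86util.asm.
   "lanes" is the number of 16-bit elements in the vector register
   (4 for an MMX register, 8 for an XMM register). */

/* HADDW: pmaddwd against pw_1 folds adjacent words into dwords as *signed*
   values, then HADDD reduces the dwords to one sum.  Correct only while
   every word accumulator stays at or below 0x7fff. */
static int32_t haddw_model( const int16_t *v, int lanes )
{
    int32_t sum = 0;
    for( int i = 0; i < lanes; i++ )
        sum += v[i];
    return sum;
}

/* HADDUW: each word is first zero-extended to a dword (the pslld/psrld
   pair), so full-range *unsigned* word sums survive.  This is why the
   high-bit-depth SAD/SATD/VAR paths in this patch reduce with HADDUW
   wherever their per-lane word sums can exceed 0x7fff with 10-bit input. */
static uint32_t hadduw_model( const uint16_t *v, int lanes )
{
    uint32_t sum = 0;
    for( int i = 0; i < lanes; i++ )
        sum += v[i];
    return sum;
}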