From: Holger Lubitz Date: Sat, 7 Mar 2009 02:16:30 +0000 (-0800) Subject: Vastly faster SATD/SA8D/Hadamard_AC/SSD/DCT/IDCT X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=54e38917b413e80b474d3ed7ba344e7c489b020c;p=libx264 Vastly faster SATD/SA8D/Hadamard_AC/SSD/DCT/IDCT Heavily optimized for Core 2 and Nehalem, but performance should improve on all modern x86 CPUs. 16x16 SATD: +18% speed on K8(64bit), +22% on K10(32bit), +42% on Penryn(64bit), +44% on Nehalem(64bit), +50% on P4(32bit), +98% on Conroe(64bit) Similar performance boosts in SATD-like functions (SA8D, hadamard_ac) and somewhat less in DCT/IDCT/SSD. Overall performance boost is up to ~15% on 64-bit Conroe. --- diff --git a/common/dct.c b/common/dct.c index f6095409..04301a92 100644 --- a/common/dct.c +++ b/common/dct.c @@ -446,6 +446,11 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) if( cpu&X264_CPU_SSSE3 ) { + dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; + dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; + dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; } diff --git a/common/pixel.c b/common/pixel.c index 7fa9830b..38c39260 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -426,7 +426,7 @@ SATD_X_DECL7() SATD_X_DECL7( _mmxext ) SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) -SATD_X_DECL6( _ssse3_phadd ) +SATD_X_DECL7( _sse4 ) #endif /**************************************************************************** @@ -667,11 +667,28 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; } + if( cpu&X264_CPU_SSE2 ) + { + INIT5( ssd, _sse2slow ); + INIT2_NAME( sad_aligned, sad, _sse2_aligned ); + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; + pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; +#ifdef ARCH_X86_64 + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; +#endif + } + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { INIT2( sad, _sse2 ); INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); + INIT6( satd, _sse2 ); + INIT6( satd_x3, _sse2 ); + INIT6( satd_x4, _sse2 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse2 ); @@ -679,9 +696,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; - - if( cpu&X264_CPU_CACHELINE_64 ) + if( cpu&X264_CPU_CACHELINE_64 ) { + INIT2( ssd, _sse2); /* faster for width 16 on p4 */ #ifdef ARCH_X86 INIT2( sad, _cache64_sse2 ); INIT2( sad_x3, _cache64_sse2 ); @@ -700,31 +717,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x4, _sse2_misalign ); } } - if( cpu&X264_CPU_SSE2 ) - { - INIT5( ssd, _sse2 ); - if( cpu&X264_CPU_SSE2_IS_FAST ) - { - INIT6( satd, _sse2 ); - INIT6( satd_x3, _sse2 ); - INIT6( satd_x4, _sse2 ); - } - else - { - INIT5( satd, _sse2 ); - INIT5( satd_x3, _sse2 ); - INIT5( satd_x4, _sse2 ); - } - INIT2_NAME( sad_aligned, sad, _sse2_aligned ); - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; - pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; - pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; - pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; - 
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; -#ifdef ARCH_X86_64 - pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; -#endif - } if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) ) { @@ -747,6 +739,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_SSSE3 ) { + INIT7( ssd, _ssse3 ); INIT7( satd, _ssse3 ); INIT7( satd_x3, _ssse3 ); INIT7( satd_x4, _ssse3 ); @@ -770,18 +763,23 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x3, _cache64_ssse3 ); INIT2( sad_x4, _cache64_ssse3 ); } - if( cpu&X264_CPU_PHADD_IS_FAST ) + if( !(cpu&X264_CPU_PHADD_IS_FAST) ) { - INIT6( satd, _ssse3_phadd ); - INIT6( satd_x3, _ssse3_phadd ); - INIT6( satd_x4, _ssse3_phadd ); + INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ } } if( cpu&X264_CPU_SSE4 ) { - pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sse4; - pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sse4; + INIT7( satd, _sse4 ); + INIT7( satd_x3, _sse4 ); + INIT7( satd_x4, _sse4 ); + if( !(cpu&X264_CPU_STACK_MOD4) ) + { + INIT4( hadamard_ac, _sse4 ); + } + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; } #endif //HAVE_MMX diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm index 987c9385..173f2234 100644 --- a/common/x86/dct-32.asm +++ b/common/x86/dct-32.asm @@ -3,10 +3,11 @@ ;***************************************************************************** ;* Copyright (C) 2003-2008 x264 project ;* -;* Authors: Laurent Aimar (initial version) -;* Loren Merritt (misc) -;* Min Chen (converted to nasm) -;* Christian Heine (dct8/idct8 functions) +;* Authors: Laurent Aimar +;* Loren Merritt +;* Holger Lubitz +;* Min Chen +;* Christian Heine ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -29,6 +30,7 @@ SECTION_RODATA pw_32: times 8 dw 32 +hsub_mul: times 8 db 1, -1 SECTION .text @@ -340,26 +342,64 @@ global x264_add8x8_idct8_mmx.skip_prologue ADD_STORE_ROW 7, m7, [r1+0x78] ret - - INIT_XMM +%macro DCT_SUB8 1 +cglobal x264_sub8x8_dct_%1, 3,3 + add r2, 4*FDEC_STRIDE +global x264_sub8x8_dct_%1.skip_prologue +.skip_prologue: +%ifnidn %1, sse2 + mova m7, [hsub_mul GLOBAL] +%endif + LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE + SPILL r0, 1,2 + SWAP 2, 7 + LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE + UNSPILL r0, 1 + SPILL r0, 7 + SWAP 2, 7 + UNSPILL r0, 2 + DCT4_1D 0, 1, 2, 3, 7 + TRANSPOSE2x4x4W 0, 1, 2, 3, 7 + UNSPILL r0, 7 + SPILL r0, 2 + DCT4_1D 4, 5, 6, 7, 2 + TRANSPOSE2x4x4W 4, 5, 6, 7, 2 + UNSPILL r0, 2 + SPILL r0, 6 + DCT4_1D 0, 1, 2, 3, 6 + UNSPILL r0, 6 + STORE_DCT 0, 1, 2, 3, r0, 0 + DCT4_1D 4, 5, 6, 7, 3 + STORE_DCT 4, 5, 6, 7, r0, 64 + ret ;----------------------------------------------------------------------------- ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_sse2, 3,3 -global x264_sub8x8_dct8_sse2.skip_prologue +cglobal x264_sub8x8_dct8_%1, 3,3 + add r2, 4*FDEC_STRIDE +global x264_sub8x8_dct8_%1.skip_prologue .skip_prologue: - LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] - LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] - 
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] +%ifidn %1, sse2 + LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE] + LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE] + LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE] + LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE] + LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE] + LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE] SPILL r0, 0 - LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] - LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] + LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE] + LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE] UNSPILL r0, 0 +%else + mova m7, [hsub_mul GLOBAL] + LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE + SPILL r0, 0,1 + SWAP 1, 7 + LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE + UNSPILL r0, 0,1 +%endif DCT8_1D 0,1,2,3,4,5,6,7,r0 UNSPILL r0, 0,4 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1 @@ -367,11 +407,59 @@ global x264_sub8x8_dct8_sse2.skip_prologue DCT8_1D 0,1,2,3,4,5,6,7,r0 SPILL r0, 1,2,3,5,7 ret +%endmacro + +%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2 +%define movdqa movaps +%define punpcklqdq movlhps +DCT_SUB8 sse2 +%undef movdqa +%undef punpcklqdq +%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3 +DCT_SUB8 ssse3 + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_add8x8_idct_sse2, 2,2 + add r0, 4*FDEC_STRIDE +global x264_add8x8_idct_sse2.skip_prologue +.skip_prologue: + UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3 + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7 + SPILL r1, 0 + SBUTTERFLY qdq, 4, 5, 0 + SBUTTERFLY qdq, 6, 7, 0 + UNSPILL r1,0 + IDCT4_1D 0,1,2,3,r1 + SPILL r1, 4 + TRANSPOSE2x4x4W 0,1,2,3,4 + UNSPILL r1, 4 + IDCT4_1D 4,5,6,7,r1 + SPILL r1, 0 + TRANSPOSE2x4x4W 4,5,6,7,0 + UNSPILL r1, 0 + paddw m0, [pw_32 GLOBAL] + IDCT4_1D 0,1,2,3,r1 + paddw m4, [pw_32 GLOBAL] + IDCT4_1D 4,5,6,7,r1 + SPILL r1, 6,7 + pxor m7, m7 + DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5 + DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5 + UNSPILL_SHUFFLE r1, 0,2, 6,7 + DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5 + DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5 + STORE_IDCT m1, m3, m5, m2 + ret ;----------------------------------------------------------------------------- ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- cglobal x264_add8x8_idct8_sse2, 2,2 + add r0, 4*FDEC_STRIDE global x264_add8x8_idct8_sse2.skip_prologue .skip_prologue: UNSPILL r1, 1,2,3,5,6,7 @@ -383,14 +471,10 @@ global x264_add8x8_idct8_sse2.skip_prologue IDCT8_1D 0,1,2,3,4,5,6,7,r1 SPILL r1, 6,7 pxor m7, m7 - STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0] - STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1] - STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2] - STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3] - STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4] - STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5] - UNSPILL_SHUFFLE r1, 0,1, 6,7 - STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6] - STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7] + DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5 + DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], 
[r0-1*FDEC_STRIDE]; m5 + UNSPILL_SHUFFLE r1, 0,2, 6,7 + DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5 + DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5 + STORE_IDCT m1, m3, m5, m2 ret - diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm index 2aecf3c0..be43c1b3 100644 --- a/common/x86/dct-64.asm +++ b/common/x86/dct-64.asm @@ -3,9 +3,10 @@ ;***************************************************************************** ;* Copyright (C) 2003-2008 x264 project ;* -;* Authors: Laurent Aimar (initial version) -;* Loren Merritt (dct8, misc) -;* Min Chen (converted to nasm) +;* Authors: Laurent Aimar +;* Loren Merritt +;* Holger Lubitz +;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -27,18 +28,19 @@ SECTION_RODATA pw_32: times 8 dw 32 +hsub_mul: times 8 db 1, -1 SECTION .text INIT_XMM %macro DCT8_1D 10 - SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07 - SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16 - SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25 SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34 + SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25 + SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16 + SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07 - SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2 - SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3 + SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3 + SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2 movdqa m%9, m%1 psraw m%9, 1 @@ -60,14 +62,14 @@ INIT_XMM psubw m%1, m%3 ; %1=a5 psubw m%4, m%2 ; %4=a6 - SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4 - movdqa m%2, m%10 psraw m%2, 2 paddw m%2, m%9 ; %2=b1 psraw m%9, 2 psubw m%9, m%10 ; %9=b7 + SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4 + movdqa m%3, m%7 psraw m%3, 1 paddw m%3, m%8 ; %3=b2 @@ -83,41 +85,8 @@ INIT_XMM SWAP %1, %6, %4, %7, %8, %9 %endmacro -;----------------------------------------------------------------------------- -; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_sse2, 3,3,10 - LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] - LOAD_DIFF m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] - LOAD_DIFF m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] - LOAD_DIFF m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] - LOAD_DIFF m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] - - DCT8_1D 0,1,2,3,4,5,6,7,8,9 - TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 - DCT8_1D 0,1,2,3,4,5,6,7,8,9 - - movdqa [r0+0x00], m0 - movdqa [r0+0x10], m1 - movdqa [r0+0x20], m2 - movdqa [r0+0x30], m3 - movdqa [r0+0x40], m4 - movdqa [r0+0x50], m5 - movdqa [r0+0x60], m6 - movdqa [r0+0x70], m7 - RET - - %macro IDCT8_1D 10 - SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2 - movdqa m%10, m%3 - psraw m%3, 1 - psubw m%3, m%7 ; %3=a4 - psraw m%7, 1 - paddw m%7, m%10 ; %7=a6 + SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2 movdqa m%9, m%2 psraw m%9, 1 @@ -125,6 +94,12 @@ cglobal x264_sub8x8_dct8_sse2, 3,3,10 paddw m%9, m%4 paddw m%9, m%6 ; %9=a7 + movdqa m%10, m%3 + psraw m%3, 1 + psubw m%3, m%7 ; %3=a4 + psraw m%7, 1 + paddw m%7, m%10 ; %7=a6 + movdqa m%10, m%6 psraw m%10, 1 paddw m%10, m%6 @@ -140,34 +115,108 @@ cglobal x264_sub8x8_dct8_sse2, 3,3,10 psubw m%2, m%4 ; %2=a3 psubw m%6, m%8 ; %6=a1 - SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6 - SUMSUB_BA m%3, m%1 ; %3=b2, %1=b4 - movdqa m%4, m%9 psraw m%4, 2 paddw m%4, m%6 ; 
%4=b1 psraw m%6, 2 psubw m%9, m%6 ; %9=b7 + SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6 + SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4 + movdqa m%8, m%10 psraw m%8, 2 paddw m%8, m%2 ; %8=b3 psraw m%2, 2 psubw m%2, m%10 ; %2=b5 - SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7 - SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6 - SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5 - SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4 + SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7 + SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6 + SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5 + SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4 SWAP %1, %9, %6 SWAP %3, %8, %7 %endmacro +%macro DCT_SUB8 1 +cglobal x264_sub8x8_dct_%1, 3,3,11 + add r2, 4*FDEC_STRIDE +%ifnidn %1, sse2 + mova m7, [hsub_mul GLOBAL] +%endif +%ifdef WIN64 + call .skip_prologue + RET +%endif +global x264_sub8x8_dct_%1.skip_prologue +.skip_prologue: + SWAP 7, 9 + LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE + LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE + DCT4_1D 0, 1, 2, 3, 8 + TRANSPOSE2x4x4W 0, 1, 2, 3, 8 + DCT4_1D 4, 5, 6, 7, 8 + TRANSPOSE2x4x4W 4, 5, 6, 7, 8 + DCT4_1D 0, 1, 2, 3, 8 + STORE_DCT 0, 1, 2, 3, r0, 0 + DCT4_1D 4, 5, 6, 7, 8 + STORE_DCT 4, 5, 6, 7, r0, 64 + ret + +;----------------------------------------------------------------------------- +; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +cglobal x264_sub8x8_dct8_%1, 3,3,11 + add r2, 4*FDEC_STRIDE +%ifnidn %1, sse2 + mova m7, [hsub_mul GLOBAL] +%endif +%ifdef WIN64 + call .skip_prologue + RET +%endif +global x264_sub8x8_dct8_%1.skip_prologue +.skip_prologue: + SWAP 7, 10 + LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE + LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE + DCT8_1D 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D 0,1,2,3,4,5,6,7,8,9 + movdqa [r0+0x00], m0 + movdqa [r0+0x10], m1 + movdqa [r0+0x20], m2 + movdqa [r0+0x30], m3 + movdqa [r0+0x40], m4 + movdqa [r0+0x50], m5 + movdqa [r0+0x60], m6 + movdqa [r0+0x70], m7 + ret +%endmacro + +%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2 +%define movdqa movaps +%define punpcklqdq movlhps +DCT_SUB8 sse2 +%undef movdqa +%undef punpcklqdq +%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3 +DCT_SUB8 ssse3 + ;----------------------------------------------------------------------------- ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2, 2,2,10 +cglobal x264_add8x8_idct8_sse2, 2,2,11 + add r0, 4*FDEC_STRIDE + pxor m7, m7 +%ifdef WIN64 + call .skip_prologue + RET +%endif +global x264_add8x8_idct8_sse2.skip_prologue +.skip_prologue: + SWAP 7, 9 movdqa m0, [r1+0x00] movdqa m1, [r1+0x10] movdqa m2, [r1+0x20] @@ -176,21 +225,53 @@ cglobal x264_add8x8_idct8_sse2, 2,2,10 movdqa m5, [r1+0x50] movdqa m6, [r1+0x60] movdqa m7, [r1+0x70] - - IDCT8_1D 0,1,2,3,4,5,6,7,8,9 + IDCT8_1D 0,1,2,3,4,5,6,7,8,10 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end - IDCT8_1D 0,1,2,3,4,5,6,7,8,9 - - pxor m9, m9 - STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE] - STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE] - STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE] - STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE] - STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE] - STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE] - STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE] - STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE] - RET - + IDCT8_1D 0,1,2,3,4,5,6,7,8,10 + DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] 
+ DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] + DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] + DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] + STORE_IDCT m1, m3, m5, m7 + ret +;----------------------------------------------------------------------------- +; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_add8x8_idct_sse2, 2,2,11 + add r0, 4*FDEC_STRIDE + pxor m7, m7 +%ifdef WIN64 + call .skip_prologue + RET +%endif +global x264_add8x8_idct_sse2.skip_prologue +.skip_prologue: + SWAP 7, 9 + mova m0, [r1+ 0] + mova m2, [r1+16] + mova m1, [r1+32] + mova m3, [r1+48] + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + mova m4, [r1+64] + mova m6, [r1+80] + mova m5, [r1+96] + mova m7, [r1+112] + SBUTTERFLY qdq, 4, 5, 8 + SBUTTERFLY qdq, 6, 7, 8 + IDCT4_1D 0,1,2,3,8,10 + TRANSPOSE2x4x4W 0,1,2,3,8 + IDCT4_1D 4,5,6,7,8,10 + TRANSPOSE2x4x4W 4,5,6,7,8 + paddw m0, [pw_32 GLOBAL] + IDCT4_1D 0,1,2,3,8,10 + paddw m4, [pw_32 GLOBAL] + IDCT4_1D 4,5,6,7,8,10 + DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] + DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] + DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] + DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] + STORE_IDCT m1, m3, m5, m7 + ret diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 1cf702a2..b1e9d374 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -3,9 +3,9 @@ ;***************************************************************************** ;* Copyright (C) 2003-2008 x264 project ;* -;* Authors: Laurent Aimar +;* Authors: Holger Lubitz +;* Laurent Aimar ;* Loren Merritt -;* Holger Lubitz ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify @@ -29,6 +29,7 @@ SECTION_RODATA pw_32: times 8 dw 32 pw_8000: times 8 dw 0x8000 +hsub_mul: times 8 db 1, -1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 @@ -38,38 +39,39 @@ pb_1: times 8 db 1 SECTION .text -%macro HADAMARD4_1D 4 - SUMSUB_BADC m%2, m%1, m%4, m%3 - SUMSUB_BADC m%4, m%2, m%3, m%1 +%macro WALSH4_1D 5 + SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 + SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 SWAP %1, %4, %3 %endmacro %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000 movq m%3, m%4 - paddw m%1, m%4 + pxor m%1, m%4 psubw m%3, m%2 - paddw m%2, m%4 + pxor m%2, m%4 pavgw m%3, m%1 pavgw m%2, m%1 - psubw m%3, m%4 - psubw m%2, m%4 + pxor m%3, m%4 + pxor m%2, m%4 SWAP %1, %2, %3 %endmacro +INIT_MMX ;----------------------------------------------------------------------------- ; void x264_dct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- cglobal x264_dct4x4dc_mmx, 1,1 - movq m0, [r0+ 0] - movq m1, [r0+ 8] - movq m2, [r0+16] movq m3, [r0+24] + movq m2, [r0+16] + movq m1, [r0+ 8] + movq m0, [r0+ 0] movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works - HADAMARD4_1D 0,1,2,3 + WALSH4_1D 0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 - SUMSUB_BADC m1, m0, m3, m2 - SWAP 0,1 - SWAP 2,3 + SUMSUB_BADC m1, m0, m3, m2, m4 + SWAP 0, 1 + SWAP 2, 3 SUMSUB_17BIT 0,2,4,7 SUMSUB_17BIT 1,3,5,7 movq [r0+0], m0 @@ -82,123 +84,78 @@ cglobal x264_dct4x4dc_mmx, 1,1 ; void x264_idct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- cglobal 
x264_idct4x4dc_mmx, 1,1 - movq m0, [r0+ 0] - movq m1, [r0+ 8] - movq m2, [r0+16] - movq m3, [r0+24] - HADAMARD4_1D 0,1,2,3 + movq m3, [r0+24] + movq m2, [r0+16] + movq m1, [r0+ 8] + movq m0, [r0+ 0] + WALSH4_1D 0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 - HADAMARD4_1D 0,1,2,3 + WALSH4_1D 0,1,2,3,4 movq [r0+ 0], m0 movq [r0+ 8], m1 movq [r0+16], m2 movq [r0+24], m3 RET -%macro DCT4_1D 5 - SUMSUB_BADC m%4, m%1, m%3, m%2 - SUMSUB_BA m%3, m%4 - SUMSUB2_AB m%1, m%2, m%5 - SWAP %1, %3, %4, %5, %2 -%endmacro - -%macro IDCT4_1D 6 - SUMSUB_BA m%3, m%1 - SUMSUBD2_AB m%2, m%4, m%6, m%5 - SUMSUB_BADC m%2, m%3, m%5, m%1 - SWAP %1, %2, %5, %4, %3 -%endmacro - +%macro SUB_DCT4 1 ;----------------------------------------------------------------------------- ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub4x4_dct_mmx, 3,3 +cglobal x264_sub4x4_dct_%1, 3,3 +%ifidn %1, mmx .skip_prologue: -%macro SUB_DCT4 1 - LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] + LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] + LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] + LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] + LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] +%else + mova m5, [hsub_mul GLOBAL] + LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2 +%endif DCT4_1D 0,1,2,3,4 - TRANSPOSE%1 0,1,2,3,4 + TRANSPOSE4x4W 0,1,2,3,4 DCT4_1D 0,1,2,3,4 movq [r0+ 0], m0 movq [r0+ 8], m1 movq [r0+16], m2 movq [r0+24], m3 -%endmacro - SUB_DCT4 4x4W RET +%endmacro + +SUB_DCT4 mmx +SUB_DCT4 ssse3 ;----------------------------------------------------------------------------- ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] ) ;----------------------------------------------------------------------------- cglobal x264_add4x4_idct_mmx, 2,2 + pxor m7, m7 .skip_prologue: - movq m0, [r1+ 0] movq m1, [r1+ 8] - movq m2, [r1+16] movq m3, [r1+24] -%macro ADD_IDCT4 1 + movq m2, [r1+16] + movq m0, [r1+ 0] IDCT4_1D 0,1,2,3,4,5 - TRANSPOSE%1 0,1,2,3,4 + TRANSPOSE4x4W 0,1,2,3,4 paddw m0, [pw_32 GLOBAL] IDCT4_1D 0,1,2,3,4,5 - pxor m7, m7 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE] STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE] STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE] STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE] -%endmacro - ADD_IDCT4 4x4W - RET - -INIT_XMM - -cglobal x264_sub8x8_dct_sse2, 3,3,8 -.skip_prologue: - call .8x4 - add r0, 64 - add r1, 4*FENC_STRIDE - add r2, 4*FDEC_STRIDE -%ifdef WIN64 - call .8x4 - RET -%endif -.8x4: - SUB_DCT4 2x4x4W - movhps [r0+32], m0 - movhps [r0+40], m1 - movhps [r0+48], m2 - movhps [r0+56], m3 - ret - -cglobal x264_add8x8_idct_sse2, 2,2,8 -.skip_prologue: - call .8x4 - add r1, 64 - add r0, 4*FDEC_STRIDE -%ifdef WIN64 - call .8x4 RET -%endif -.8x4: - movq m0, [r1+ 0] - movq m1, [r1+ 8] - movq m2, [r1+16] - movq m3, [r1+24] - movhps m0, [r1+32] - movhps m1, [r1+40] - movhps m2, [r1+48] - movhps m3, [r1+56] - ADD_IDCT4 2x4x4W - ret ;----------------------------------------------------------------------------- ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro SUB_NxN_DCT 6 -cglobal %1, 3,3 +cglobal %1, 3,3,11 +%if mmsize == 8 + pxor m7, m7 
+%else + add r2, 4*FDEC_STRIDE + mova m7, [hsub_mul GLOBAL] +%endif .skip_prologue: %ifdef WIN64 sub rsp, 8 @@ -217,15 +174,22 @@ cglobal %1, 3,3 add r2, %4-%5-%6*FDEC_STRIDE %ifdef WIN64 add rsp, 8 -%endif + call %2 + RET +%else jmp %2 +%endif %endmacro ;----------------------------------------------------------------------------- ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- -%macro ADD_NxN_IDCT 6 -cglobal %1, 2,2 +%macro ADD_NxN_IDCT 6-7 +cglobal %1, 2,2,11 + pxor m7, m7 +%if mmsize==16 + add r0, 4*FDEC_STRIDE +%endif .skip_prologue: %ifdef WIN64 sub rsp, 8 @@ -241,8 +205,11 @@ cglobal %1, 2,2 add r1, %3 %ifdef WIN64 add rsp, 8 -%endif + call %2 + RET +%else jmp %2 +%endif %endmacro %ifndef ARCH_X86_64 @@ -255,19 +222,25 @@ cextern x264_sub8x8_dct8_mmx.skip_prologue cextern x264_add8x8_idct8_mmx.skip_prologue SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0 -%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue -%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue -%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue -%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue %endif -SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4 -ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4 +INIT_XMM + +cextern x264_sub8x8_dct_sse2.skip_prologue +cextern x264_sub8x8_dct_ssse3.skip_prologue +SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0 +SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0 +cextern x264_add8x8_idct_sse2.skip_prologue +ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0 + +cextern x264_sub8x8_dct8_sse2.skip_prologue +cextern x264_add8x8_idct8_sse2.skip_prologue +SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0 +ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0 + +cextern x264_sub8x8_dct8_ssse3.skip_prologue +SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0 -cextern x264_sub8x8_dct8_sse2 -cextern x264_add8x8_idct8_sse2 -SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0 -ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0 ;----------------------------------------------------------------------------- ; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 ) diff --git a/common/x86/dct.h b/common/x86/dct.h index 7617ea58..5b83d342 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -29,6 +29,10 @@ void x264_sub8x8_dct_mmx ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *p void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); + void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] ); void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); @@ 
-48,6 +52,9 @@ void x264_sub8x8_dct8_mmx ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct8_ssse3 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct8_ssse3( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 ); + void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] ); void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] ); diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm index f4a01b0b..c0cf0ece 100644 --- a/common/x86/pixel-32.asm +++ b/common/x86/pixel-32.asm @@ -71,7 +71,7 @@ cglobal x264_pixel_sa8d_8x8_internal_mmxext %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 LOAD_DIFF_4x8P 0 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movq [spill], m1 TRANSPOSE4x4W 4, 5, 6, 7, 1 @@ -89,7 +89,7 @@ cglobal x264_pixel_sa8d_8x8_internal_mmxext mov r0, [args+4] mov r2, [args] LOAD_DIFF_4x8P 4 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 @@ -104,7 +104,7 @@ cglobal x264_pixel_sa8d_8x8_internal_mmxext movq m2, [trans+0x10] movq m3, [trans+0x18] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 SUM4x8_MM movq [trans], m0 @@ -117,7 +117,7 @@ cglobal x264_pixel_sa8d_8x8_internal_mmxext movq m6, [trans+0x50] movq m7, [trans+0x58] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 SUM4x8_MM pavgw m0, [trans] @@ -180,7 +180,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext %define trans esp+0 ; +96 %define sum esp+0 ; +32 LOAD_4x8P 0 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movq [spill], m0 TRANSPOSE4x4W 4, 5, 6, 7, 0 @@ -196,7 +196,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext movq [trans+0x38], m3 LOAD_4x8P 4 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 @@ -211,7 +211,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext movq m2, [trans+0x10] movq m3, [trans+0x18] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movq [spill+0], m0 movq [spill+8], m1 @@ -246,7 +246,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext movq m6, [trans+0x50] movq m7, [trans+0x58] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7 movd [sum+0x10], m0 movd [sum+0x12], m1 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 8e21fe4a..67ff3b07 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -4,6 +4,7 @@ ;* Copyright (C) 2003-2008 x264 project ;* ;* Authors: Loren Merritt +;* Holger Lubitz ;* Laurent Aimar ;* Alex Izvorski ;* Fiona Glaser @@ -27,13 +28,21 @@ %include "x86util.asm" SECTION_RODATA -pw_1: times 8 dw 1 -ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 -ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 -mask_ff: times 16 db 0xff - times 16 db 0 -mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1 -mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1 +pw_1: times 8 dw 1 +pw_00ff: times 8 dw 0xff +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +mask_ff: times 16 db 0xff + times 16 db 0 +mask_ac4: dw 0, -1, -1, -1, 0, 
-1, -1, -1 +mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 +hsub_mul: times 8 db 1, -1 +hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 +mask_10: times 4 dw 0, -1 +mask_1100: times 2 dd 0, -1 SECTION .text @@ -44,8 +53,7 @@ SECTION .text pshuflw %2, %1, 0xE paddd %1, %2 %else - mova %2, %1 - psrlq %2, 32 + pshufw %2, %1, 0xE paddd %1, %2 %endif %endmacro @@ -68,122 +76,171 @@ SECTION .text ; SSD ;============================================================================= -%macro SSD_FULL 6 +%macro SSD_LOAD_FULL 5 mova m1, [r0+%1] mova m2, [r2+%2] mova m3, [r0+%3] mova m4, [r2+%4] - - mova m5, m2 - mova m6, m4 - psubusb m2, m1 - psubusb m4, m3 - psubusb m1, m5 - psubusb m3, m6 - por m1, m2 - por m3, m4 - - mova m2, m1 - mova m4, m3 - punpcklbw m1, m7 - punpcklbw m3, m7 - punpckhbw m2, m7 - punpckhbw m4, m7 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - pmaddwd m4, m4 - -%if %6 +%if %5 lea r0, [r0+2*r1] lea r2, [r2+2*r3] %endif - paddd m1, m2 - paddd m3, m4 +%endmacro + +%macro LOAD 5 + movh m%1, %3 + movh m%2, %4 %if %5 - paddd m0, m1 -%else - SWAP m0, m1 + lea r0, [r0+2*r1] %endif - paddd m0, m3 %endmacro -%macro SSD_HALF 6 - movh m1, [r0+%1] - movh m2, [r2+%2] - movh m3, [r0+%3] - movh m4, [r2+%4] +%macro JOIN 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea r2, [r2+2*r3] +%endif + punpcklbw m%1, m7 + punpcklbw m%3, m7 + psubw m%1, m%3 + punpcklbw m%2, m7 + punpcklbw m%4, m7 + psubw m%2, m%4 +%endmacro - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - psubw m1, m2 - psubw m3, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 +%macro JOIN_SSE2 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea r2, [r2+2*r3] +%endif + punpcklqdq m%1, m%2 + punpcklqdq m%3, m%4 + DEINTB %2, %1, %4, %3, 7 + psubw m%2, m%4 + psubw m%1, m%3 +%endmacro -%if %6 - lea r0, [r0+2*r1] +%macro JOIN_SSSE3 7 + movh m%3, %5 + movh m%4, %6 +%if %7 lea r2, [r2+2*r3] %endif -%if %5 - paddd m0, m1 -%else - SWAP m0, m1 + punpcklbw m%1, m%3 + punpcklbw m%2, m%4 +%endmacro + +%macro SSD_LOAD_HALF 5 + LOAD 1, 2, [r0+%1], [r0+%3], 1 + JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1 + LOAD 3, 4, [r0+%1], [r0+%3], %5 + JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5 +%endmacro + +%macro SSD_CORE 7-8 +%ifidn %8, FULL + mova m%6, m%2 + mova m%7, m%4 + psubusb m%2, m%1 + psubusb m%4, m%3 + psubusb m%1, m%6 + psubusb m%3, m%7 + por m%1, m%2 + por m%3, m%4 + mova m%2, m%1 + mova m%4, m%3 + punpckhbw m%1, m%5 + punpckhbw m%3, m%5 + punpcklbw m%2, m%5 + punpcklbw m%4, m%5 %endif - paddd m0, m3 + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 %endmacro -%macro SSD_QUARTER 6 - movd m1, [r0+%1] - movd m2, [r2+%2] - movd m3, [r0+%3] - movd m4, [r2+%4] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - pinsrd m1, [r0+%1], 1 - pinsrd m2, [r2+%2], 1 - pinsrd m3, [r0+%3], 1 - pinsrd m4, [r2+%4], 1 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - psubw m1, m2 - psubw m3, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 +%macro SSD_CORE_SSE2 7-8 +%ifidn %8, FULL + DEINTB %6, %1, %7, %2, %5 + psubw m%6, m%7 + psubw m%1, m%2 + SWAP %2, %6 + DEINTB %6, %3, %7, %4, %5 + psubw m%6, m%7 + psubw m%3, m%4 + SWAP %4, %6 +%endif + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro -%if %6 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] +%macro SSD_CORE_SSSE3 7-8 +%ifidn %8, FULL + mova m%6, m%1 + mova m%7, m%3 + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + punpckhbw m%6, m%2 + punpckhbw m%7, m%4 + SWAP %6, %2 + SWAP %7, %4 %endif 
-%if %5 + pmaddubsw m%1, m%5 + pmaddubsw m%2, m%5 + pmaddubsw m%3, m%5 + pmaddubsw m%4, m%5 + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_END 1 + paddd m1, m2 + paddd m3, m4 +%if %1 paddd m0, m1 %else - SWAP m0, m1 + SWAP 0, 1 %endif paddd m0, m3 %endmacro +%macro SSD_ITER 7 + SSD_LOAD_%1 %2,%3,%4,%5,%7 + SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 + SSD_END %6 +%endmacro + ;----------------------------------------------------------------------------- ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- %macro SSD 3-4 0 cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4 -%if %1 >= mmsize +%ifidn %3, ssse3 + mova m7, [hsub_mul GLOBAL] +%elifidn %3, sse2 + mova m7, [pw_00ff GLOBAL] +%elif %1 >= mmsize pxor m7, m7 %endif %assign i 0 -%rep %2/2 +%rep %2/4 %if %1 > mmsize - SSD_FULL 0, 0, mmsize, mmsize, i, 0 - SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1 + SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0 + SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1 + SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0 + SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1 %elif %1 == mmsize - SSD_FULL 0, 0, r1, r3, i, i<%2/2-1 + SSD_ITER FULL, 0, 0, r1, r3, i, 1 + SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1 %else - SSD_HALF 0, 0, r1, r3, i, i<%2/2-1 + SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1 %endif %assign i i+1 %endrep @@ -201,25 +258,28 @@ SSD 8, 4, mmx SSD 4, 8, mmx SSD 4, 4, mmx INIT_XMM +SSD 16, 16, sse2slow, 8 +SSD 16, 8, sse2slow, 8 +SSD 8, 16, sse2slow, 8 +SSD 8, 8, sse2slow, 8 +SSD 8, 4, sse2slow, 8 +%define SSD_CORE SSD_CORE_SSE2 +%define JOIN JOIN_SSE2 SSD 16, 16, sse2, 8 SSD 16, 8, sse2, 8 -SSD 8, 16, sse2, 5 -SSD 8, 8, sse2, 5 -SSD 8, 4, sse2, 5 - -cglobal x264_pixel_ssd_4x8_sse4, 4,4 - SSD_QUARTER 0, 0, r1, r3, 0, 1 - SSD_QUARTER 0, 0, r1, r3, 1, 0 - HADDD m0, m1 - movd eax, m0 - RET - -cglobal x264_pixel_ssd_4x4_sse4, 4,4 - SSD_QUARTER 0, 0, r1, r3, 0, 0 - HADDD m0, m1 - movd eax, m0 - RET - +SSD 8, 16, sse2, 8 +SSD 8, 8, sse2, 8 +SSD 8, 4, sse2, 8 +%define SSD_CORE SSD_CORE_SSSE3 +%define JOIN JOIN_SSSE3 +SSD 16, 16, ssse3, 8 +SSD 16, 8, ssse3, 8 +SSD 8, 16, ssse3, 8 +SSD 8, 8, ssse3, 8 +SSD 8, 4, ssse3, 8 +INIT_MMX +SSD 4, 8, ssse3 +SSD 4, 4, ssse3 ;============================================================================= ; variance @@ -324,42 +384,165 @@ cglobal x264_pixel_var_8x8_sse2, 2,3,8 ; SATD ;============================================================================= -; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower: -; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1. -; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging, -; whereas phaddw-based transform doesn't care what order the coefs end up in. +%macro TRANS_SSE2 5-6 +; TRANSPOSE2x2 +; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq +; %2: ord/unord (for compat with sse4, unused) +; %3/%4: source regs +; %5/%6: tmp regs +%ifidn %1, d +%define mask [mask_10 GLOBAL] +%define shift 16 +%elifidn %1, q +%define mask [mask_1100 GLOBAL] +%define shift 32 +%endif +%if %0==6 ; less dependency if we have two tmp + mova m%5, mask ; ff00 + mova m%6, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pand m%6, m%5 ; x5.. + pandn m%5, m%3 ; ..x0 + psrl%1 m%3, shift ; ..x1 + por m%4, m%5 ; x4x0 + por m%3, m%6 ; x5x1 +%else ; more dependency, one insn less. sometimes faster, sometimes not + mova m%5, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. 
+ pxor m%4, m%3 ; (x4^x1)x0 + pand m%4, mask ; (x4^x1).. + pxor m%3, m%4 ; x4x0 + psrl%1 m%4, shift ; ..(x1^x4) + pxor m%5, m%4 ; x5x1 + SWAP %4, %3, %5 +%endif +%endmacro + +%define TRANS TRANS_SSE2 -%macro PHSUMSUB 3 - movdqa m%3, m%1 - phaddw m%1, m%2 - phsubw m%3, m%2 - SWAP %2, %3 +%macro TRANS_SSE4 5-6 ; see above +%ifidn %1, d +%define mask 10101010b +%define shift 16 +%elifidn %1, q +%define mask 11001100b +%define shift 32 +%endif + mova m%5, m%3 +%ifidn %2, ord + psrl%1 m%3, shift +%endif + pblendw m%3, m%4, mask + psll%1 m%4, shift +%ifidn %2, ord + pblendw m%4, m%5, 255^mask +%else + psrl%1 m%5, shift + por m%4, m%5 +%endif %endmacro -%macro HADAMARD4_ROW_PHADD 5 - PHSUMSUB %1, %2, %5 - PHSUMSUB %3, %4, %5 - PHSUMSUB %1, %3, %5 - PHSUMSUB %2, %4, %5 - SWAP %3, %4 +%macro JDUP_SSE2 2 + punpckldq %1, %2 + ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d %endmacro -%macro HADAMARD4_1D 4 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %1, %3, %2, %4 +%macro JDUP_CONROE 2 + ; join 2x 32 bit and duplicate them + ; emulating shufps is faster on conroe + punpcklqdq %1, %2 + movsldup %1, %1 %endmacro -%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block) - %xdefine %%n n%1 - HADAMARD4_1D m4, m5, m6, m7 - TRANSPOSE4x4W 4, 5, 6, 7, %%n - HADAMARD4_1D m4, m5, m6, m7 - ABS2 m4, m5, m3, m %+ %%n - ABS2 m6, m7, m3, m %+ %%n - paddw m6, m4 - paddw m7, m5 - pavgw m6, m7 - SWAP %%n, 6 +%macro JDUP_PENRYN 2 + ; just use shufps on anything post conroe + shufps %1, %2, 0 +%endmacro + +%macro HSUMSUB 5 + pmaddubsw m%2, m%5 + pmaddubsw m%1, m%5 + pmaddubsw m%4, m%5 + pmaddubsw m%3, m%5 +%endmacro + +%macro DIFF_UNPACK_SSE2 5 + punpcklbw m%1, m%5 + punpcklbw m%2, m%5 + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro DIFF_SUMSUB_SSSE3 5 + HSUMSUB %1, %2, %3, %4, %5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer + movd %1, %3 + movd %2, %4 + JDUP %1, %2 +%endmacro + +%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer + movddup m%3, %6 + movddup m%4, %8 + movddup m%1, %5 + movddup m%2, %7 +%endmacro + +%macro LOAD_DUP_4x8P_PENRYN 8 + ; penryn and nehalem run punpcklqdq and movddup in different units + movh m%3, %6 + movh m%4, %8 + punpcklqdq m%3, m%3 + movddup m%1, %5 + punpcklqdq m%4, m%4 + movddup m%2, %7 +%endmacro + +%macro LOAD_SUMSUB_8x2P 9 + LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
+ LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr + movddup m%1, [%7] + movddup m%2, [%7+8] + mova m%4, [%6] + movddup m%3, m%4 + punpckhqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr + movu m%4, [%7] + mova m%2, [%6] + DEINTB %1, %2, %3, %4, %5 + psubw m%1, m%3 + psubw m%2, m%4 + SUMSUB_BA m%1, m%2, m%3 +%endmacro + +%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none +; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] + LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 + LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 + LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 + LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro ; in: r4=3*stride1, r5=3*stride2 @@ -368,6 +551,7 @@ cglobal x264_pixel_var_8x8_sse2, 2,3,8 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 + %xdefine %%n n%1 LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2] LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2] LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2] @@ -376,22 +560,31 @@ cglobal x264_pixel_var_8x8_sse2, 2,3,8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif - HADAMARD4x4_SUM %1 + HADAMARD4_2D 4, 5, 6, 7, 3, %%n + paddw m4, m6 + SWAP %%n, 4 %endmacro -%macro SATD_8x4_SSE2 1 - HADAMARD4_1D m0, m1, m2, m3 -%ifidn %1, ssse3_phadd - HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4 +%macro SATD_8x4_SSE 8-9 +%ifidn %1, sse2 + HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax +%else + HADAMARD4_V m%2, m%3, m%4, m%5, m%6 + ; doing the abs first is a slight advantage + ABS4 m%2, m%4, m%3, m%5, m%6, m%7 + HADAMARD 1, max, %2, %4, %6, %7 +%endif +%ifnidn %9, swap + paddw m%8, m%2 %else - TRANSPOSE2x4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 -%endif - ABS4 m0, m1, m2, m3, m4, m5 - paddusw m0, m1 - paddusw m2, m3 - paddusw m6, m0 - paddusw m6, m2 + SWAP %8, %2 +%endif +%ifidn %1, sse2 + paddw m%8, m%4 +%else + HADAMARD 1, max, %3, %5, %6, %7 + paddw m%8, m%3 +%endif %endmacro %macro SATD_START_MMX 0 @@ -489,26 +682,23 @@ cglobal x264_pixel_satd_4x8_mmxext, 4,6 paddw m0, m1 SATD_END_MMX -%macro SATD_W4 1 -INIT_MMX -cglobal x264_pixel_satd_4x4_%1, 4,6 +cglobal x264_pixel_satd_4x4_mmxext, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 0 SATD_END_MMX -%endmacro - -SATD_W4 mmxext -%macro SATD_START_SSE2 0 - pxor m6, m6 +%macro SATD_START_SSE2 3 +%ifnidn %1, sse2 + mova %3, [hmul_8p GLOBAL] +%endif lea r4, [3*r1] lea r5, [3*r3] + pxor %2, %2 %endmacro -%macro SATD_END_SSE2 0 - psrlw m6, 1 - HADDW m6, m7 - movd eax, m6 +%macro SATD_END_SSE2 2 + HADDW %2, m7 + movd eax, %2 RET %endmacro @@ -536,81 +726,136 @@ SATD_W4 mmxext ;----------------------------------------------------------------------------- %macro SATDS_SSE2 1 INIT_XMM +%ifnidn %1, sse2 +cglobal x264_pixel_satd_4x4_%1, 4, 6, 6 + SATD_START_MMX + mova m4, [hmul_4p GLOBAL] + LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] + LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] + LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] + LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + HADAMARD 0, sumsub, 0, 1, 2, 3 + HADAMARD 4, sumsub, 0, 1, 2, 3 + HADAMARD 1, amax, 0, 1, 2, 3 + HADDW m0, m1 + movd eax, m0 + RET +%endif + +cglobal x264_pixel_satd_4x8_%1, 4, 6, 8 + SATD_START_MMX +%ifnidn %1, sse2 + mova m7, [hmul_4p GLOBAL] +%endif + movd m4, [r2] + movd m5, [r2+r3] + movd m6, [r2+2*r3] + add r2, r5 + movd m0, [r0] + movd 
m1, [r0+r1] + movd m2, [r0+2*r1] + add r0, r4 + movd m3, [r2+r3] + JDUP m4, m3 + movd m3, [r0+r1] + JDUP m0, m3 + movd m3, [r2+2*r3] + JDUP m5, m3 + movd m3, [r0+2*r1] + JDUP m1, m3 + DIFFOP 0, 4, 1, 5, 7 + movd m5, [r2] + add r2, r5 + movd m3, [r0] + add r0, r4 + movd m4, [r2] + JDUP m6, m4 + movd m4, [r0] + JDUP m2, m4 + movd m4, [r2+r3] + JDUP m5, m4 + movd m4, [r0+r1] + JDUP m3, m4 + DIFFOP 2, 6, 3, 5, 7 + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap + HADDW m6, m1 + movd eax, m6 + RET + cglobal x264_pixel_satd_8x8_internal_%1 - LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 - SATD_8x4_SSE2 %1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6 x264_pixel_satd_8x4_internal_%1: - LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 -x264_pixel_satd_4x8_internal_%1: - SAVE_MM_PERMUTATION satd_4x8_internal - SATD_8x4_SSE2 %1 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6 ret -cglobal x264_pixel_satd_16x16_%1, 4,6,8 - SATD_START_SSE2 - BACKUP_POINTERS - call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] +%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same +cglobal x264_pixel_satd_16x4_internal_%1 + LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] + lea r0, [r0+4*r1] + SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10 + ret + +cglobal x264_pixel_satd_16x8_%1, 4,6,12 + SATD_START_SSE2 %1, m10, m7 +%ifidn %1, sse2 + mova m7, [pw_00ff GLOBAL] +%endif + jmp x264_pixel_satd_16x8_internal_%1 + +cglobal x264_pixel_satd_16x16_%1, 4,6,12 + SATD_START_SSE2 %1, m10, m7 +%ifidn %1, sse2 + mova m7, [pw_00ff GLOBAL] +%endif + call x264_pixel_satd_16x4_internal_%1 + call x264_pixel_satd_16x4_internal_%1 +x264_pixel_satd_16x8_internal_%1: + call x264_pixel_satd_16x4_internal_%1 + call x264_pixel_satd_16x4_internal_%1 + SATD_END_SSE2 %1, m10 +%else +cglobal x264_pixel_satd_16x8_%1, 4,6,8 + SATD_START_SSE2 %1, m6, m7 + BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 RESTORE_AND_INC_POINTERS call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + SATD_END_SSE2 %1, m6 -cglobal x264_pixel_satd_16x8_%1, 4,6,8 - SATD_START_SSE2 +cglobal x264_pixel_satd_16x16_%1, 4,6,8 + SATD_START_SSE2 %1, m6, m7 BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 + call x264_pixel_satd_8x8_internal_%1 RESTORE_AND_INC_POINTERS call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + call x264_pixel_satd_8x8_internal_%1 + SATD_END_SSE2 %1, m6 +%endif cglobal x264_pixel_satd_8x16_%1, 4,6,8 - SATD_START_SSE2 + SATD_START_SSE2 %1, m6, m7 call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + SATD_END_SSE2 %1, m6 cglobal x264_pixel_satd_8x8_%1, 4,6,8 - SATD_START_SSE2 + SATD_START_SSE2 %1, m6, m7 call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + SATD_END_SSE2 %1, m6 cglobal x264_pixel_satd_8x4_%1, 4,6,8 - SATD_START_SSE2 + SATD_START_SSE2 %1, m6, m7 call x264_pixel_satd_8x4_internal_%1 - SATD_END_SSE2 - -cglobal x264_pixel_satd_4x8_%1, 4,6,8 - INIT_XMM - LOAD_MM_PERMUTATION satd_4x8_internal - %define movh movd - SATD_START_SSE2 - LOAD_DIFF m0, m7, m6, [r0], [r2] - LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3] - LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3] - LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5] - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - LOAD_DIFF m4, m7, m6, [r0], [r2] - LOAD_DIFF m5, m7, m6, [r0+r1], 
[r2+r3] - punpcklqdq m0, m4 - punpcklqdq m1, m5 - LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3] - LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5] - punpcklqdq m2, m4 - punpcklqdq m3, m5 - %define movh movq - call x264_pixel_satd_4x8_internal_%1 - SATD_END_SSE2 + SATD_END_SSE2 %1, m6 +%endmacro ; SATDS_SSE2 +%macro SA8D 1 %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) @@ -618,27 +863,36 @@ cglobal x264_pixel_satd_4x8_%1, 4,6,8 cglobal x264_pixel_sa8d_8x8_internal_%1 lea r10, [r0+4*r1] lea r11, [r2+4*r3] - LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2 - LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11 - - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - - ABS4 m0, m1, m2, m3, m8, m9 - ABS4 m4, m5, m6, m7, m8, m9 - paddusw m0, m1 - paddusw m2, m3 - paddusw m4, m5 - paddusw m6, m7 - paddusw m0, m2 - paddusw m4, m6 - pavgw m0, m4 + LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11 +%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things + HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +%else ; non-sse2 + HADAMARD4_V m0, m1, m2, m8, m6 + HADAMARD4_V m4, m5, m3, m9, m6 + SUMSUB_BADC m0, m4, m1, m5, m6 + HADAMARD 2, sumsub, 0, 4, 6, 11 + HADAMARD 2, sumsub, 1, 5, 6, 11 + SUMSUB_BADC m2, m3, m8, m9, m6 + HADAMARD 2, sumsub, 2, 3, 6, 11 + HADAMARD 2, sumsub, 8, 9, 6, 11 + HADAMARD 1, amax, 0, 4, 6, 11 + HADAMARD 1, amax, 1, 5, 6, 4 + HADAMARD 1, amax, 2, 3, 6, 4 + HADAMARD 1, amax, 8, 9, 6, 4 +%endif + paddw m0, m1 + paddw m0, m2 + paddw m0, m8 + SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1 ret -cglobal x264_pixel_sa8d_8x8_%1, 4,6,10 +cglobal x264_pixel_sa8d_8x8_%1, 4,6,12 lea r4, [3*r1] lea r5, [3*r3] +%ifnidn %1, sse2 + mova m7, [hmul_8p GLOBAL] +%endif call x264_pixel_sa8d_8x8_internal_%1 HADDW m0, m1 movd eax, m0 @@ -646,20 +900,23 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6,10 shr eax, 1 RET -cglobal x264_pixel_sa8d_16x16_%1, 4,6,11 +cglobal x264_pixel_sa8d_16x16_%1, 4,6,12 lea r4, [3*r1] lea r5, [3*r3] +%ifnidn %1, sse2 + mova m7, [hmul_8p GLOBAL] +%endif call x264_pixel_sa8d_8x8_internal_%1 ; pix[0] - add r0, 8 add r2, 8 + add r0, 8 mova m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8] - lea r0, [r0+8*r1] lea r2, [r2+8*r3] + lea r0, [r0+8*r1] paddusw m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8] - sub r0, 8 sub r2, 8 + sub r0, 8 paddusw m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride] paddusw m0, m10 @@ -670,47 +927,61 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,11 RET %else ; ARCH_X86_32 +%ifnidn %1, mmxext cglobal x264_pixel_sa8d_8x8_internal_%1 - LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7 - movdqa [esp+4], m2 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2 - movdqa m2, [esp+4] - - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - -%ifidn %1, sse2 - movdqa [esp+4], m4 - movdqa [esp+20], m2 -%endif - ABS2 m6, m3, m4, m2 - ABS2 m0, m7, m4, m2 - paddusw m0, m6 - paddusw m7, m3 + %define spill0 [esp+4] + %define spill1 [esp+20] + %define spill2 [esp+36] %ifidn %1, sse2 - movdqa m4, [esp+4] - movdqa m2, [esp+20] -%endif - ABS2 m5, m1, m6, m3 - ABS2 m4, m2, m6, m3 - paddusw m5, m1 - paddusw m4, m2 - paddusw m0, m7 - paddusw m5, m4 - pavgw m0, m5 + LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 
1 + HADAMARD4_2D 0, 1, 2, 3, 4 + movdqa spill0, m3 + LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 + HADAMARD4_2D 4, 5, 6, 7, 3 + HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax + movdqa m3, spill0 + paddw m0, m1 + HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax +%else ; non-sse2 + mova m7, [hmul_8p GLOBAL] + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 + ; could do first HADAMARD4_V here to save spilling later + ; surprisingly, not a win on conroe or even p4 + mova spill0, m2 + mova spill1, m3 + mova spill2, m1 + SWAP 1, 7 + LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 + HADAMARD4_V m4, m5, m6, m7, m3 + mova m1, spill2 + mova m2, spill0 + mova m3, spill1 + mova spill0, m6 + mova spill1, m7 + HADAMARD4_V m0, m1, m2, m3, m7 + SUMSUB_BADC m0, m4, m1, m5, m7 + HADAMARD 2, sumsub, 0, 4, 7, 6 + HADAMARD 2, sumsub, 1, 5, 7, 6 + HADAMARD 1, amax, 0, 4, 7, 6 + HADAMARD 1, amax, 1, 5, 7, 6 + mova m6, spill0 + mova m7, spill1 + paddw m0, m1 + SUMSUB_BADC m2, m6, m3, m7, m4 + HADAMARD 2, sumsub, 2, 6, 4, 5 + HADAMARD 2, sumsub, 3, 7, 4, 5 + HADAMARD 1, amax, 2, 6, 4, 5 + HADAMARD 1, amax, 3, 7, 4, 5 +%endif ; sse2/non-sse2 + paddw m0, m2 + paddw m0, m3 ret -%endif ; ARCH -%endmacro ; SATDS_SSE2 +%endif ; ifndef mmxext -%macro SA8D_16x16_32 1 -%ifndef ARCH_X86_64 cglobal x264_pixel_sa8d_8x8_%1, 4,7 mov r6, esp and esp, ~15 - sub esp, 32 + sub esp, 48 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 @@ -724,33 +995,37 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,7 cglobal x264_pixel_sa8d_16x16_%1, 4,7 mov r6, esp and esp, ~15 - sub esp, 48 + sub esp, 64 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 +%ifidn %1, mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] - mova [esp+32], m0 +%endif + mova [esp+48], m0 call x264_pixel_sa8d_8x8_internal_%1 mov r0, [r6+20] mov r2, [r6+28] add r0, 8 add r2, 8 - paddusw m0, [esp+32] - mova [esp+32], m0 + paddusw m0, [esp+48] + mova [esp+48], m0 call x264_pixel_sa8d_8x8_internal_%1 +%ifidn %1, mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] +%endif %if mmsize == 16 - paddusw m0, [esp+32] + paddusw m0, [esp+48] %endif - mova [esp+48-mmsize], m0 + mova [esp+64-mmsize], m0 call x264_pixel_sa8d_8x8_internal_%1 - paddusw m0, [esp+48-mmsize] + paddusw m0, [esp+64-mmsize] %if mmsize == 16 HADDUW m0, m1 %else - mova m2, [esp+32] + mova m2, [esp+48] pxor m7, m7 mova m1, m0 mova m3, m2 @@ -769,9 +1044,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,7 mov esp, r6 RET %endif ; !ARCH_X86_64 -%endmacro ; SA8D_16x16_32 - - +%endmacro ; SA8D ;============================================================================= ; INTRA SATD @@ -802,9 +1075,8 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16 punpcklbw m5, m8 punpcklbw m6, m8 punpcklbw m7, m8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 + + HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 ; dc movzx r0d, word [r1+0] @@ -900,9 +1172,7 @@ load_hadamard: punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 + HADAMARD4_2D 0, 1, 2, 3, 4 SAVE_MM_PERMUTATION load_hadamard ret @@ -1262,9 +1532,7 @@ cglobal x264_hadamard_ac_4x4_mmxext punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 + HADAMARD4_2D 0, 1, 2, 3, 4 mova [r3], m0 mova [r3+8], m1 mova [r3+16], m2 @@ -1280,15 +1548,19 @@ cglobal x264_hadamard_ac_4x4_mmxext SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext ret -cglobal 
x264_hadamard_ac_2x2_mmxext +cglobal x264_hadamard_ac_2x2max_mmxext mova m0, [r3+0x00] mova m1, [r3+0x20] mova m2, [r3+0x40] mova m3, [r3+0x60] - HADAMARD4_1D m0, m1, m2, m3 - ABS2 m0, m1, m4, m5 - ABS2 m2, m3, m4, m5 - SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext + sub r3, 8 + SUMSUB_BADC m0, m1, m2, m3, m4 + ABS4 m0, m2, m1, m3, m4, m5 + HADAMARD 0, max, 0, 2, 4, 5 + HADAMARD 0, max, 1, 3, 4, 5 + paddw m7, m0 + paddw m7, m1 + SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext ret cglobal x264_hadamard_ac_8x8_mmxext @@ -1308,28 +1580,23 @@ cglobal x264_hadamard_ac_8x8_mmxext paddw m5, m0 call x264_hadamard_ac_4x4_mmxext paddw m5, m0 - sub r3, 64 + sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd - call x264_hadamard_ac_2x2_mmxext - add r3, 8 - pand m6, m0 - mova m7, m1 - paddw m6, m2 - paddw m7, m3 -%rep 2 - call x264_hadamard_ac_2x2_mmxext - add r3, 8 - paddw m6, m0 - paddw m7, m1 - paddw m6, m2 - paddw m7, m3 +%rep 3 + call x264_hadamard_ac_2x2max_mmxext %endrep - call x264_hadamard_ac_2x2_mmxext - sub r3, 24 - paddw m6, m0 + mova m0, [r3+0x00] + mova m1, [r3+0x20] + mova m2, [r3+0x40] + mova m3, [r3+0x60] + SUMSUB_BADC m0, m1, m2, m3, m4 + HADAMARD 0, sumsub, 0, 2, 4, 5 + ABS4 m1, m3, m0, m2, m4, m5 + HADAMARD 0, max, 1, 3, 4, 5 + pand m6, m0 paddw m7, m1 paddw m6, m2 - paddw m7, m3 + paddw m7, m7 paddw m6, m7 mova [rsp+gprsize], m6 ; save sa8d SWAP m0, m6 @@ -1400,6 +1667,28 @@ HADAMARD_AC_WXH_MMX 8, 16 HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 +%macro LOAD_INC_8x4W_SSE2 5 + movh m%1, [r0] + movh m%2, [r0+r1] + movh m%3, [r0+r1*2] + movh m%4, [r0+r2] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif + punpcklbw m%1, m%5 + punpcklbw m%2, m%5 + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 +%endmacro + +%macro LOAD_INC_8x4W_SSSE3 5 + LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif + HSUMSUB %1, %2, %3, %4, %5 +%endmacro + %macro HADAMARD_AC_SSE2 1 INIT_XMM ; in: r0=pix, r1=stride, r2=stride*3 @@ -1414,45 +1703,55 @@ cglobal x264_hadamard_ac_8x8_%1 %define spill1 [rsp+gprsize+16] %define spill2 [rsp+gprsize+32] %endif +%ifnidn %1, sse2 + ;LOAD_INC loads sumsubs + mova m7, [hmul_8p GLOBAL] +%else + ;LOAD_INC only unpacks to words pxor m7, m7 - movh m0, [r0] - movh m1, [r0+r1] - movh m2, [r0+r1*2] - movh m3, [r0+r2] - lea r0, [r0+r1*4] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - mova spill0, m3 - SWAP m3, m7 - movh m4, [r0] - movh m5, [r0+r1] - movh m6, [r0+r1*2] - movh m7, [r0+r2] - punpcklbw m4, m3 - punpcklbw m5, m3 - punpcklbw m6, m3 - punpcklbw m7, m3 - HADAMARD4_1D m4, m5, m6, m7 - mova m3, spill0 -%ifdef ARCH_X86_64 - TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 +%endif + LOAD_INC_8x4W 0, 1, 2, 3, 7 +%ifidn %1, sse2 + HADAMARD4_2D_SSE 0, 1, 2, 3, 4 %else - TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1 + HADAMARD4_V m0, m1, m2, m3, m4 %endif - HADAMARD4_1D m0, m1, m2, m3 - HADAMARD4_1D m4, m5, m6, m7 mova spill0, m1 + SWAP 1, 7 + LOAD_INC_8x4W 4, 5, 6, 7, 1 +%ifidn %1, sse2 + HADAMARD4_2D_SSE 4, 5, 6, 7, 1 +%else + HADAMARD4_V m4, m5, m6, m7, m1 +%endif + +%ifnidn %1, sse2 + mova m1, spill0 + mova spill0, m6 + mova spill1, m7 + HADAMARD 1, sumsub, 0, 1, 6, 7 + HADAMARD 1, sumsub, 2, 3, 6, 7 + mova m6, spill0 + mova m7, spill1 + mova spill0, m1 + mova spill1, m0 + HADAMARD 1, sumsub, 4, 5, 1, 0 + HADAMARD 1, sumsub, 6, 7, 1, 0 + mova m0, spill1 +%endif + mova spill1, m2 mova spill2, m3 ABS_MOV m1, m0 ABS_MOV m2, m4 ABS_MOV m3, m5 paddw m1, m2 - SUMSUB_BA m0, m4 + SUMSUB_BA m0, m4; 
m2 +%ifnidn %1, sse2 + pand m1, [mask_ac4b GLOBAL] +%else pand m1, [mask_ac4 GLOBAL] +%endif ABS_MOV m2, spill0 paddw m1, m3 ABS_MOV m3, spill1 @@ -1474,31 +1773,29 @@ cglobal x264_hadamard_ac_8x8_%1 paddw m2, spill1 psubw m5, spill0 paddw m1, spill0 - mova spill1, m7 - SBUTTERFLY qdq, 0, 4, 7 - SBUTTERFLY qdq, 1, 5, 7 - SBUTTERFLY qdq, 2, 6, 7 - SUMSUB_BADC m0, m4, m1, m5 - SUMSUB_BA m2, m6 - ABS1 m0, m7 - ABS1 m1, m7 - pand m0, [mask_ac8 GLOBAL] - ABS1 m2, m7 +%ifnidn %1, sse2 + mova spill1, m4 + HADAMARD 2, amax, 3, 7, 4 + HADAMARD 2, amax, 2, 6, 7, 4 + mova m4, spill1 + HADAMARD 2, amax, 1, 5, 6, 7 + HADAMARD 2, sumsub, 0, 4, 5, 6 +%else + mova spill1, m4 + HADAMARD 4, amax, 3, 7, 4 + HADAMARD 4, amax, 2, 6, 7, 4 + mova m4, spill1 + HADAMARD 4, amax, 1, 5, 6, 7 + HADAMARD 4, sumsub, 0, 4, 5, 6 +%endif + paddw m2, m3 + paddw m2, m1 + paddw m2, m2 ABS1 m4, m7 - ABS1 m5, m7 - ABS1 m6, m7 - mova m7, spill1 - paddw m0, m4 - SBUTTERFLY qdq, 3, 7, 4 - SUMSUB_BA m3, m7 - paddw m1, m5 - ABS1 m3, m4 - ABS1 m7, m4 - paddw m2, m6 - paddw m3, m7 - paddw m0, m1 - paddw m2, m3 - paddw m0, m2 + pand m0, [mask_ac8 GLOBAL] + ABS1 m0, m7 + paddw m2, m4 + paddw m0, m2 mova [rsp+gprsize+16], m0 ; save sa8d SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1 ret @@ -1565,28 +1862,51 @@ cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11 %ifndef ARCH_X86_64 cextern x264_pixel_sa8d_8x8_internal_mmxext -SA8D_16x16_32 mmxext +SA8D mmxext %endif +%define TRANS TRANS_SSE2 %define ABS1 ABS1_MMX %define ABS2 ABS2_MMX +%define DIFFOP DIFF_UNPACK_SSE2 +%define JDUP JDUP_SSE2 +%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2 +%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 +%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size +%define movdqu movups +%define punpcklqdq movlhps +INIT_XMM +SA8D sse2 SATDS_SSE2 sse2 -SA8D_16x16_32 sse2 INTRA_SA8D_SSE2 sse2 INTRA_SATDS_MMX mmxext HADAMARD_AC_SSE2 sse2 + %define ABS1 ABS1_SSSE3 %define ABS2 ABS2_SSSE3 %define ABS_MOV ABS_MOV_SSSE3 -SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. 
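
The instantiation blocks around this point all follow one scheme: the kernel macros (SATDS_SSE2, SA8D, HADAMARD_AC_SSE2) are written once and re-expanded per instruction-set level after rebinding their primitive helpers via %define (ABS1/ABS2, TRANS, DIFFOP, the LOAD_* loaders), so a single body yields the _sse2, _ssse3 and _sse4 entry points. As a rough illustration of the idea only, here is a C-preprocessor analogue with hypothetical names that are not part of x264:

/* Illustration only: one kernel body, expanded once per "level" with a
 * different primitive bound in, mirroring the %define + macro-call pattern
 * in the surrounding asm.  All names here are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define ABS_C(x)          ((x) < 0 ? -(x) : (x))
/* branch-free abs of a 16-bit value held in an int
 * (assumes arithmetic right shift, true on all x264 targets) */
#define ABS_BRANCHLESS(x) (((x) ^ ((x) >> 15)) - ((x) >> 15))

#define DECL_SUM_ABS( suffix, ABS_OP )\
static int sum_abs_##suffix( const int16_t *coef, int n )\
{\
    int i, sum = 0;\
    for( i = 0; i < n; i++ )\
        sum += ABS_OP( coef[i] );\
    return sum;\
}

/* the same body, two instantiations differing only in the primitive op */
DECL_SUM_ABS( ref,  ABS_C )
DECL_SUM_ABS( fast, ABS_BRANCHLESS )

int main( void )
{
    int16_t c[4] = { 3, -7, 0, -1 };
    printf( "%d %d\n", sum_abs_ref( c, 4 ), sum_abs_fast( c, 4 ) ); /* 11 11 */
    return 0;
}

Compared with writing each variant by hand, this keeps the transform logic in one place and limits the per-level code to the handful of primitives that actually differ.
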
+%define DIFFOP DIFF_SUMSUB_SSSE3 +%define JDUP JDUP_CONROE +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 SATDS_SSE2 ssse3 -SA8D_16x16_32 ssse3 +SA8D ssse3 +HADAMARD_AC_SSE2 ssse3 +%undef movdqa ; nehalem doesn't like movaps +%undef movdqu ; movups +%undef punpcklqdq ; or movlhps INTRA_SA8D_SSE2 ssse3 INTRA_SATDS_MMX ssse3 -HADAMARD_AC_SSE2 ssse3 -SATDS_SSE2 ssse3_phadd - +%define TRANS TRANS_SSE4 +%define JDUP JDUP_PENRYN +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN +SATDS_SSE2 sse4 +SA8D sse4 +HADAMARD_AC_SSE2 sse4 ;============================================================================= ; SSIM diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 5bc81c75..0bb7dfeb 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -49,15 +49,17 @@ DECL_X4( sad, mmxext ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X1( ssd, mmx ) +DECL_X1( ssd, sse2slow ) DECL_X1( ssd, sse2 ) -DECL_X1( ssd, sse4 ) +DECL_X1( ssd, ssse3 ) DECL_X1( satd, mmxext ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) -DECL_X1( satd, ssse3_phadd ) +DECL_X1( satd, sse4 ) DECL_X1( sa8d, mmxext ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) +DECL_X1( sa8d, sse4) DECL_X1( sad, cache32_mmxext ); DECL_X1( sad, cache64_mmxext ); DECL_X1( sad, cache64_sse2 ); @@ -72,6 +74,8 @@ DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride )) + void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index 2e318ef7..8bfe5520 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -1,7 +1,10 @@ ;***************************************************************************** -;* x86inc.asm +;* x86util.asm ;***************************************************************************** -;* Copyright (C) 2008 Loren Merritt +;* Copyright (C) 2008 x264 project +;* +;* Authors: Holger Lubitz +;* Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -93,7 +96,7 @@ SBUTTERFLY qdq, %4, %8, %2 SWAP %2, %5 SWAP %4, %7 -%if 0<11 +%if %0<11 movdqa m%5, %10 %endif %endif @@ -165,28 +168,147 @@ palignr %1, %2, %3 %endmacro -%macro SUMSUB_BA 2 +%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from +%ifnum %5 + mova m%1, m%5 + mova m%3, m%5 +%else + mova m%1, %5 + mova m%3, m%1 +%endif + pand m%1, m%2 ; dst .. y6 .. y4 + pand m%3, m%4 ; src .. y6 .. y4 + psrlw m%2, 8 ; dst .. y7 .. y5 + psrlw m%4, 8 ; src .. y7 .. 
y5 +%endmacro + +%macro SUMSUB_BA 2-3 +%if %0==2 paddw %1, %2 paddw %2, %2 psubw %2, %1 +%else + mova %3, %1 + paddw %1, %2 + psubw %2, %3 +%endif %endmacro -%macro SUMSUB_BADC 4 +%macro SUMSUB_BADC 4-5 +%if %0==5 + SUMSUB_BA %1, %2, %5 + SUMSUB_BA %3, %4, %5 +%else paddw %1, %2 paddw %3, %4 paddw %2, %2 paddw %4, %4 psubw %2, %1 psubw %4, %3 +%endif %endmacro -%macro HADAMARD8_1D 8 - SUMSUB_BADC %1, %5, %2, %6 - SUMSUB_BADC %3, %7, %4, %8 +%macro HADAMARD4_V 4+ + SUMSUB_BADC %1, %2, %3, %4 SUMSUB_BADC %1, %3, %2, %4 - SUMSUB_BADC %5, %7, %6, %8 +%endmacro + +%macro HADAMARD8_V 8+ SUMSUB_BADC %1, %2, %3, %4 SUMSUB_BADC %5, %6, %7, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 + SUMSUB_BADC %1, %5, %2, %6 + SUMSUB_BADC %3, %7, %4, %8 +%endmacro + +%macro HADAMARD 5-6 +; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) +; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) +; %3/%4: regs +; %5(%6): tmpregs +%if %1!=0 ; have to reorder stuff for horizontal op + %ifidn %2, sumsub + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b + %else + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %endif + %if %1==1 + TRANS d, ORDER, %3, %4, %5, %6 + %elif %1==2 + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif + %elif %1==4 + SBUTTERFLY qdq, %3, %4, %5 + %endif +%endif +%ifidn %2, sumsub + SUMSUB_BA m%3, m%4, m%5 +%else + %ifidn %2, amax + %if %0==6 + ABS2 m%3, m%4, m%5, m%6 + %else + ABS1 m%3, m%5 + ABS1 m%4, m%5 + %endif + %endif + pmaxsw m%3, m%4 +%endif +%endmacro + + +%macro HADAMARD2_2D 6-7 sumsub + HADAMARD 0, sumsub, %1, %2, %5 + HADAMARD 0, sumsub, %3, %4, %5 + SBUTTERFLY %6, %1, %2, %5 +%ifnum %7 + HADAMARD 0, amax, %1, %2, %5, %7 +%else + HADAMARD 0, %7, %1, %2, %5 +%endif + SBUTTERFLY %6, %3, %4, %5 +%ifnum %7 + HADAMARD 0, amax, %3, %4, %5, %7 +%else + HADAMARD 0, %7, %3, %4, %5 +%endif +%endmacro + +%macro HADAMARD4_2D 5-6 sumsub + HADAMARD2_2D %1, %2, %3, %4, %5, wd + HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 + SWAP %2, %3 +%endmacro + +%macro HADAMARD4_2D_SSE 5-6 sumsub + HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 + HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 + SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 + SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 + HADAMARD2_2D %1, %3, %2, %4, %5, dq + SBUTTERFLY qdq, %1, %2, %5 + HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 + SBUTTERFLY qdq, %3, %4, %5 + HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 +%endmacro + +%macro HADAMARD8_2D 9-10 sumsub + HADAMARD2_2D %1, %2, %3, %4, %9, wd + HADAMARD2_2D %5, %6, %7, %8, %9, wd + HADAMARD2_2D %1, %3, %2, %4, %9, dq + HADAMARD2_2D %5, %7, %6, %8, %9, dq + HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 + HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 +%ifnidn %10, amax + SWAP %2, %5 + SWAP %4, %7 +%endif %endmacro %macro SUMSUB2_AB 3 @@ -197,15 +319,52 @@ psubw %3, %2 %endmacro +%macro SUMSUB2_BA 3 + mova m%3, m%1 + paddw m%1, m%2 + paddw m%1, m%2 + psubw m%2, m%3 + psubw m%2, m%3 +%endmacro + %macro SUMSUBD2_AB 4 mova %4, %1 mova %3, %2 psraw %2, 1 - psraw %4, 1 - paddw %1, %2 - psubw %4, %3 + psraw %1, 1 + paddw %2, %4 + psubw %1, %3 +%endmacro + +%macro DCT4_1D 5 +%ifnum %5 + SUMSUB_BADC m%4, m%1, m%3, m%2; m%5 + SUMSUB_BA m%3, m%4, m%5 + SUMSUB2_AB m%1, m%2, m%5 + SWAP %1, %3, %4, %5, %2 +%else + SUMSUB_BADC m%4, m%1, m%3, m%2 + SUMSUB_BA m%3, m%4 + mova [%5], m%2 + SUMSUB2_AB m%1, [%5], m%2 + SWAP %1, %3, %4, %2 +%endif 
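+    ; Both branches above compute one 1-D pass of the H.264 core transform on
+    ; the four inputs d0..d3 passed in %1..%4 (%5 is a spare register or a
+    ; spill address) and, after the SWAP renaming, leave the results as
+    ;   %1 =   d0 + d1 + d2 + d3
+    ;   %2 = 2*(d0-d3) +   (d1-d2)
+    ;   %3 =   (d0+d3) -   (d1+d2)
+    ;   %4 =   (d0-d3) - 2*(d1-d2)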
+%endmacro + +%macro IDCT4_1D 5-6 +%ifnum %5 + SUMSUBD2_AB m%2, m%4, m%6, m%5 + SUMSUB_BA m%3, m%1, m%6 + SUMSUB_BADC m%4, m%3, m%2, m%1, m%6 +%else + SUMSUBD2_AB m%2, m%4, [%5], [%5+16] + SUMSUB_BA m%3, m%1 + SUMSUB_BADC m%4, m%3, m%2, m%1 +%endif + SWAP %1, %4, %3 %endmacro + %macro LOAD_DIFF 5 %ifidn %3, none movh %1, %4 @@ -222,19 +381,82 @@ %endif %endmacro -%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer - LOAD_DIFF %1, %5, none, [%7], [%8] - LOAD_DIFF %2, %6, none, [%7+r1], [%8+r3] - LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3] - LOAD_DIFF %4, %6, none, [%7+r4], [%8+r5] +%macro LOAD_DIFF8x4_SSE2 8 + LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE] + LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE] + LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE] + LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE] %endmacro -%macro STORE_DIFF 4 +%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr + movh m%2, [%8+%1*FDEC_STRIDE] + movh m%1, [%7+%1*FENC_STRIDE] + punpcklbw m%1, m%2 + movh m%3, [%8+%2*FDEC_STRIDE] + movh m%2, [%7+%2*FENC_STRIDE] + punpcklbw m%2, m%3 + movh m%4, [%8+%3*FDEC_STRIDE] + movh m%3, [%7+%3*FENC_STRIDE] + punpcklbw m%3, m%4 + movh m%5, [%8+%4*FDEC_STRIDE] + movh m%4, [%7+%4*FENC_STRIDE] + punpcklbw m%4, m%5 + pmaddubsw m%1, m%6 + pmaddubsw m%2, m%6 + pmaddubsw m%3, m%6 + pmaddubsw m%4, m%6 +%endmacro + +%macro STORE_DCT 6 + movq [%5+%6+ 0], m%1 + movq [%5+%6+ 8], m%2 + movq [%5+%6+16], m%3 + movq [%5+%6+24], m%4 + movhps [%5+%6+32], m%1 + movhps [%5+%6+40], m%2 + movhps [%5+%6+48], m%3 + movhps [%5+%6+56], m%4 +%endmacro + +%macro STORE_IDCT 4 + movhps [r0-4*FDEC_STRIDE], %1 + movh [r0-3*FDEC_STRIDE], %1 + movhps [r0-2*FDEC_STRIDE], %2 + movh [r0-1*FDEC_STRIDE], %2 + movhps [r0+0*FDEC_STRIDE], %3 + movh [r0+1*FDEC_STRIDE], %3 + movhps [r0+2*FDEC_STRIDE], %4 + movh [r0+3*FDEC_STRIDE], %4 +%endmacro + +%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? + LOAD_DIFF m%1, m%5, m%7, [%8], [%9] + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro DIFFx2 6-7 + movh %3, %5 + punpcklbw %3, %4 psraw %1, 6 + paddsw %1, %3 + movh %3, %6 + punpcklbw %3, %4 + psraw %2, 6 + paddsw %2, %3 + packuswb %2, %1 +%endmacro + +%macro STORE_DIFF 4 movh %2, %4 punpcklbw %2, %3 + psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1 %endmacro - diff --git a/tools/checkasm.c b/tools/checkasm.c index 152db4dd..1c1562a3 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -292,7 +292,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL( sad_aligned, 1 ); TEST_PIXEL( ssd, 1 ); TEST_PIXEL( satd, 0 ); - TEST_PIXEL( sa8d, 0 ); + TEST_PIXEL( sa8d, 1 ); #define TEST_PIXEL_X( N ) \ for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
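
For reference, the metric that the satd tests above compare against can be written in a few lines of scalar C. This is only a sketch of the definition, not the C reference in pixel.c itself; the ordering of the Hadamard outputs does not matter because only absolute values are summed.

/* Scalar sketch of 4x4 SATD: sum of absolute values of the 2-D Hadamard
 * transform of the pixel differences.  Hypothetical helper, for illustration
 * only; x264's own C reference lives in common/pixel.c. */
#include <stdint.h>
#include <stdlib.h>

static int satd_4x4_sketch( const uint8_t *pix1, int stride1,
                            const uint8_t *pix2, int stride2 )
{
    int d[4][4], t[4][4];
    int sum = 0;

    /* pixel differences */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[y*stride1 + x] - pix2[y*stride2 + x];

    /* 4-point Hadamard on each row */
    for( int y = 0; y < 4; y++ )
    {
        int s01 = d[y][0] + d[y][1], d01 = d[y][0] - d[y][1];
        int s23 = d[y][2] + d[y][3], d23 = d[y][2] - d[y][3];
        t[y][0] = s01 + s23;
        t[y][1] = s01 - s23;
        t[y][2] = d01 + d23;
        t[y][3] = d01 - d23;
    }
    /* 4-point Hadamard down each column, accumulating absolute values */
    for( int x = 0; x < 4; x++ )
    {
        int s01 = t[0][x] + t[1][x], d01 = t[0][x] - t[1][x];
        int s23 = t[2][x] + t[3][x], d23 = t[2][x] - t[3][x];
        sum += abs( s01 + s23 ) + abs( s01 - s23 )
             + abs( d01 + d23 ) + abs( d01 - d23 );
    }
    return sum >> 1; /* x264 halves the raw total */
}

sa8d is the analogous 8x8 metric built on an 8-point Hadamard, and hadamard_ac accumulates the same kind of totals with the DC coefficients masked out (cf. mask_ac4/mask_ac8 above).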