From: Holger Lubitz
Date: Tue, 6 Oct 2009 22:17:34 +0000 (-0700)
Subject: SSE4 version of 4x4 idct
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e9fbd8db8908074f46005383bf0c117d5fc4c8a8;p=libx264

SSE4 version of 4x4 idct

27->24 clocks on Nehalem.
This is really just an excuse to use "movsd" in a real function.
Add some comments to sumsub-related macros in x86util.
---

diff --git a/common/dct.c b/common/dct.c
index 4ac9a86a..c5a79139 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -482,6 +482,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
     }
+
+    if( cpu&X264_CPU_SSE4 )
+        dctf->add4x4_idct = x264_add4x4_idct_sse4;
+
 #endif //HAVE_MMX

 #ifdef ARCH_PPC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 61abd875..c4ebae5b 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -27,6 +27,8 @@
 %include "x86util.asm"

 SECTION_RODATA
+pw_32_0:  times 4 dw 32
+          times 4 dw 0
 pw_32:    times 8 dw 32
 pw_8000:  times 8 dw 0x8000
 hsub_mul: times 8 db 1, -1
@@ -148,6 +150,59 @@ cglobal x264_add4x4_idct_mmx, 2,2
     STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
     RET

+INIT_XMM
+cglobal x264_add4x4_idct_sse4, 2,2,6
+    mova      m0, [r1+0x00]        ; row1/row0
+    mova      m2, [r1+0x10]        ; row3/row2
+    mova      m1, m0               ; row1/row0
+    psraw     m0, 1                ; row1>>1/...
+    mova      m3, m2               ; row3/row2
+    psraw     m2, 1                ; row3>>1/...
+    movsd     m0, m1               ; row1>>1/row0
+    movsd     m2, m3               ; row3>>1/row2
+    psubw     m0, m3               ; row1>>1-row3/row0-2
+    paddw     m2, m1               ; row3>>1+row1/row0+2
+    SBUTTERFLY2 wd, 0, 2, 1
+    SUMSUB_BA m2, m0, m1
+    pshuflw   m1, m2, 10110001b
+    pshufhw   m2, m2, 10110001b
+    punpckldq m1, m0
+    punpckhdq m2, m0
+    SWAP 0, 1
+
+    mova      m1, [pw_32_0 GLOBAL]
+    paddw     m1, m0               ; row1/row0 corrected
+    psraw     m0, 1                ; row1>>1/...
+    mova      m3, m2               ; row3/row2
+    psraw     m2, 1                ; row3>>1/...
+    movsd     m0, m1               ; row1>>1/row0
+    movsd     m2, m3               ; row3>>1/row2
+    psubw     m0, m3               ; row1>>1-row3/row0-2
+    paddw     m2, m1               ; row3>>1+row1/row0+2
+    SBUTTERFLY2 qdq, 0, 2, 1
+    SUMSUB_BA m2, m0, m1
+
+    movd      m4, [r0+FDEC_STRIDE*0]
+    movd      m1, [r0+FDEC_STRIDE*1]
+    movd      m3, [r0+FDEC_STRIDE*2]
+    movd      m5, [r0+FDEC_STRIDE*3]
+    punpckldq m1, m4               ; row0/row1
+    pxor      m4, m4
+    punpckldq m3, m5               ; row3/row2
+    punpcklbw m1, m4
+    psraw     m2, 6
+    punpcklbw m3, m4
+    psraw     m0, 6
+    paddsw    m2, m1
+    paddsw    m0, m3
+    packuswb  m0, m2               ; row0/row1/row3/row2
+    pextrd   [r0+FDEC_STRIDE*0], m0, 3
+    pextrd   [r0+FDEC_STRIDE*1], m0, 2
+    movd     [r0+FDEC_STRIDE*2], m0
+    pextrd   [r0+FDEC_STRIDE*3], m0, 1
+    RET
+
+INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 0502b59f..9f6ed8d4 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -36,6 +36,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2
 void x264_sub8x8_dct_dc_sse2  ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );

 void x264_add4x4_idct_mmx     ( uint8_t *p_dst, int16_t dct[ 4][4] );
+void x264_add4x4_idct_sse4    ( uint8_t *p_dst, int16_t dct[ 4][4] );
 void x264_add8x8_idct_mmx     ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
 void x264_add8x8_idct_dc_mmx  ( uint8_t *p_dst, int16_t dct[2][2] );
 void x264_add16x16_idct_mmx   ( uint8_t *p_dst, int16_t dct[16][4][4] );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index cfd7767e..be010e5c 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -28,6 +28,13 @@
     SWAP %3, %4
 %endmacro

+%macro SBUTTERFLY2 4
+    mova m%4, m%2
+    punpckh%1 m%2, m%3
+    punpckl%1 m%4, m%3
+    SWAP %2, %4, %3
+%endmacro
+
 %macro TRANSPOSE4x4W 5
     SBUTTERFLY wd, %1, %2, %5
     SBUTTERFLY wd, %3, %4, %5
@@ -386,10 +393,10 @@
 %macro SUMSUBD2_AB 4
     mova    %4, %1
     mova    %3, %2
-    psraw   %2, 1
-    psraw   %1, 1
-    paddw   %2, %4
-    psubw   %1, %3
+    psraw   %2, 1  ; %2: %2>>1
+    psraw   %1, 1  ; %1: %1>>1
+    paddw   %2, %4 ; %2: %2>>1+%1
+    psubw   %1, %3 ; %1: %1>>1-%2
 %endmacro

 %macro DCT4_1D 5
@@ -410,14 +417,24 @@
 %macro IDCT4_1D 5-6
 %ifnum %5
     SUMSUBD2_AB m%2, m%4, m%6, m%5
+    ; %2: %2>>1-%4 %4: %2+%4>>1
     SUMSUB_BA   m%3, m%1, m%6
+    ; %3: %1+%3 %1: %1-%3
     SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+    ; %4: %1+%3 + (%2+%4>>1)
+    ; %3: %1+%3 - (%2+%4>>1)
+    ; %2: %1-%3 + (%2>>1-%4)
+    ; %1: %1-%3 - (%2>>1-%4)
 %else
     SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
     SUMSUB_BA   m%3, m%1
     SUMSUB_BADC m%4, m%3, m%2, m%1
 %endif
     SWAP %1, %4, %3
+    ; %1: %1+%3 + (%2+%4>>1) row0
+    ; %2: %1-%3 + (%2>>1-%4) row1
+    ; %3: %1-%3 - (%2>>1-%4) row2
+    ; %4: %1+%3 - (%2+%4>>1) row3
 %endmacro
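
Editor's note on the "movsd" trick the commit message singles out: in its register-to-register form, SSE2 movsd copies only the low 64 bits of the source and leaves the destination's upper 64 bits untouched, so "movsd m0, m1" above merges the freshly shifted half-row in the high qword of m0 with the unshifted row still held in m1, which is what the "row1>>1/row0" comments describe. Below is a minimal sketch of that merge outside x264's macro framework; it is plain x86-64 NASM using the System V calling convention, and the function name merge_low_qword is made up for illustration, not part of x264.

    ; void merge_low_qword( uint64_t dst[2], const uint64_t a[2], const uint64_t b[2] )
    ; Writes { high qword of a, low qword of b } to dst -- the same kind of merge
    ; x264_add4x4_idct_sse4 performs between a shifted and an unshifted pair of rows.
    section .text
    global merge_low_qword
    merge_low_qword:
        movdqu  xmm0, [rsi]     ; xmm0 = a (high qword / low qword)
        movdqu  xmm1, [rdx]     ; xmm1 = b
        movsd   xmm0, xmm1      ; low qword of xmm0 <- low qword of b; high qword of a kept
        movdqu  [rdi], xmm0     ; dst = merged result
        ret

The memory-source form of movsd zeroes the upper half instead of preserving it, which is why the new function first keeps an unshifted copy in another register (mova m1, m0) and then merges register-to-register after the shift.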