From: Loren Merritt Date: Sat, 24 Sep 2005 18:22:02 +0000 (+0000) Subject: faster mmx quant 15bit, and add 16bit version. total speedup: ~0.3% X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cfebeac1a475f4a2ee57e5dd3cd1ff0c560f38db;p=libx264 faster mmx quant 15bit, and add 16bit version. total speedup: ~0.3% patch by Christian Heine. git-svn-id: svn://svn.videolan.org/x264/trunk@298 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm index e3699708..91d9347b 100644 --- a/common/amd64/quant-a.asm +++ b/common/amd64/quant-a.asm @@ -21,6 +21,16 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. ;***************************************************************************** +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) * +;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) * +;* 2005.09.21 faster MMX and added MMXEXT16 (CH) * +;* * +;***************************************************************************** + BITS 64 %macro cglobal 1 @@ -36,184 +46,338 @@ ALIGN 16 SECTION .text -cglobal x264_quant_8x8_core16_mmx -cglobal x264_quant_4x4_core16_mmx -cglobal x264_quant_8x8_core32_mmx -cglobal x264_quant_4x4_core32_mmx -cglobal x264_quant_4x4_dc_core32_mmx -cglobal x264_quant_2x2_dc_core32_mmx - -%macro QUANT_AC_START 0 -; mov rdi, rdi ; dct -; mov rsi, rsi ; quant_mf - movd mm6, edx ; i_qbits - movd mm7, ecx ; f - punpckldq mm7, mm7 -%endmacro - -%macro QUANT_DC_START 0 -; mov rdi, rdi ; dct - movd mm5, rsi ; i_quant_mf - movd mm6, edx ; i_qbits - movd mm7, ecx ; f - punpckldq mm5, mm5 - punpckldq mm7, mm7 +cglobal x264_quant_2x2_dc_core15_mmx +cglobal x264_quant_4x4_dc_core15_mmx +cglobal x264_quant_4x4_core15_mmx +cglobal x264_quant_8x8_core15_mmx + +cglobal x264_quant_2x2_dc_core16_mmxext +cglobal x264_quant_4x4_dc_core16_mmxext +cglobal 
x264_quant_4x4_core16_mmxext +cglobal x264_quant_8x8_core16_mmxext + +cglobal x264_quant_2x2_dc_core32_mmxext +cglobal x264_quant_4x4_dc_core32_mmxext +cglobal x264_quant_4x4_core32_mmxext +cglobal x264_quant_8x8_core32_mmxext + +%macro MMX_QUANT_AC_START 0 +; mov rdi, rdi ; &dct[0][0] +; mov rsi, rsi ; &quant_mf[0][0] + movd mm6, edx ; i_qbits + movd mm7, ecx ; f + punpckldq mm7, mm7 ; f in each dword %endmacro -%macro QUANT16_1x4 5 -;;; %1 dct[y][x] -;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16 -;;; %4 i_qbits -;;; %5 f as doublewords -;;; trashes mm0-mm5 - movq mm0, %1 - movq mm1, %2 - movq mm2, %3 - packssdw mm1, mm2 - - movq mm4, mm0 - pxor mm5, mm5 - pcmpgtw mm4, mm5 - - movq mm2, mm0 - pmullw mm0, mm1 - pmulhw mm2, mm1 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - movq mm2, %5 - movq mm3, %5 - psubd mm2, mm0 - psubd mm3, mm1 - paddd mm0, %5 - paddd mm1, %5 - - psrad mm0, %4 - psrad mm1, %4 - psrad mm2, %4 - psrad mm3, %4 - - packssdw mm0, mm1 - packssdw mm2, mm3 - pxor mm5, mm5 - psubw mm5, mm2 - - pand mm0, mm4 - pandn mm4, mm5 - - por mm0, mm4 - movq %1, mm0 +%macro MMX_QUANT15_DC_START 0 +; mov rdi, rdi ; &dct[0][0] + movd mm5, rsi ; i_qmf + movd mm6, edx ; i_qbits + movd mm7, ecx ; f + punpcklwd mm5, mm5 + punpcklwd mm5, mm5 ; i_qmf in each word + punpckldq mm7, mm7 ; f in each dword %endmacro -%macro QUANT32_1x4 5 -;;; %1 dct[y][x] -;;; %2,%3 quant_mf[i_mf][y][x] -;;; %4 i_qbits -;;; %5 f as doublewords -;;; trashes mm0-mm4 - movq mm0, %1 +%macro MMX_QUANT15_1x4 4 +;;; %1 (m64) dct[y][x] +;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) +;;; %3 (mmx) i_qbits in the low doubleword +;;; %4 (mmx) f as doublewords +;;; trashes mm0-mm2,mm4 + movq mm0, %1 ; load dct coeffs pxor mm4, mm4 - pcmpgtw mm4, mm0 ; mm4 = sign(mm0) + pcmpgtw mm4, mm0 ; sign(coeff) pxor mm0, mm4 - psubw mm0, mm4 ; mm0 = abs(mm0) + psubw mm0, mm4 ; abs(coeff) + + movq mm2, mm0 + pmullw mm0, %2 + pmulhw mm2, %2 + movq mm1, mm0 - punpcklwd mm0, 
mm0 ; duplicate the words for the upcomming - punpckhwd mm1, mm1 ; 32 bit multiplication + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + + paddd mm0, %4 ; round with f + paddd mm1, %4 + psrad mm0, %3 + psrad mm1, %3 + + packssdw mm0, mm1 ; pack + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store +%endmacro - movq mm2, mm0 ; like in school ... - movq mm3, mm1 - pmulhuw mm0, %2 ; ... multiply the parts ... - pmulhuw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 ; ... shift ... - pslld mm1, 16 - paddd mm0, mm2 ; ... and add them - paddd mm1, mm3 +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core15_mmx: + MMX_QUANT15_DC_START + MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7 + ret - paddd mm0, %5 ; round with f - paddd mm1, %5 - psrad mm0, %4 - psrad mm1, %4 - packssdw mm0, mm1 ; pack & store +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core15_mmx: + MMX_QUANT15_DC_START + +%rep 4 + MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7 + add rdi, byte 8 +%endrep + + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core15_mmx: + MMX_QUANT_AC_START + +%rep 4 + movq mm5, [rsi] + packssdw mm5, [rsi+8] + MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7 + add rsi, byte 16 + add rdi, byte 8 +%endrep + + ret + 
+ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core15_mmx: + MMX_QUANT_AC_START + +%rep 16 + movq mm5, [rsi] + packssdw mm5, [rsi+8] + MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7 + add rsi, byte 16 + add rdi, byte 8 +%endrep + + ret + +; ============================================================================ + +%macro MMXEXT_QUANT16_DC_START 0 +; mov rdi, rdi ; &dct[0][0] + movd mm5, rsi ; i_qmf + movd mm6, edx ; i_qbits + movd mm7, ecx ; f + pshufw mm5, mm5, 0 ; i_qmf in each word + punpckldq mm7, mm7 ; f in each dword +%endmacro + +%macro MMXEXT_QUANT16_1x4 4 +;;; %1 (m64) dct[y][x] +;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t) +;;; %3 (mmx) i_qbits in the low doubleword +;;; %4 (mmx) f as doublewords +;;; trashes mm0-mm2,mm4 + movq mm0, %1 ; load dct coeffs + pxor mm4, mm4 + pcmpgtw mm4, mm0 ; sign(coeff) pxor mm0, mm4 - psubw mm0, mm4 ; restore sign - movq %1, mm0 + psubw mm0, mm4 ; abs(coeff) + + movq mm2, mm0 + pmullw mm0, %2 + pmulhuw mm2, %2 + + movq mm1, mm0 + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + + paddd mm0, %4 ; round with f + paddd mm1, %4 + psrad mm0, %3 + psrad mm1, %3 + + packssdw mm0, mm1 ; pack + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store %endmacro +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core16_mmxext: + MMXEXT_QUANT16_DC_START + MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7 + ret ALIGN 16 -;;; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f 
) -x264_quant_8x8_core16_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core16_mmxext: + MMXEXT_QUANT16_DC_START -%rep 16 - QUANT16_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 - add rdi, 8 - add rsi, 16 +%rep 4 + MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7 + add rdi, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ) -x264_quant_4x4_core16_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core16_mmxext: + MMX_QUANT_AC_START %rep 4 - QUANT16_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 - add rdi, 8 - add rsi, 16 + pshufw mm5, [rsi], 10110001b + paddw mm5, [rsi+8] + pshufw mm5, mm5, 10001101b + MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7 + add rsi, byte 16 + add rdi, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ) -x264_quant_8x8_core32_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core16_mmxext: + MMX_QUANT_AC_START %rep 16 - QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 - add rdi, 8 - add rsi, 16 + pshufw mm5, [rsi], 10110001b + paddw mm5, [rsi+8] + pshufw mm5, mm5, 10001101b + MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7 + add rsi, 
byte 16 + add rdi, byte 8 %endrep ret + + +%macro MMX_QUANT32_DC_START 0 +; mov rdi, rdi ; &dct[0][0] + movd mm5, rsi ; i_qmf + movd mm6, edx ; i_qbits + movd mm7, ecx ; f + punpckldq mm5, mm5 ; i_qmf in each dword + punpckldq mm7, mm7 ; f in each dword +%endmacro + +%macro MMXEXT_QUANT32_1x4 5 +;;; %1 (m64) dct[y][x] +;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) +;;; %4 (mmx) i_qbits in the low quadword +;;; %5 (mmx) f as doublewords +;;; trashes mm0-mm4 + movq mm0, %1 ; load dct coeffs + pxor mm4, mm4 + pcmpgtw mm4, mm0 ; sign(mm0) + pxor mm0, mm4 + psubw mm0, mm4 ; abs(mm0) + movq mm1, mm0 + punpcklwd mm0, mm0 ; duplicate the words for the upcoming + punpckhwd mm1, mm1 ; 32 bit multiplication + + movq mm2, mm0 ; like in school ... + movq mm3, mm1 + pmulhuw mm0, %2 ; ... multiply the parts ... + pmulhuw mm1, %3 + pmullw mm2, %2 + pmullw mm3, %3 + pslld mm0, 16 ; ... shift ... + pslld mm1, 16 + paddd mm0, mm2 ; ... and add them + paddd mm1, mm3 + + paddd mm0, %5 ; round with f + paddd mm1, %5 + psrad mm0, %4 + psrad mm1, %4 + + packssdw mm0, mm1 ; pack to int16_t + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store +%endmacro + ALIGN 16 -;;; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ) -x264_quant_4x4_core32_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core32_mmxext: + MMX_QUANT32_DC_START + MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7 + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f );
+;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core32_mmxext: + MMX_QUANT32_DC_START %rep 4 - QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 - add rdi, 8 - add rsi, 16 + MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7 + add rdi, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f ) -x264_quant_4x4_dc_core32_mmx: - QUANT_DC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core32_mmxext: + MMX_QUANT_AC_START %rep 4 - QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7 - add rdi, 8 + MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 + add rdi, byte 8 + add rsi, byte 16 %endrep ret ALIGN 16 -;;; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f ) -x264_quant_2x2_dc_core32_mmx: - QUANT_DC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core32_mmxext: + MMX_QUANT_AC_START - QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7 +%rep 16 + MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7 + add rdi, byte 8 + add rsi, byte 16 +%endrep ret diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm index 7806736f..b8813b36 100644 --- a/common/i386/quant-a.asm +++ b/common/i386/quant-a.asm @@ -21,6 +21,16 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) * +;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) * +;* 2005.09.21 faster MMX and added MMXEXT16 (CH) * +;* * +;***************************************************************************** + BITS 32 %macro cglobal 1 @@ -36,184 +46,338 @@ ALIGN 16 SECTION .text -cglobal x264_quant_8x8_core16_mmx -cglobal x264_quant_4x4_core16_mmx -cglobal x264_quant_8x8_core32_mmx -cglobal x264_quant_4x4_core32_mmx -cglobal x264_quant_4x4_dc_core32_mmx -cglobal x264_quant_2x2_dc_core32_mmx - -%macro QUANT_AC_START 0 - mov eax, [esp+ 4] ; dct - mov ecx, [esp+ 8] ; quant_mf - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - punpckldq mm7, mm7 -%endmacro - -%macro QUANT_DC_START 0 - mov eax, [esp+ 4] ; dct - movd mm5, [esp+ 8] ; i_quant_mf - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - punpckldq mm5, mm5 - punpckldq mm7, mm7 +cglobal x264_quant_2x2_dc_core15_mmx +cglobal x264_quant_4x4_dc_core15_mmx +cglobal x264_quant_4x4_core15_mmx +cglobal x264_quant_8x8_core15_mmx + +cglobal x264_quant_2x2_dc_core16_mmxext +cglobal x264_quant_4x4_dc_core16_mmxext +cglobal x264_quant_4x4_core16_mmxext +cglobal x264_quant_8x8_core16_mmxext + +cglobal x264_quant_2x2_dc_core32_mmxext +cglobal x264_quant_4x4_dc_core32_mmxext +cglobal x264_quant_4x4_core32_mmxext +cglobal x264_quant_8x8_core32_mmxext + +%macro MMX_QUANT_AC_START 0 + mov eax, [esp+ 4] ; &dct[0][0] + mov ecx, [esp+ 8] ; &quant_mf[0][0] + movd mm6, [esp+12] ; i_qbits + movd mm7, [esp+16] ; f + punpckldq mm7, mm7 ; f in each dword %endmacro -%macro QUANT16_1x4 5 -;;; %1 dct[y][x] -;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16 -;;; %4 i_qbits -;;; %5 f as doublewords -;;; trashes mm0-mm5 - movq mm0, %1 - movq mm1, %2 - movq mm2, %3 - packssdw mm1, mm2 - - movq 
mm4, mm0 - pxor mm5, mm5 - pcmpgtw mm4, mm5 - - movq mm2, mm0 - pmullw mm0, mm1 - pmulhw mm2, mm1 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - movq mm2, %5 - movq mm3, %5 - psubd mm2, mm0 - psubd mm3, mm1 - paddd mm0, %5 - paddd mm1, %5 - - psrad mm0, %4 - psrad mm1, %4 - psrad mm2, %4 - psrad mm3, %4 - - packssdw mm0, mm1 - packssdw mm2, mm3 - pxor mm5, mm5 - psubw mm5, mm2 - - pand mm0, mm4 - pandn mm4, mm5 - - por mm0, mm4 - movq %1, mm0 +%macro MMX_QUANT15_DC_START 0 + mov eax, [esp+ 4] ; &dct[0][0] + movd mm5, [esp+ 8] ; i_qmf + movd mm6, [esp+12] ; i_qbits + movd mm7, [esp+16] ; f + punpcklwd mm5, mm5 + punpcklwd mm5, mm5 ; i_qmf in each word + punpckldq mm7, mm7 ; f in each dword %endmacro -%macro QUANT32_1x4 5 -;;; %1 dct[y][x] -;;; %2,%3 quant_mf[i_mf][y][x] -;;; %4 i_qbits -;;; %5 f as doublewords -;;; trashes mm0-mm4 - movq mm0, %1 +%macro MMX_QUANT15_1x4 4 +;;; %1 (m64) dct[y][x] +;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) +;;; %3 (mmx) i_qbits in the low doubleword +;;; %4 (mmx) f as doublewords +;;; trashes mm0-mm2,mm4 + movq mm0, %1 ; load dct coeffs pxor mm4, mm4 - pcmpgtw mm4, mm0 ; mm4 = sign(mm0) + pcmpgtw mm4, mm0 ; sign(coeff) pxor mm0, mm4 - psubw mm0, mm4 ; mm0 = abs(mm0) + psubw mm0, mm4 ; abs(coeff) + + movq mm2, mm0 + pmullw mm0, %2 + pmulhw mm2, %2 + movq mm1, mm0 - punpcklwd mm0, mm0 ; duplicate the words for the upcomming - punpckhwd mm1, mm1 ; 32 bit multiplication + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + + paddd mm0, %4 ; round with f + paddd mm1, %4 + psrad mm0, %3 + psrad mm1, %3 + + packssdw mm0, mm1 ; pack + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store +%endmacro - movq mm2, mm0 ; like in school ... - movq mm3, mm1 - pmulhuw mm0, %2 ; ... multiply the parts ... - pmulhuw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 ; ... shift ... - pslld mm1, 16 - paddd mm0, mm2 ; ... 
and add them - paddd mm1, mm3 +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core15_mmx: + MMX_QUANT15_DC_START + MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 + ret - paddd mm0, %5 ; round with f - paddd mm1, %5 - psrad mm0, %4 - psrad mm1, %4 - packssdw mm0, mm1 ; pack & store +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core15_mmx: + MMX_QUANT15_DC_START + +%rep 4 + MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 + add eax, byte 8 +%endrep + + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core15_mmx: + MMX_QUANT_AC_START + +%rep 4 + movq mm5, [ecx] + packssdw mm5, [ecx+8] + MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 + add ecx, byte 16 + add eax, byte 8 +%endrep + + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core15_mmx: + MMX_QUANT_AC_START + +%rep 16 + movq mm5, [ecx] + packssdw mm5, [ecx+8] + MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 + add ecx, byte 16 + add eax, byte 8 +%endrep + + ret + +; 
============================================================================ + +%macro MMXEXT_QUANT16_DC_START 0 + mov eax, [esp+ 4] ; &dct[0][0] + movd mm5, [esp+ 8] ; i_qmf + movd mm6, [esp+12] ; i_qbits + movd mm7, [esp+16] ; f + pshufw mm5, mm5, 0 ; i_qmf in each word + punpckldq mm7, mm7 ; f in each dword +%endmacro + +%macro MMXEXT_QUANT16_1x4 4 +;;; %1 (m64) dct[y][x] +;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t) +;;; %3 (mmx) i_qbits in the low doubleword +;;; %4 (mmx) f as doublewords +;;; trashes mm0-mm2,mm4 + movq mm0, %1 ; load dct coeffs + pxor mm4, mm4 + pcmpgtw mm4, mm0 ; sign(coeff) pxor mm0, mm4 - psubw mm0, mm4 ; restore sign - movq %1, mm0 + psubw mm0, mm4 ; abs(coeff) + + movq mm2, mm0 + pmullw mm0, %2 + pmulhuw mm2, %2 + + movq mm1, mm0 + punpcklwd mm0, mm2 + punpckhwd mm1, mm2 + + paddd mm0, %4 ; round with f + paddd mm1, %4 + psrad mm0, %3 + psrad mm1, %3 + + packssdw mm0, mm1 ; pack + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store %endmacro +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core16_mmxext: + MMXEXT_QUANT16_DC_START + MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 + ret ALIGN 16 -;;; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ) -x264_quant_8x8_core16_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core16_mmxext: + MMXEXT_QUANT16_DC_START -%rep 16 - QUANT16_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, 8 - add ecx, 16 
+%rep 4 + MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 + add eax, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ) -x264_quant_4x4_core16_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core16_mmxext: + MMX_QUANT_AC_START %rep 4 - QUANT16_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, 8 - add ecx, 16 + pshufw mm5, [ecx], 10110001b + paddw mm5, [ecx+8] + pshufw mm5, mm5, 10001101b + MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 + add ecx, byte 16 + add eax, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ) -x264_quant_8x8_core32_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core16_mmxext: + MMX_QUANT_AC_START %rep 16 - QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, 8 - add ecx, 16 + pshufw mm5, [ecx], 10110001b + paddw mm5, [ecx+8] + pshufw mm5, mm5, 10001101b + MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 + add ecx, byte 16 + add eax, byte 8 %endrep ret + + +%macro MMX_QUANT32_DC_START 0 + mov eax, [esp+ 4] ; &dct[0][0] + movd mm5, [esp+ 8] ; i_qmf + movd mm6, [esp+12] ; i_qbits + movd mm7, [esp+16] ; f + punpckldq mm5, mm5 ; i_qmf in each dword + punpckldq mm7, mm7 ; f in each dword +%endmacro + +%macro MMXEXT_QUANT32_1x4 5 +;;; %1 (m64) dct[y][x] +;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) +;;; %4 (mmx) i_qbits in the low quadword +;;; %5 (mmx) f as 
doublewords +;;; trashes mm0-mm4 + movq mm0, %1 ; load dct coeffs + pxor mm4, mm4 + pcmpgtw mm4, mm0 ; sign(mm0) + pxor mm0, mm4 + psubw mm0, mm4 ; abs(mm0) + movq mm1, mm0 + punpcklwd mm0, mm0 ; duplicate the words for the upcoming + punpckhwd mm1, mm1 ; 32 bit multiplication + + movq mm2, mm0 ; like in school ... + movq mm3, mm1 + pmulhuw mm0, %2 ; ... multiply the parts ... + pmulhuw mm1, %3 + pmullw mm2, %2 + pmullw mm3, %3 + pslld mm0, 16 ; ... shift ... + pslld mm1, 16 + paddd mm0, mm2 ; ... and add them + paddd mm1, mm3 + + paddd mm0, %5 ; round with f + paddd mm1, %5 + psrad mm0, %4 + psrad mm1, %4 + + packssdw mm0, mm1 ; pack to int16_t + pxor mm0, mm4 ; restore sign + psubw mm0, mm4 + movq %1, mm0 ; store +%endmacro + ALIGN 16 -;;; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ) -x264_quant_4x4_core32_mmx: - QUANT_AC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_2x2_dc_core32_mmxext: + MMX_QUANT32_DC_START + MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core32_mmxext: + MMX_QUANT32_DC_START %rep 4 - QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, 8 - add ecx, 16 + MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 + add eax, byte 8 %endrep ret ALIGN 16 -;;; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f ) -x264_quant_4x4_dc_core32_mmx: - QUANT_DC_START
+;----------------------------------------------------------------------------- +; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core32_mmxext: + MMX_QUANT_AC_START %rep 4 - QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 - add eax, 8 + MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 + add eax, byte 8 + add ecx, byte 16 %endrep ret ALIGN 16 -;;; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f ) -x264_quant_2x2_dc_core32_mmx: - QUANT_DC_START +;----------------------------------------------------------------------------- +; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core32_mmxext: + MMX_QUANT_AC_START - QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 +%rep 16 + MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 + add eax, byte 8 + add ecx, byte 16 +%endrep ret diff --git a/common/i386/quant.h b/common/i386/quant.h new file mode 100644 index 00000000..87fabbd4 --- /dev/null +++ b/common/i386/quant.h @@ -0,0 +1,53 @@ +/***************************************************************************** + * quant.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2005 x264 project + * + * Authors: Christian Heine + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef _I386_QUANT_H +#define _I386_QUANT_H 1 + +void x264_quant_8x8_core15_mmx( int16_t dct[8][8], + int quant_mf[8][8], int const i_qbits, int const f ); +void x264_quant_4x4_core15_mmx( int16_t dct[4][4], + int quant_mf[4][4], int const i_qbits, int const f ); +void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], + int const i_qmf, int const i_qbits, int const f ); +void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], + int const i_qmf, int const i_qbits, int const f ); + +void x264_quant_8x8_core16_mmxext( int16_t dct[8][8], + int quant_mf[8][8], int const i_qbits, int const f ); +void x264_quant_4x4_core16_mmxext( int16_t dct[4][4], + int quant_mf[4][4], int const i_qbits, int const f ); +void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], + int const i_qmf, int const i_qbits, int const f ); +void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], + int const i_qmf, int const i_qbits, int const f ); + +void x264_quant_8x8_core32_mmxext( int16_t dct[8][8], + int quant_mf[8][8], int const i_qbits, int const f ); +void x264_quant_4x4_core32_mmxext( int16_t dct[4][4], + int quant_mf[4][4], int const i_qbits, int const f ); +void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], + int const i_qmf, int const i_qbits, int const f ); +void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], + int const i_qmf, int const i_qbits, int const f ); + +#endif diff --git a/common/quant.c b/common/quant.c index 
437a135d..fc32cfd2 100644 --- a/common/quant.c +++ b/common/quant.c @@ -22,13 +22,9 @@ #include "common.h" -void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ); -void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ); -void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ); -void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ); -void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f ); -void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f ); - +#ifdef HAVE_MMXEXT +#include "i386/quant.h" +#endif #define QUANT_ONE( coef, mf ) \ { \ @@ -70,7 +66,7 @@ static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, i void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { - const char *name[4] = { "C", "C", "C", "C" }; + int i, maxQ8=0, maxQ4=0, maxQdc=0; pf->quant_8x8_core = quant_8x8_core; pf->quant_4x4_core = quant_4x4_core; @@ -78,34 +74,64 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_2x2_dc_core = quant_2x2_dc_core; #ifdef HAVE_MMXEXT - if( cpu&X264_CPU_MMX ) + + /* determine the biggest coefficient in all quant8_mf tables */ + for( i = 0; i < 2*6*8*8; i++ ) { - int i; - - pf->quant_8x8_core = x264_quant_8x8_core16_mmx; - pf->quant_4x4_core = x264_quant_4x4_core16_mmx; - pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmx; - pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmx; - - name[0] = name[1] = "16MMX"; - name[2] = name[3] = "32MMX"; - - for( i = 0; i < 2*6*8*8; i++ ) - if( (***h->quant8_mf)[i] >= 0x8000 ) - { - pf->quant_8x8_core = x264_quant_8x8_core32_mmx; - name[0] = "32MMX"; - } - - for( i = 0; i < 4*6*4*4; i++ ) - if( (***h->quant4_mf)[i] >= 0x8000 ) - { - pf->quant_4x4_core = x264_quant_4x4_core32_mmx; - name[1] = "32MMX"; - } + int q = h->quant8_mf[0][0][0][i]; +
if( maxQ8 < q ) + maxQ8 = q; + } + + /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 ) + and the biggest DC coefficient of all quant4_mf tables ( maxQdc ) */ + for( i = 0; i < 4*6*4*4; i++ ) + { + int q = h->quant4_mf[0][0][0][i]; + if( maxQ4 < q ) + maxQ4 = q; + if( maxQdc < q && i%16 == 0 ) + maxQdc = q; + } + + /* select quant_8x8 based on CPU and maxQ8 */ + if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX ) + pf->quant_8x8_core = x264_quant_8x8_core15_mmx; + else + if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT ) + pf->quant_8x8_core = x264_quant_8x8_core16_mmxext; + else + if( cpu&X264_CPU_MMXEXT ) + pf->quant_8x8_core = x264_quant_8x8_core32_mmxext; + + /* select quant_4x4 based on CPU and maxQ4 */ + if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX ) + pf->quant_4x4_core = x264_quant_4x4_core15_mmx; + else + if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT ) + pf->quant_4x4_core = x264_quant_4x4_core16_mmxext; + else + if( cpu&X264_CPU_MMXEXT ) + pf->quant_4x4_core = x264_quant_4x4_core32_mmxext; + + /* select quant_XxX_dc based on CPU and maxQdc */ + if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT ) + { + pf->quant_4x4_dc_core = x264_quant_4x4_dc_core16_mmxext; + pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext; + } + else + if( maxQdc < (1<<15) && cpu&X264_CPU_MMX ) + { + pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx; + pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx; + } + else + if( cpu&X264_CPU_MMXEXT ) + { + pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext; + pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext; } -#endif - x264_log( h, X264_LOG_DEBUG, "using quant functions 8x8=%s 4x4=%s dc4x4=%s dc2x2=%s\n", - name[0], name[1], name[2], name[3] ); +#endif /* HAVE_MMXEXT */ }