From: Loren Merritt Date: Wed, 4 Apr 2007 18:45:25 +0000 (+0000) Subject: 2x faster quant. 2% overall. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=10265a0c2a0b29e6252ad3be6fad1569e7a04339;p=libx264 2x faster quant. 2% overall. side effects: not bit-identical to the previous algorithm. while the new algorithm covers a wider range of cqms than the previous one did, I couldn't find a good way to fallback to a general version for the extreme cqms. so now it refuses to encode extreme cqms instead of just being slower. lays a framework for custom deadzone matrices, though I didn't add an api. git-svn-id: svn://svn.videolan.org/x264/trunk@642 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm index 32ff0cda..6fdc3198 100644 --- a/common/amd64/quant-a.asm +++ b/common/amd64/quant-a.asm @@ -3,8 +3,7 @@ ;***************************************************************************** ;* Copyright (C) 2005 x264 project ;* -;* Authors: Alex Izvorski -;* Christian Heine +;* Authors: Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -21,16 +20,6 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) * -;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) * -;* 2005.09.21 faster MMX and added MMXEXT16 (CH) * -;* * -;***************************************************************************** - BITS 64 %include "amd64inc.asm" @@ -40,394 +29,106 @@ pd_1: times 2 dd 1 SECTION .text -%macro MMX_QUANT_AC_START 0 -; mov rdi, rdi ; &dct[0][0] -; mov rsi, rsi ; &quant_mf[0][0] - movd mm6, parm3d ; i_qbits - movd mm7, parm4d ; f - punpckldq mm7, mm7 ; f in each dword +%macro MMX_QUANT_DC_START 0 + movd mm6, parm2d ; mf + movd mm7, parm3d ; bias + pshufw mm6, mm6, 0 + pshufw mm7, mm7, 0 %endmacro -%macro MMX_QUANT15_DC_START 0 -; mov rdi, rdi ; &dct[0][0] - movd mm5, parm2d ; i_qmf - movd mm6, parm3d ; i_qbits - movd mm7, parm4d ; f - punpcklwd mm5, mm5 - punpcklwd mm5, mm5 ; i_qmf in each word - punpckldq mm7, mm7 ; f in each dword +%macro SSE2_QUANT_DC_START 0 + movd xmm6, parm2d ; mf + movd xmm7, parm3d ; bias + pshuflw xmm6, xmm6, 0 + pshuflw xmm7, xmm7, 0 + punpcklqdq xmm6, xmm6 + punpcklqdq xmm7, xmm7 %endmacro -%macro SSE2_QUANT_AC_START 0 - movd xmm6, parm3d ; i_qbits - movd xmm7, parm4d ; f - pshufd xmm7, xmm7, 0 ; f in each dword +%macro QUANT_ONE 5 +;;; %1 mov suffix (q/dqa), %2 reg prefix (m/xm), %3 (mem) dct[y][x]; trashes %2m0-%2m1 +;;; %4 (mem/reg) mf[y][x] or mf[0][0] (as uint16_t) +;;; %5 (mem/reg) bias[y][x] or bias[0][0] (as uint16_t) + + mov%1 %2m0, %3 ; load dct coeffs + pxor %2m1, %2m1 + pcmpgtw %2m1, %2m0 ; sign(coeff) + pxor %2m0, %2m1 + psubw %2m0, %2m1 ; abs(coeff) + paddusw %2m0, %5 ; round + pmulhuw %2m0, %4 ; divide + pxor %2m0, %2m1 ; restore sign + psubw %2m0, %2m1 + mov%1 %3, %2m0 ; store %endmacro - -%macro SSE2_QUANT15_DC_START 0 - movd xmm5, parm2d ; i_qmf - movd xmm6, parm3d ; i_qbits - movd xmm7, parm4d ; f - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm5, 
xmm5 ; i_qmf in each word - pshufd xmm7, xmm7, 0 ; f in each dword +%macro MMX_QUANT_1x4 3 + QUANT_ONE q, m, %1, %2, %3 %endmacro - -%macro MMX_QUANT15_1x4 4 -;;; %1 (m64) dct[y][x] -;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) -;;; %3 (mmx) i_qbits in the low doubleword -;;; %4 (mmx) f as doublewords -;;; trashes mm0-mm2,mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(coeff) - pxor mm0, mm4 - psubw mm0, mm4 ; abs(coeff) - - movq mm2, mm0 - pmullw mm0, %2 - pmulhw mm2, %2 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - paddd mm0, %4 ; round with f - paddd mm1, %4 - psrad mm0, %3 - psrad mm1, %3 - - packssdw mm0, mm1 ; pack - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store +%macro SSE2_QUANT_1x8 3 + QUANT_ONE dqa, xm, %1, %2, %3 %endmacro -%macro SSSE3_QUANT15_1x8 4 +%macro SSSE3_QUANT_1x8 3 movdqa xmm0, %1 ; load dct coeffs - movdqa xmm4, xmm0 ; save sign + movdqa xmm1, xmm0 ; save sign pabsw xmm0, xmm0 - - movdqa xmm2, xmm0 - pmullw xmm0, %2 - pmulhw xmm2, %2 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm1, xmm2 - - paddd xmm0, %4 ; round with f - paddd xmm1, %4 - psrad xmm0, %3 - psrad xmm1, %3 - - packssdw xmm0, xmm1 ; pack - psignw xmm0, xmm4 ; restore sign + paddusw xmm0, %3 ; round + pmulhuw xmm0, %2 ; divide + psignw xmm0, xmm1 ; restore sign movdqa %1, xmm0 ; store %endmacro ;----------------------------------------------------------------------------- -; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); +; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core15_mmx - MMX_QUANT15_DC_START - MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], -; int 
const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core15_mmx - MMX_QUANT15_DC_START - -%rep 4 - MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7 - add parm1q, byte 8 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_core15_mmx( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core15_mmx - MMX_QUANT_AC_START - -%rep 4 - movq mm5, [parm2q] - packssdw mm5, [parm2q+8] - MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7 - add parm2q, byte 16 - add parm1q, byte 8 -%endrep - +cglobal x264_quant_2x2_dc_mmxext + MMX_QUANT_DC_START + MMX_QUANT_1x4 [parm1q], mm6, mm7 ret +%macro QUANT_SSE 1 ;----------------------------------------------------------------------------- -; void x264_quant_8x8_core15_mmx( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); +; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core15_mmx - MMX_QUANT_AC_START - -%rep 16 - movq mm5, [parm2q] - packssdw mm5, [parm2q+8] - MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7 - add parm2q, byte 16 - add parm1q, byte 8 +cglobal x264_quant_4x4_dc_%1 + SSE2_QUANT_DC_START +%assign x 0 +%rep 2 + QUANT_1x8 [parm1q+x], xmm6, xmm7 +%assign x (x+16) %endrep - - ret - -%ifdef HAVE_SSE3 -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core15_ssse3 - SSE2_QUANT15_DC_START - SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7 - SSSE3_QUANT15_1x8 
[parm1q+16], xmm5, xmm6, xmm7 ret ;----------------------------------------------------------------------------- -; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); +; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core15_ssse3 - SSE2_QUANT_AC_START +cglobal x264_quant_4x4_%1 %assign x 0 %rep 2 - movdqa xmm5, [parm2q+32*x] - packssdw xmm5, [parm2q+32*x+16] - SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7 - %assign x x+1 + QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x] +%assign x (x+16) %endrep ret ;----------------------------------------------------------------------------- -; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); +; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core15_ssse3 - SSE2_QUANT_AC_START +cglobal x264_quant_8x8_%1 %assign x 0 %rep 8 - movdqa xmm5, [parm2q+32*x] - packssdw xmm5, [parm2q+32*x+16] - SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7 - %assign x x+1 -%endrep - ret -%endif ; HAVE_SSE3 - - -; ============================================================================ - -%macro MMXEXT_QUANT16_DC_START 0 -; mov rdi, rdi ; &dct[0][0] - movd mm5, parm2d ; i_qmf - movd mm6, parm3d ; i_qbits - movd mm7, parm4d ; f - pshufw mm5, mm5, 0 ; i_qmf in each word - punpckldq mm7, mm7 ; f in each dword -%endmacro - -%macro MMXEXT_QUANT16_1x4 4 -;;; %1 (m64) dct[y][x] -;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t) -;;; %3 (mmx) i_qbits in the low doubleword -;;; %4 (mmx) f as doublewords -;;; trashes mm0-mm2,mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(coeff) - pxor mm0, mm4 - psubw mm0, mm4 ; 
abs(coeff) - - movq mm2, mm0 - pmullw mm0, %2 - pmulhuw mm2, %2 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - paddd mm0, %4 ; round with f - paddd mm1, %4 - psrad mm0, %3 - psrad mm1, %3 - - packssdw mm0, mm1 ; pack - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store -%endmacro - -;----------------------------------------------------------------------------- -; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core16_mmxext - MMXEXT_QUANT16_DC_START - MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core16_mmxext - MMXEXT_QUANT16_DC_START - -%rep 4 - MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7 - add parm1q, byte 8 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core16_mmxext - MMX_QUANT_AC_START - -%rep 4 - pshufw mm5, [parm2q], 10110001b - paddw mm5, [parm2q+8] - pshufw mm5, mm5, 10001101b - MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7 - add parm2q, byte 16 - add parm1q, byte 8 + QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x] +%assign x (x+16) %endrep - ret - -;----------------------------------------------------------------------------- -; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); 
-;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core16_mmxext - MMX_QUANT_AC_START - -%rep 16 - pshufw mm5, [parm2q], 10110001b - paddw mm5, [parm2q+8] - pshufw mm5, mm5, 10001101b - MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7 - add parm2q, byte 16 - add parm1q, byte 8 -%endrep - - ret - - - -%macro MMX_QUANT32_DC_START 0 -; mov rdi, rdi ; &dct[0][0] - movd mm5, parm2d ; i_qmf - movd mm6, parm3d ; i_qbits - movd mm7, parm4d ; f - punpckldq mm5, mm5 ; i_qmf in each dword - punpckldq mm7, mm7 ; f in each dword %endmacro -%macro MMXEXT_QUANT32_1x4 5 -;;; %1 (m64) dct[y][x] -;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) -;;; %4 (mmx) i_qbits in the low quadword -;;; %5 (mmx) f as doublewords -;;; trashes mm0-mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(mm0) - pxor mm0, mm4 - psubw mm0, mm4 ; abs(mm0) - movq mm1, mm0 - punpcklwd mm0, mm0 ; duplicate the words for the upcomming - punpckhwd mm1, mm1 ; 32 bit multiplication - - movq mm2, mm0 ; like in school ... - movq mm3, mm1 - pmulhuw mm0, %2 ; ... multiply the parts ... - pmulhuw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 ; ... shift ... - pslld mm1, 16 - paddd mm0, mm2 ; ... 
and add them - paddd mm1, mm3 - - paddd mm0, %5 ; round with f - paddd mm1, %5 - psrad mm0, %4 - psrad mm1, %4 - - packssdw mm0, mm1 ; pack to int16_t - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store -%endmacro - -;----------------------------------------------------------------------------- -; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core32_mmxext - MMX_QUANT32_DC_START - MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core32_mmxext - MMX_QUANT32_DC_START - -%rep 4 - MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7 - add parm1q, byte 8 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core32_mmxext - MMX_QUANT_AC_START - -%rep 4 - MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7 - add parm1q, byte 8 - add parm2q, byte 16 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core32_mmxext - MMX_QUANT_AC_START - -%rep 16 - MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7 - add parm1q, byte 8 - add parm2q, byte 16 
-%endrep +%define QUANT_1x8 SSE2_QUANT_1x8 +QUANT_SSE sse2 +%ifdef HAVE_SSE3 +%define QUANT_1x8 SSSE3_QUANT_1x8 +QUANT_SSE ssse3 +%endif - ret ;============================================================================= diff --git a/common/common.h b/common/common.h index dd055f5e..74f8c108 100644 --- a/common/common.h +++ b/common/common.h @@ -341,10 +341,12 @@ struct x264_t int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */ int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */ - int (*quant4_mf[4])[4][4]; /* [4][6][4][4] */ - int (*quant8_mf[2])[8][8]; /* [2][6][8][8] */ int (*unquant4_mf[4])[16]; /* [4][52][16] */ int (*unquant8_mf[2])[64]; /* [2][52][64] */ + uint16_t (*quant4_mf[4])[16]; /* [4][52][16] */ + uint16_t (*quant8_mf[2])[64]; /* [2][52][64] */ + uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */ + uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */ uint32_t nr_residual_sum[2][64]; uint32_t nr_offset[2][64]; @@ -436,9 +438,6 @@ struct x264_t int b_interlaced; - /* Inverted luma quantization deadzone */ - int i_luma_deadzone[2]; // {inter, intra} - /* Allowed qpel MV range to stay within the picture + emulated edge pixels */ int mv_min[2]; int mv_max[2]; diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm index b8860557..13e794de 100644 --- a/common/i386/quant-a.asm +++ b/common/i386/quant-a.asm @@ -3,8 +3,7 @@ ;***************************************************************************** ;* Copyright (C) 2005 x264 project ;* -;* Authors: Alex Izvorski -;* Christian Heine +;* Authors: Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -21,16 +20,6 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) * -;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) * -;* 2005.09.21 faster MMX and added MMXEXT16 (CH) * -;* * -;***************************************************************************** - BITS 32 %include "i386inc.asm" @@ -40,313 +29,151 @@ pd_1: times 2 dd 1 SECTION .text -%macro MMX_QUANT_AC_START 0 - mov eax, [esp+ 4] ; &dct[0][0] - mov ecx, [esp+ 8] ; &quant_mf[0][0] - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - punpckldq mm7, mm7 ; f in each dword +%macro QUANT_AC_START 0 + mov eax, [esp+ 4] ; dct + mov ecx, [esp+ 8] ; mf + mov edx, [esp+12] ; bias %endmacro -%macro MMX_QUANT15_DC_START 0 - mov eax, [esp+ 4] ; &dct[0][0] - movd mm5, [esp+ 8] ; i_qmf - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - punpcklwd mm5, mm5 - punpcklwd mm5, mm5 ; i_qmf in each word - punpckldq mm7, mm7 ; f in each dword +%macro MMX_QUANT_DC_START 0 + mov eax, [esp+ 4] ; dct + movd mm6, [esp+ 8] ; mf + movd mm7, [esp+12] ; bias + pshufw mm6, mm6, 0 + pshufw mm7, mm7, 0 %endmacro -%macro MMX_QUANT15_1x4 4 -;;; %1 (m64) dct[y][x] -;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) -;;; %3 (mmx) i_qbits in the low doubleword -;;; %4 (mmx) f as doublewords -;;; trashes mm0-mm2,mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(coeff) - pxor mm0, mm4 - psubw mm0, mm4 ; abs(coeff) - - movq mm2, mm0 - pmullw mm0, %2 - pmulhw mm2, %2 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - paddd mm0, %4 ; round with f - paddd mm1, %4 - psrad mm0, %3 - psrad mm1, %3 - - packssdw mm0, mm1 ; pack - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store +%macro SSE2_QUANT_DC_START 0 + mov eax, [esp+ 4] ; dct + movd xmm6, [esp+ 8] ; mf + movd xmm7, [esp+12] ; bias 
+ pshuflw xmm6, xmm6, 0 + pshuflw xmm7, xmm7, 0 + punpcklqdq xmm6, xmm6 + punpcklqdq xmm7, xmm7 %endmacro -;----------------------------------------------------------------------------- -; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core15_mmx - MMX_QUANT15_DC_START - MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core15_mmx - MMX_QUANT15_DC_START - -%rep 4 - MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 - add eax, byte 8 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core15_mmx - MMX_QUANT_AC_START - -%rep 4 - movq mm5, [ecx] - packssdw mm5, [ecx+8] - MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 - add ecx, byte 16 - add eax, byte 8 -%endrep - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core15_mmx - MMX_QUANT_AC_START - -%rep 16 - movq mm5, [ecx] - packssdw mm5, [ecx+8] - MMX_QUANT15_1x4 [eax], mm5, mm6, mm7 - add ecx, byte 16 - add eax, byte 8 -%endrep - - ret - -; ============================================================================ - -%macro 
MMXEXT_QUANT16_DC_START 0 - mov eax, [esp+ 4] ; &dct[0][0] - movd mm5, [esp+ 8] ; i_qmf - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - pshufw mm5, mm5, 0 ; i_qmf in each word - punpckldq mm7, mm7 ; f in each dword +%macro QUANT_ONE 5 +;;; %1 mov suffix (q/dqa), %2 reg prefix (m/xm), %3 (mem) dct[y][x]; trashes %2m0-%2m1 +;;; %4 (mem/reg) mf[y][x] or mf[0][0] (as uint16_t) +;;; %5 (mem/reg) bias[y][x] or bias[0][0] (as uint16_t) + + mov%1 %2m0, %3 ; load dct coeffs + pxor %2m1, %2m1 + pcmpgtw %2m1, %2m0 ; sign(coeff) + pxor %2m0, %2m1 + psubw %2m0, %2m1 ; abs(coeff) + paddusw %2m0, %5 ; round + pmulhuw %2m0, %4 ; divide + pxor %2m0, %2m1 ; restore sign + psubw %2m0, %2m1 + mov%1 %3, %2m0 ; store +%endmacro +%macro MMX_QUANT_1x4 3 + QUANT_ONE q, m, %1, %2, %3 +%endmacro +%macro SSE2_QUANT_1x8 3 + QUANT_ONE dqa, xm, %1, %2, %3 %endmacro -%macro MMXEXT_QUANT16_1x4 4 -;;; %1 (m64) dct[y][x] -;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t) -;;; %3 (mmx) i_qbits in the low doubleword -;;; %4 (mmx) f as doublewords -;;; trashes mm0-mm2,mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(coeff) - pxor mm0, mm4 - psubw mm0, mm4 ; abs(coeff) - - movq mm2, mm0 - pmullw mm0, %2 - pmulhuw mm2, %2 - - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - - paddd mm0, %4 ; round with f - paddd mm1, %4 - psrad mm0, %3 - psrad mm1, %3 - - packssdw mm0, mm1 ; pack - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store +%macro SSSE3_QUANT_1x8 3 + movdqa xmm0, %1 ; load dct coeffs + movdqa xmm1, xmm0 ; save sign + pabsw xmm0, xmm0 + paddusw xmm0, %3 ; round + pmulhuw xmm0, %2 ; divide + psignw xmm0, xmm1 ; restore sign + movdqa %1, xmm0 ; store %endmacro ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); +; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias ) 
;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core16_mmxext - MMXEXT_QUANT16_DC_START - MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 +cglobal x264_quant_2x2_dc_mmxext + MMX_QUANT_DC_START + MMX_QUANT_1x4 [eax], mm6, mm7 ret ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); +; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core16_mmxext - MMXEXT_QUANT16_DC_START - +cglobal x264_quant_4x4_dc_mmxext + MMX_QUANT_DC_START +%assign x 0 %rep 4 - MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 - add eax, byte 8 + MMX_QUANT_1x4 [eax+x], mm6, mm7 +%assign x (x+8) %endrep - ret ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); +; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core16_mmxext - MMX_QUANT_AC_START - +cglobal x264_quant_4x4_mmx + QUANT_AC_START +%assign x 0 %rep 4 - pshufw mm5, [ecx], 10110001b - paddw mm5, [ecx+8] - pshufw mm5, mm5, 10001101b - MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 - add ecx, byte 16 - add eax, byte 8 + MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x] +%assign x (x+8) %endrep - ret ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); +; void x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) ;----------------------------------------------------------------------------- 
-cglobal x264_quant_8x8_core16_mmxext - MMX_QUANT_AC_START - +cglobal x264_quant_8x8_mmx + QUANT_AC_START +%assign x 0 %rep 16 - pshufw mm5, [ecx], 10110001b - paddw mm5, [ecx+8] - pshufw mm5, mm5, 10001101b - MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 - add ecx, byte 16 - add eax, byte 8 + MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x] +%assign x (x+8) %endrep - ret - - -%macro MMX_QUANT32_DC_START 0 - mov eax, [esp+ 4] ; &dct[0][0] - movd mm5, [esp+ 8] ; i_qmf - movd mm6, [esp+12] ; i_qbits - movd mm7, [esp+16] ; f - punpckldq mm5, mm5 ; i_qmf in each dword - punpckldq mm7, mm7 ; f in each dword -%endmacro - -%macro MMXEXT_QUANT32_1x4 5 -;;; %1 (m64) dct[y][x] -;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) -;;; %4 (mmx) i_qbits in the low quadword -;;; %5 (mmx) f as doublewords -;;; trashes mm0-mm4 - movq mm0, %1 ; load dct coeffs - pxor mm4, mm4 - pcmpgtw mm4, mm0 ; sign(mm0) - pxor mm0, mm4 - psubw mm0, mm4 ; abs(mm0) - movq mm1, mm0 - punpcklwd mm0, mm0 ; duplicate the words for the upcomming - punpckhwd mm1, mm1 ; 32 bit multiplication - - movq mm2, mm0 ; like in school ... - movq mm3, mm1 - pmulhuw mm0, %2 ; ... multiply the parts ... - pmulhuw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 ; ... shift ... - pslld mm1, 16 - paddd mm0, mm2 ; ... 
and add them - paddd mm1, mm3 - - paddd mm0, %5 ; round with f - paddd mm1, %5 - psrad mm0, %4 - psrad mm1, %4 - - packssdw mm0, mm1 ; pack to int16_t - pxor mm0, mm4 ; restore sign - psubw mm0, mm4 - movq %1, mm0 ; store -%endmacro - +%macro QUANT_SSE 1 ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], -; int const i_qmf, int const i_qbits, int const f ); +; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_core32_mmxext - MMX_QUANT32_DC_START - MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], -; int const i_qmf, int const i_qbits, int const f ); -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_core32_mmxext - MMX_QUANT32_DC_START - -%rep 4 - MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7 - add eax, byte 8 +cglobal x264_quant_4x4_dc_%1 + SSE2_QUANT_DC_START +%assign x 0 +%rep 2 + QUANT_1x8 [eax+x], xmm6, xmm7 +%assign x (x+16) %endrep - ret ;----------------------------------------------------------------------------- -; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4], -; int const quant_mf[4][4], int const i_qbits, int const f ); +; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_core32_mmxext - MMX_QUANT_AC_START - -%rep 4 - MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, byte 8 - add ecx, byte 16 +cglobal x264_quant_4x4_%1 + QUANT_AC_START +%assign x 0 +%rep 2 + QUANT_1x8 [eax+x], [ecx+x], [edx+x] +%assign x (x+16) %endrep - ret 
;----------------------------------------------------------------------------- -; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8], -; int const quant_mf[8][8], int const i_qbits, int const f ); +; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_core32_mmxext - MMX_QUANT_AC_START - -%rep 16 - MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7 - add eax, byte 8 - add ecx, byte 16 +cglobal x264_quant_8x8_%1 + QUANT_AC_START +%assign x 0 +%rep 8 + QUANT_1x8 [eax+x], [ecx+x], [edx+x] +%assign x (x+16) %endrep - ret +%endmacro + +%define QUANT_1x8 SSE2_QUANT_1x8 +QUANT_SSE sse2 +%ifdef HAVE_SSE3 +%define QUANT_1x8 SSSE3_QUANT_1x8 +QUANT_SSE ssse3 +%endif ;============================================================================= diff --git a/common/i386/quant.h b/common/i386/quant.h index 1d4b51d9..8532fde9 100644 --- a/common/i386/quant.h +++ b/common/i386/quant.h @@ -23,40 +23,16 @@ #ifndef _I386_QUANT_H #define _I386_QUANT_H 1 -void x264_quant_8x8_core15_mmx( int16_t dct[8][8], - int quant_mf[8][8], int const i_qbits, int const f ); -void x264_quant_4x4_core15_mmx( int16_t dct[4][4], - int quant_mf[4][4], int const i_qbits, int const f ); -void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], - int const i_qmf, int const i_qbits, int const f ); -void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], - int const i_qmf, int const i_qbits, int const f ); - -void x264_quant_8x8_core15_ssse3( int16_t dct[8][8], - int quant_mf[8][8], int const i_qbits, int const f ); -void x264_quant_4x4_core15_ssse3( int16_t dct[4][4], - int quant_mf[4][4], int const i_qbits, int const f ); -void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4], - int const i_qmf, int const i_qbits, int const f ); - -void x264_quant_8x8_core16_mmxext( int16_t dct[8][8], - int quant_mf[8][8], int const i_qbits, int const f ); -void 
x264_quant_4x4_core16_mmxext( int16_t dct[4][4], - int quant_mf[4][4], int const i_qbits, int const f ); -void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4], - int const i_qmf, int const i_qbits, int const f ); -void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2], - int const i_qmf, int const i_qbits, int const f ); - -void x264_quant_8x8_core32_mmxext( int16_t dct[8][8], - int quant_mf[8][8], int const i_qbits, int const f ); -void x264_quant_4x4_core32_mmxext( int16_t dct[4][4], - int quant_mf[4][4], int const i_qbits, int const f ); -void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4], - int const i_qmf, int const i_qbits, int const f ); -void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2], - int const i_qmf, int const i_qbits, int const f ); - +void x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias ); +void x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias ); +void x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); +void x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); +void x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias ); +void x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); +void x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); +void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias ); +void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); +void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); diff --git a/common/quant.c b/common/quant.c index 9d0ffee4..1e990cb5 100644 --- a/common/quant.c +++ b/common/quant.c @@ -29,41 +29,41 @@ # include "ppc/quant.h" #endif -#define QUANT_ONE( coef, mf ) \ +#define QUANT_ONE( coef, mf, f ) \ { \ if( (coef) > 0 ) \ - (coef) = 
( f + (coef) * (mf) ) >> i_qbits; \ + (coef) = (f + (coef)) * (mf) >> 16; \ else \ - (coef) = - ( ( f - (coef) * (mf) ) >> i_qbits ); \ + (coef) = - ((f - (coef)) * (mf) >> 16); \ } -static void quant_8x8_core( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ) +static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { int i; for( i = 0; i < 64; i++ ) - QUANT_ONE( dct[0][i], quant_mf[0][i] ); + QUANT_ONE( dct[0][i], mf[i], bias[i] ); } -static void quant_4x4_core( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ) +static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { int i; for( i = 0; i < 16; i++ ) - QUANT_ONE( dct[0][i], quant_mf[0][i] ); + QUANT_ONE( dct[0][i], mf[i], bias[i] ); } -static void quant_4x4_dc_core( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f ) +static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) { int i; for( i = 0; i < 16; i++ ) - QUANT_ONE( dct[0][i], i_quant_mf ); + QUANT_ONE( dct[0][i], mf, bias ); } -static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f ) +static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) { - QUANT_ONE( dct[0][0], i_quant_mf ); - QUANT_ONE( dct[0][1], i_quant_mf ); - QUANT_ONE( dct[0][2], i_quant_mf ); - QUANT_ONE( dct[0][3], i_quant_mf ); + QUANT_ONE( dct[0][0], mf, bias ); + QUANT_ONE( dct[0][1], mf, bias ); + QUANT_ONE( dct[0][2], mf, bias ); + QUANT_ONE( dct[0][3], mf, bias ); } #define DEQUANT_SHL( x ) \ @@ -195,117 +195,47 @@ void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_q void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { - int i, j, maxQ8=0, maxQ4=0, maxQdc=0; - - pf->quant_8x8_core = quant_8x8_core; - pf->quant_4x4_core = quant_4x4_core; - pf->quant_4x4_dc_core = quant_4x4_dc_core; - pf->quant_2x2_dc_core = quant_2x2_dc_core; + pf->quant_8x8 = quant_8x8; + pf->quant_4x4 = quant_4x4; + pf->quant_4x4_dc = quant_4x4_dc; + 
pf->quant_2x2_dc = quant_2x2_dc; pf->dequant_4x4 = dequant_4x4; pf->dequant_8x8 = dequant_8x8; - /* determine the biggest coefficient in all quant8_mf tables */ - for( j = 0; j < 2; j++ ) - for( i = 0; i < 6*8*8; i++ ) - { - int q = h->quant8_mf[j][0][0][i]; - if( maxQ8 < q ) - maxQ8 = q; - } - - /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 ) - and the biggest DC coefficient if all quant4_mf tables ( maxQdc ) */ - for( j = 0; j < 4; j++ ) - for( i = 0; i < 6*4*4; i++ ) - { - int q = h->quant4_mf[j][0][0][i]; - if( maxQ4 < q ) - maxQ4 = q; - if( maxQdc < q && i%16 == 0 ) - maxQdc = q; - } - #ifdef HAVE_MMX - - /* select quant_8x8 based on CPU and maxQ8 */ -#if defined(ARCH_X86_64) && defined(HAVE_SSE3) - if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 ) - pf->quant_8x8_core = x264_quant_8x8_core15_ssse3; - else + if( cpu&X264_CPU_MMX ) + { +#ifdef ARCH_X86 + pf->quant_4x4 = x264_quant_4x4_mmx; + pf->quant_8x8 = x264_quant_8x8_mmx; #endif - if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX ) - pf->quant_8x8_core = x264_quant_8x8_core15_mmx; - else - if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT ) - pf->quant_8x8_core = x264_quant_8x8_core16_mmxext; - else - if( cpu&X264_CPU_MMXEXT ) - pf->quant_8x8_core = x264_quant_8x8_core32_mmxext; + pf->dequant_4x4 = x264_dequant_4x4_mmx; + pf->dequant_8x8 = x264_dequant_8x8_mmx; + } - /* select quant_4x4 based on CPU and maxQ4 */ -#if defined(ARCH_X86_64) && defined(HAVE_SSE3) - if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 ) - pf->quant_4x4_core = x264_quant_4x4_core15_ssse3; - else -#endif - if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX ) - pf->quant_4x4_core = x264_quant_4x4_core15_mmx; - else - if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT ) - pf->quant_4x4_core = x264_quant_4x4_core16_mmxext; - else if( cpu&X264_CPU_MMXEXT ) - pf->quant_4x4_core = x264_quant_4x4_core32_mmxext; - - /* select quant_XxX_dc based on CPU and maxQdc */ - if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT ) - { - pf->quant_4x4_dc_core = 
x264_quant_4x4_dc_core16_mmxext; - pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext; - } - else - if( maxQdc < (1<<15) && cpu&X264_CPU_MMX ) { - pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx; - pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx; + pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext; +#ifdef ARCH_X86 + pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext; +#endif } - else - if( cpu&X264_CPU_MMXEXT ) + + if( cpu&X264_CPU_SSE2 ) { - pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext; - pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext; + pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; + pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_8x8 = x264_quant_8x8_sse2; } - -#if defined(ARCH_X86_64) && defined(HAVE_SSE3) - if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 ) - pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3; #endif - if( cpu&X264_CPU_MMX ) +#ifdef HAVE_SSE3 + if( cpu&X264_CPU_SSSE3 ) { - /* dequant is not subject to the above CQM-dependent overflow issues, - * as long as the inputs are in the range generable by dct+quant. 
- * that is not guaranteed by the standard, but is true within x264 */ - pf->dequant_4x4 = x264_dequant_4x4_mmx; - pf->dequant_8x8 = x264_dequant_8x8_mmx; - } -#endif /* HAVE_MMX */ - -#ifdef ARCH_PPC - if( cpu&X264_CPU_ALTIVEC ) { - if( maxQ8 < (1<<16) ) - { - pf->quant_8x8_core = x264_quant_8x8_altivec; - } - if( maxQ4 < (1<<16) ) - { - pf->quant_4x4_core = x264_quant_4x4_altivec; - } - if( maxQdc < (1<<16) ) - { - pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec; - } + pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; + pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_8x8 = x264_quant_8x8_ssse3; } -#endif /* ARCH_PPC */ +#endif } diff --git a/common/quant.h b/common/quant.h index 3294df59..0fe7d0c9 100644 --- a/common/quant.h +++ b/common/quant.h @@ -25,10 +25,10 @@ typedef struct { - void (*quant_8x8_core)( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f ); - void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f ); - void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f ); - void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f ); + void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); + void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); + void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias ); + void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias ); void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); diff --git a/common/set.c b/common/set.c index f6dc7c3d..9ae8aa5d 100644 --- a/common/set.c +++ b/common/set.c @@ -24,6 +24,9 @@ #include #include +#define SHIFT(x,s) ((s)<0 ? (x)<<-(s) : (s)==0 ? 
(x) : ((x)+(1<<((s)-1)))>>(s)) +#define DIV(n,d) (((n) + ((d)>>1)) / (d)) + static const int dequant4_scale[6][3] = { { 10, 13, 16 }, @@ -66,13 +69,19 @@ static const int quant8_scale[6][6] = { 7282, 6428, 11570, 6830, 9118, 8640 } }; -void x264_cqm_init( x264_t *h ) +int x264_cqm_init( x264_t *h ) { int def_quant4[6][16]; int def_quant8[6][64]; int def_dequant4[6][16]; int def_dequant8[6][64]; + int quant4_mf[4][6][4][4]; + int quant8_mf[2][6][8][8]; int q, i, j, i_list; + int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1], + 32 - h->param.analyse.i_luma_deadzone[0], + 32 - 11, 32 - 21 }; + int max_qp_err = -1; for( i = 0; i < 6; i++ ) { @@ -88,10 +97,19 @@ void x264_cqm_init( x264_t *h ) } else { - h-> quant4_mf[i] = x264_malloc( 6*size*sizeof(int) ); + h-> quant4_mf[i] = x264_malloc(52*size*sizeof(uint16_t) ); h->dequant4_mf[i] = x264_malloc( 6*size*sizeof(int) ); h->unquant4_mf[i] = x264_malloc(52*size*sizeof(int) ); } + + for( j = (i<4 ? 0 : 4); j < i; j++ ) + if( deadzone[j&3] == deadzone[i&3] && + !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) + break; + if( j < i ) + h->quant4_bias[i] = h->quant4_bias[j]; + else + h->quant4_bias[i] = x264_malloc(52*size*sizeof(uint16_t) ); } for( q = 0; q < 6; q++ ) @@ -116,24 +134,47 @@ void x264_cqm_init( x264_t *h ) for( i = 0; i < 16; i++ ) { h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i]; - h-> quant4_mf[i_list][q][0][i] = def_quant4[q][i] * 16 / h->pps->scaling_list[i_list][i]; + quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]); } for( i_list = 0; i_list < 2; i_list++ ) for( i = 0; i < 64; i++ ) { h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i]; - h-> quant8_mf[i_list][q][0][i] = def_quant8[q][i] * 16 / h->pps->scaling_list[4+i_list][i]; + quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]); } } for( q = 0; q < 52; 
q++ ) { for( i_list = 0; i_list < 4; i_list++ ) for( i = 0; i < 16; i++ ) - h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / h->quant4_mf[i_list][q%6][0][i]; + { + h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i]; + h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1); + // round to nearest, unless that would cause the deadzone to be negative + h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); + if( j > 0xffff && q > max_qp_err ) + max_qp_err = q; + } + if( h->param.analyse.b_transform_8x8 ) for( i_list = 0; i_list < 2; i_list++ ) for( i = 0; i < 64; i++ ) - h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / h->quant8_mf[i_list][q%6][0][i]; + { + h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i]; + h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6); + h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); + if( j > 0xffff && q > max_qp_err ) + max_qp_err = q; + } } + + if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min ) + { + x264_log( h, X264_LOG_ERROR, "Quantization overflow.\n" ); + x264_log( h, X264_LOG_ERROR, "Your CQM is incompatible with QP < %d, but min QP is set to %d\n", + max_qp_err+1, h->param.rc.i_qp_min ); + return -1; + } + return 0; } void x264_cqm_delete( x264_t *h ) diff --git a/common/set.h b/common/set.h index 86de444a..a018e7a0 100644 --- a/common/set.h +++ b/common/set.h @@ -218,7 +218,7 @@ static const uint8_t * const x264_cqm_jvt[6] = x264_cqm_jvt8i, x264_cqm_jvt8p }; -void x264_cqm_init( x264_t *h ); +int x264_cqm_init( x264_t *h ); void x264_cqm_delete( x264_t *h ); int x264_cqm_parse_file( x264_t *h, const char *filename ); diff --git a/encoder/encoder.c b/encoder/encoder.c index e82445f9..d76994cd 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -410,6 +410,14 @@ static int x264_validate_parameters( x264_t *h ) 
h->param.analyse.i_noise_reduction = 0; h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 ); } + if( h->param.rc.i_rc_method == X264_RC_CQP ) + { + float qp_p = h->param.rc.i_qp_constant; + float qp_i = qp_p - 6*log(h->param.rc.f_ip_factor)/log(2); + float qp_b = qp_p + 6*log(h->param.rc.f_pb_factor)/log(2); + h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 ); + h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 ); + } if( ( h->param.i_width % 16 || h->param.i_height % 16 ) && !h->mb.b_lossless ) { @@ -438,8 +446,6 @@ static int x264_validate_parameters( x264_t *h ) h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 ); h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 ); h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 ); - h->mb.i_luma_deadzone[0] = 32 - h->param.analyse.i_luma_deadzone[0]; - h->mb.i_luma_deadzone[1] = 32 - h->param.analyse.i_luma_deadzone[1]; h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 ); @@ -625,7 +631,11 @@ x264_t *x264_encoder_open ( x264_param_t *param ) x264_validate_levels( h ); - x264_cqm_init( h ); + if( x264_cqm_init( h ) < 0 ) + { + x264_free( h ); + return NULL; + } h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height; diff --git a/encoder/macroblock.c b/encoder/macroblock.c index be0ee357..ec7e9b80 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -38,42 +38,6 @@ static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] ) } #undef ZIG -static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra ) -{ - const int i_qbits = 16 + i_qscale / 6; - const int i_mf = i_qscale % 6; - const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6); - h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f ); -} -static 
void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra ) -{ - const int i_qbits = 15 + i_qscale / 6; - const int i_mf = i_qscale % 6; - const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6); - h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f ); -} -static void quant_4x4_chroma( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra ) -{ - const int i_qbits = 15 + i_qscale / 6; - const int i_mf = i_qscale % 6; - const int f = ( 1 << (i_qbits + b_intra) ) / 6; - h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f ); -} -static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale ) -{ - const int i_qbits = 16 + i_qscale / 6; - const int i_mf = i_qscale % 6; - const int f = h->mb.i_luma_deadzone[1] << (i_qbits-6); - h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f ); -} -static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra ) -{ - const int i_qbits = 16 + i_qscale / 6; - const int i_mf = i_qscale % 6; - const int f = ( 1 << (i_qbits + b_intra) ) / 6; - h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f ); -} - /* (ref: JVT-B118) * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs * to 0 (low score means set it to null) @@ -137,7 +101,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale ) if( h->mb.b_trellis ) x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 ); else - quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 ); + h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] ); h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale ); @@ -159,7 +123,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale ) if( h->mb.b_trellis ) x264_quant_8x8_trellis( h, dct8x8, 
CQM_8IY, i_qscale, 1 ); else - quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 ); + h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 ); h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale ); @@ -199,14 +163,14 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) if( h->mb.b_trellis ) x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 ); else - quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 ); + h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] ); h->zigzagf.scan_4x4ac( h->dct.block[i].residual_ac, dct4x4[1+i] ); h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale ); } h->dctf.dct4x4dc( dct4x4[0] ); - quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale ); + h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] ); /* output samples to fdec */ @@ -258,7 +222,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0]; /* no trellis; it doesn't seem to help chroma noticeably */ - quant_4x4_chroma( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter ); + h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] ); h->zigzagf.scan_4x4ac( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] ); if( b_decimate ) @@ -268,7 +232,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) } h->dctf.dct2x2dc( dct2x2 ); - quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter ); + h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 ); zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], 
dct2x2 ); /* output samples to fdec */ @@ -466,7 +430,7 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.b_trellis ) x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 ); else - quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 ); + h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); @@ -518,7 +482,7 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.b_trellis ) x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 ); else - quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 ); + h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4[idx] ); @@ -673,7 +637,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) { const int idx = i8x8 * 4 + i4x4; - quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 ); + h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); i_decimate_mb += x264_mb_decimate_score( dctscan, 16 ); @@ -709,7 +673,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) dct2x2[1][0] = dct4x4[2][0][0]; dct2x2[1][1] = dct4x4[3][0][0]; h->dctf.dct2x2dc( dct2x2 ); - quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4PC], i_qp, 0 ); + h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ); if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1] ) { /* can't be */ @@ -719,7 +683,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) /* calculate dct coeffs */ for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) { - quant_4x4_chroma( h, dct4x4[i4x4], h->quant4_mf[CQM_4PC], i_qp, 0 ); + h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); h->zigzagf.scan_4x4ac( dctscan, dct4x4[i4x4] ); i_decimate_mb += 
x264_mb_decimate_score( dctscan, 15 ); @@ -811,7 +775,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); - quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 ); + h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] ); h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); if( b_decimate ) @@ -830,10 +794,10 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) int i4; DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 ); - quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 ); - quant_4x4( h, dct4x4[2], h->quant4_mf[CQM_4PY], i_qp, 0 ); - quant_4x4( h, dct4x4[3], h->quant4_mf[CQM_4PY], i_qp, 0 ); + h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); for( i4 = 0; i4 < 4; i4++ ) h->zigzagf.scan_4x4( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] ); @@ -864,7 +828,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - quant_4x4_chroma( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 ); + h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); h->zigzagf.scan_4x4ac( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 ); if( array_non_zero( dct4x4 ) ) { diff --git a/encoder/rdo.c b/encoder/rdo.c index 3c827cb7..d2fa78b1 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -282,7 +282,7 @@ typedef struct { // and uses the dct scaling factors, not the idct ones. 
static void quant_trellis_cabac( x264_t *h, int16_t *dct, - const int *quant_mf, const int *unquant_mf, + const uint16_t *quant_mf, const int *unquant_mf, const int *coef_weight, const int *zigzag, int i_ctxBlockCat, int i_qbits, int i_lambda2, int b_ac, int i_coefs ) { @@ -294,7 +294,7 @@ static void quant_trellis_cabac( x264_t *h, int16_t *dct, uint8_t cabac_state_sig[64]; uint8_t cabac_state_last[64]; const int b_interlaced = h->mb.b_interlaced; - const int f = 1 << (i_qbits-1); // no deadzone + const int f = 1 << 15; // no deadzone int i_last_nnz = -1; int i, j; @@ -359,7 +359,7 @@ static void quant_trellis_cabac( x264_t *h, int16_t *dct, for( i = i_last_nnz; i >= b_ac; i-- ) { int i_coef = abs_coefs[i]; - int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> i_qbits; + int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16; int abs_level; int cost_sig[2], cost_last[2]; trellis_node_t n; @@ -488,7 +488,7 @@ void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, << (2*i_qbits)) >> LAMBDA_BITS; quant_trellis_cabac( h, (int16_t*)dct, - (int*)h->quant4_mf[i_quant_cat][i_mf], h->unquant4_mf[i_quant_cat][i_qp], + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_dct4_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan4[h->mb.b_interlaced], i_ctxBlockCat, 15+i_qbits, i_lambda2, b_ac, 16 ); @@ -505,7 +505,7 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, << (2*i_qbits)) >> LAMBDA_BITS; quant_trellis_cabac( h, (int16_t*)dct, - (int*)h->quant8_mf[i_quant_cat][i_mf], h->unquant8_mf[i_quant_cat][i_qp], + h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], x264_dct8_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan8[h->mb.b_interlaced], DCT_LUMA_8x8, 16+i_qbits, i_lambda2, 0, 64 ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 3152a5af..8e77fbb7 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -478,11 +478,12 @@ static int check_quant( int cpu_ref, int cpu_new ) uint8_t 
cqm_buf[64] __attribute__((__aligned__(16))); int ret = 0, ok, used_asm; int oks[2] = {1,1}, used_asms[2] = {0,0}; - int i, i_cqm; + int i, i_cqm, qp; x264_t h_buf; x264_t *h = &h_buf; h->pps = h->pps_array; x264_param_default( &h->param ); + h->param.rc.i_qp_min = 26; for( i_cqm = 0; i_cqm < 4; i_cqm++ ) { @@ -533,112 +534,74 @@ static int check_quant( int cpu_ref, int cpu_new ) } \ } -#define TEST_QUANT( name, cqm ) \ +#define TEST_QUANT_DC( name, cqm ) \ if( qf_a.name != qf_ref.name ) \ { \ - used_asms[0] = 1; \ - for( i = 0; i < 64; i++ ) \ - dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \ - qf_c.name( (void*)dct1, cqm, 20, (1<<20)/6 ); \ - qf_a.name( (void*)dct2, cqm, 20, (1<<20)/6 ); \ - if( memcmp( dct1, dct2, 64*2 ) ) \ - { \ - oks[0] = 0; \ - fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ - } \ - } - -#define TEST_QUANT8( qname, cqm, shift, divider ) \ - if( qf_a.qname != qf_ref.qname ) \ - { \ - int qp; \ used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - INIT_QUANT8() \ - qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \ - qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \ - if( memcmp( dct1, dct2, 64*2 ) ) \ + for( i = 0; i < 16; i++ ) \ + dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \ + qf_c.name( (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + if( memcmp( dct1, dct2, 16*2 ) ) \ { \ oks[0] = 0; \ - fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \ + fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ break; \ } \ } \ } -#define TEST_QUANT4( qname, cqm, shift, divider ) \ +#define TEST_QUANT( qname, block, w ) \ if( qf_a.qname != qf_ref.qname ) \ { \ - int qp; \ used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - INIT_QUANT4() \ - qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider 
); \ - qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \ - if( memcmp( dct1, dct2, 16*2 ) ) \ + INIT_QUANT##w() \ + qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + if( memcmp( dct1, dct2, w*w*2 ) ) \ { \ oks[0] = 0; \ - fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \ + fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ } \ } - TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8IY], 16, 3 ); - TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8PY], 16, 6 ); - TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4IY], 15, 3 ); - TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4PY], 15, 6 ); - TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] ); - TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] ); - -#define TEST_DEQUANT8( qname, dqname, cqm, dqm, shift, divider ) \ - if( qf_a.dqname != qf_ref.dqname ) \ - { \ - int qp; \ - used_asms[1] = 1; \ - for( qp = 51; qp > 0; qp-- ) \ - { \ - INIT_QUANT8() \ - qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \ - memcpy( dct2, dct1, 64*2 ); \ - qf_c.dqname( (void*)dct1, dqm, qp ); \ - qf_a.dqname( (void*)dct2, dqm, qp ); \ - if( memcmp( dct1, dct2, 64*2 ) ) \ - { \ - oks[1] = 0; \ - fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \ - break; \ - } \ - } \ - } + TEST_QUANT( quant_8x8, CQM_8IY, 8 ); + TEST_QUANT( quant_8x8, CQM_8PY, 8 ); + TEST_QUANT( quant_4x4, CQM_4IY, 4 ); + TEST_QUANT( quant_4x4, CQM_4PY, 4 ); + TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] ); + TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] ); -#define TEST_DEQUANT4( qname, dqname, cqm, dqm, shift, divider ) \ +#define TEST_DEQUANT( qname, dqname, block, w ) \ if( qf_a.dqname != qf_ref.dqname ) \ { \ - int qp; \ used_asms[1] = 1; 
\ for( qp = 51; qp > 0; qp-- ) \ { \ - INIT_QUANT4() \ - qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \ - memcpy( dct2, dct1, 16*2 ); \ - qf_c.dqname( (void*)dct1, dqm, qp ); \ - qf_a.dqname( (void*)dct2, dqm, qp ); \ - if( memcmp( dct1, dct2, 16*2 ) ) \ + INIT_QUANT##w() \ + qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + memcpy( dct2, dct1, w*w*2 ); \ + qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \ + qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \ + if( memcmp( dct1, dct2, w*w*2 ) ) \ { \ oks[1] = 0; \ - fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \ + fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ } \ } - TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8IY], h->dequant8_mf[CQM_8IY], 16, 3 ); - TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8PY], h->dequant8_mf[CQM_8PY], 16, 6 ); - TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4IY], h->dequant4_mf[CQM_4IY], 15, 3 ); - TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4PY], h->dequant4_mf[CQM_4PY], 15, 6 ); + TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 ); + TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 ); + TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 ); + TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 ); } ok = oks[0]; used_asm = used_asms[0];