;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;* Christian Heine <sennindemokrit@gmx.net>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
-;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
-;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
-;* *
-;*****************************************************************************
-
BITS 64
%include "amd64inc.asm"
SECTION .text
-%macro MMX_QUANT_AC_START 0
-; mov rdi, rdi ; &dct[0][0]
-; mov rsi, rsi ; &quant_mf[0][0]
- movd mm6, parm3d ; i_qbits
- movd mm7, parm4d ; f
- punpckldq mm7, mm7 ; f in each dword
+%macro MMX_QUANT_DC_START 0
+ movd mm6, parm2d ; mf
+ movd mm7, parm3d ; bias
+ pshufw mm6, mm6, 0
+ pshufw mm7, mm7, 0
%endmacro
-%macro MMX_QUANT15_DC_START 0
-; mov rdi, rdi ; &dct[0][0]
- movd mm5, parm2d ; i_qmf
- movd mm6, parm3d ; i_qbits
- movd mm7, parm4d ; f
- punpcklwd mm5, mm5
- punpcklwd mm5, mm5 ; i_qmf in each word
- punpckldq mm7, mm7 ; f in each dword
+%macro SSE2_QUANT_DC_START 0
+ movd xmm6, parm2d ; mf
+ movd xmm7, parm3d ; bias
+ pshuflw xmm6, xmm6, 0
+ pshuflw xmm7, xmm7, 0
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm7, xmm7
%endmacro
-%macro SSE2_QUANT_AC_START 0
- movd xmm6, parm3d ; i_qbits
- movd xmm7, parm4d ; f
- pshufd xmm7, xmm7, 0 ; f in each dword
+%macro QUANT_ONE 5
+;;; %1 mov suffix (q for mmx, dqa for sse2)
+;;; %2 register prefix (m for mmx, xm for sse2)
+;;; %3 (m64/m128) dct[y][x]
+;;; %4 (m64/m128 or reg) mf[y][x] or mf[0][0] (as uint16_t)
+;;; %5 (m64/m128 or reg) bias[y][x] or bias[0][0] (as uint16_t)
+
+ mov%1 %2m0, %3 ; load dct coeffs
+ pxor %2m1, %2m1
+ pcmpgtw %2m1, %2m0 ; sign(coeff)
+ pxor %2m0, %2m1
+ psubw %2m0, %2m1 ; abs(coeff)
+ paddusw %2m0, %5 ; round
+ pmulhuw %2m0, %4 ; divide
+ pxor %2m0, %2m1 ; restore sign
+ psubw %2m0, %2m1
+ mov%1 %3, %2m0 ; store
%endmacro
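+; Per coefficient this computes:
+;   dct[i] = sign(dct[i]) * ((abs(dct[i]) + bias[i]) * mf[i] >> 16)
+; mf and bias are precomputed per QP in x264_cqm_init (further down in this
+; patch), so a saturating add plus one pmulhuw replaces the old widening
+; multiply/add/shift sequence.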
-
-%macro SSE2_QUANT15_DC_START 0
- movd xmm5, parm2d ; i_qmf
- movd xmm6, parm3d ; i_qbits
- movd xmm7, parm4d ; f
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm5, xmm5 ; i_qmf in each word
- pshufd xmm7, xmm7, 0 ; f in each dword
+%macro MMX_QUANT_1x4 3
+ QUANT_ONE q, m, %1, %2, %3
%endmacro
-
-%macro MMX_QUANT15_1x4 4
-;;; %1 (m64) dct[y][x]
-;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %3 (mmx) i_qbits in the low doubleword
-;;; %4 (mmx) f as doublewords
-;;; trashes mm0-mm2,mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(coeff)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(coeff)
-
- movq mm2, mm0
- pmullw mm0, %2
- pmulhw mm2, %2
-
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
-
- paddd mm0, %4 ; round with f
- paddd mm1, %4
- psrad mm0, %3
- psrad mm1, %3
-
- packssdw mm0, mm1 ; pack
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
+%macro SSE2_QUANT_1x8 3
+ QUANT_ONE dqa, xm, %1, %2, %3
%endmacro
-%macro SSSE3_QUANT15_1x8 4
+%macro SSSE3_QUANT_1x8 3
movdqa xmm0, %1 ; load dct coeffs
- movdqa xmm4, xmm0 ; save sign
+ movdqa xmm1, xmm0 ; save sign
pabsw xmm0, xmm0
-
- movdqa xmm2, xmm0
- pmullw xmm0, %2
- pmulhw xmm2, %2
-
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2
- punpckhwd xmm1, xmm2
-
- paddd xmm0, %4 ; round with f
- paddd xmm1, %4
- psrad xmm0, %3
- psrad xmm1, %3
-
- packssdw xmm0, xmm1 ; pack
- psignw xmm0, xmm4 ; restore sign
+ paddusw xmm0, %3 ; round
+ pmulhuw xmm0, %2 ; divide
+ psignw xmm0, xmm1 ; restore sign
movdqa %1, xmm0 ; store
%endmacro
;-----------------------------------------------------------------------------
-; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core15_mmx
- MMX_QUANT15_DC_START
- MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_mmx
- MMX_QUANT15_DC_START
-
-%rep 4
- MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
- add parm1q, byte 8
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_mmx
- MMX_QUANT_AC_START
-
-%rep 4
- movq mm5, [parm2q]
- packssdw mm5, [parm2q+8]
- MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
- add parm2q, byte 16
- add parm1q, byte 8
-%endrep
-
+cglobal x264_quant_2x2_dc_mmxext
+ MMX_QUANT_DC_START
+ MMX_QUANT_1x4 [parm1q], mm6, mm7
ret
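+; QUANT_SSE emits x264_quant_4x4_dc_%1, x264_quant_4x4_%1 and x264_quant_8x8_%1
+; for one instruction set; it is instantiated twice below, with QUANT_1x8
+; %defined first to the SSE2 kernel and then to the SSSE3 kernel.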
+%macro QUANT_SSE 1
;-----------------------------------------------------------------------------
-; void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_mmx
- MMX_QUANT_AC_START
-
-%rep 16
- movq mm5, [parm2q]
- packssdw mm5, [parm2q+8]
- MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
- add parm2q, byte 16
- add parm1q, byte 8
+cglobal x264_quant_4x4_dc_%1
+ SSE2_QUANT_DC_START
+%assign x 0
+%rep 2
+ QUANT_1x8 [parm1q+x], xmm6, xmm7
+%assign x (x+16)
%endrep
-
- ret
-
-%ifdef HAVE_SSE3
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_ssse3
- SSE2_QUANT15_DC_START
- SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
- SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
ret
;-----------------------------------------------------------------------------
-; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_ssse3
- SSE2_QUANT_AC_START
+cglobal x264_quant_4x4_%1
%assign x 0
%rep 2
- movdqa xmm5, [parm2q+32*x]
- packssdw xmm5, [parm2q+32*x+16]
- SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
- %assign x x+1
+ QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
+%assign x (x+16)
%endrep
ret
;-----------------------------------------------------------------------------
-; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_ssse3
- SSE2_QUANT_AC_START
+cglobal x264_quant_8x8_%1
%assign x 0
%rep 8
- movdqa xmm5, [parm2q+32*x]
- packssdw xmm5, [parm2q+32*x+16]
- SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
- %assign x x+1
-%endrep
- ret
-%endif ; HAVE_SSE3
-
-
-; ============================================================================
-
-%macro MMXEXT_QUANT16_DC_START 0
-; mov rdi, rdi ; &dct[0][0]
- movd mm5, parm2d ; i_qmf
- movd mm6, parm3d ; i_qbits
- movd mm7, parm4d ; f
- pshufw mm5, mm5, 0 ; i_qmf in each word
- punpckldq mm7, mm7 ; f in each dword
-%endmacro
-
-%macro MMXEXT_QUANT16_1x4 4
-;;; %1 (m64) dct[y][x]
-;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
-;;; %3 (mmx) i_qbits in the low doubleword
-;;; %4 (mmx) f as doublewords
-;;; trashes mm0-mm2,mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(coeff)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(coeff)
-
- movq mm2, mm0
- pmullw mm0, %2
- pmulhuw mm2, %2
-
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
-
- paddd mm0, %4 ; round with f
- paddd mm1, %4
- psrad mm0, %3
- psrad mm1, %3
-
- packssdw mm0, mm1 ; pack
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core16_mmxext
- MMXEXT_QUANT16_DC_START
- MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core16_mmxext
- MMXEXT_QUANT16_DC_START
-
-%rep 4
- MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
- add parm1q, byte 8
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core16_mmxext
- MMX_QUANT_AC_START
-
-%rep 4
- pshufw mm5, [parm2q], 10110001b
- paddw mm5, [parm2q+8]
- pshufw mm5, mm5, 10001101b
- MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
- add parm2q, byte 16
- add parm1q, byte 8
+ QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
+%assign x (x+16)
%endrep
-
ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core16_mmxext
- MMX_QUANT_AC_START
-
-%rep 16
- pshufw mm5, [parm2q], 10110001b
- paddw mm5, [parm2q+8]
- pshufw mm5, mm5, 10001101b
- MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
- add parm2q, byte 16
- add parm1q, byte 8
-%endrep
-
- ret
-
-
-
-%macro MMX_QUANT32_DC_START 0
-; mov rdi, rdi ; &dct[0][0]
- movd mm5, parm2d ; i_qmf
- movd mm6, parm3d ; i_qbits
- movd mm7, parm4d ; f
- punpckldq mm5, mm5 ; i_qmf in each dword
- punpckldq mm7, mm7 ; f in each dword
%endmacro
-%macro MMXEXT_QUANT32_1x4 5
-;;; %1 (m64) dct[y][x]
-;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %4 (mmx) i_qbits in the low quadword
-;;; %5 (mmx) f as doublewords
-;;; trashes mm0-mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(mm0)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(mm0)
- movq mm1, mm0
- punpcklwd mm0, mm0 ; duplicate the words for the upcomming
- punpckhwd mm1, mm1 ; 32 bit multiplication
-
- movq mm2, mm0 ; like in school ...
- movq mm3, mm1
- pmulhuw mm0, %2 ; ... multiply the parts ...
- pmulhuw mm1, %3
- pmullw mm2, %2
- pmullw mm3, %3
- pslld mm0, 16 ; ... shift ...
- pslld mm1, 16
- paddd mm0, mm2 ; ... and add them
- paddd mm1, mm3
-
- paddd mm0, %5 ; round with f
- paddd mm1, %5
- psrad mm0, %4
- psrad mm1, %4
-
- packssdw mm0, mm1 ; pack to int16_t
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core32_mmxext
- MMX_QUANT32_DC_START
- MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core32_mmxext
- MMX_QUANT32_DC_START
-
-%rep 4
- MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
- add parm1q, byte 8
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core32_mmxext
- MMX_QUANT_AC_START
-
-%rep 4
- MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
- add parm1q, byte 8
- add parm2q, byte 16
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core32_mmxext
- MMX_QUANT_AC_START
-
-%rep 16
- MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
- add parm1q, byte 8
- add parm2q, byte 16
-%endrep
+%define QUANT_1x8 SSE2_QUANT_1x8
+QUANT_SSE sse2
+%ifdef HAVE_SSE3
+%define QUANT_1x8 SSSE3_QUANT_1x8
+QUANT_SSE ssse3
+%endif
- ret
;=============================================================================
int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
- int (*quant4_mf[4])[4][4]; /* [4][6][4][4] */
- int (*quant8_mf[2])[8][8]; /* [2][6][8][8] */
int (*unquant4_mf[4])[16]; /* [4][52][16] */
int (*unquant8_mf[2])[64]; /* [2][52][64] */
+ uint16_t (*quant4_mf[4])[16]; /* [4][52][16] */
+ uint16_t (*quant8_mf[2])[64]; /* [2][52][64] */
+ uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
+ uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
uint32_t nr_residual_sum[2][64];
uint32_t nr_offset[2][64];
int b_interlaced;
- /* Inverted luma quantization deadzone */
- int i_luma_deadzone[2]; // {inter, intra}
-
/* Allowed qpel MV range to stay within the picture + emulated edge pixels */
int mv_min[2];
int mv_max[2];
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;* Christian Heine <sennindemokrit@gmx.net>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
-;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
-;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
-;* *
-;*****************************************************************************
-
BITS 32
%include "i386inc.asm"
SECTION .text
-%macro MMX_QUANT_AC_START 0
- mov eax, [esp+ 4] ; &dct[0][0]
- mov ecx, [esp+ 8] ; &quant_mf[0][0]
- movd mm6, [esp+12] ; i_qbits
- movd mm7, [esp+16] ; f
- punpckldq mm7, mm7 ; f in each dword
+%macro QUANT_AC_START 0
+ mov eax, [esp+ 4] ; dct
+ mov ecx, [esp+ 8] ; mf
+ mov edx, [esp+12] ; bias
%endmacro
-%macro MMX_QUANT15_DC_START 0
- mov eax, [esp+ 4] ; &dct[0][0]
- movd mm5, [esp+ 8] ; i_qmf
- movd mm6, [esp+12] ; i_qbits
- movd mm7, [esp+16] ; f
- punpcklwd mm5, mm5
- punpcklwd mm5, mm5 ; i_qmf in each word
- punpckldq mm7, mm7 ; f in each dword
+%macro MMX_QUANT_DC_START 0
+ mov eax, [esp+ 4] ; dct
+ movd mm6, [esp+ 8] ; mf
+ movd mm7, [esp+12] ; bias
+ pshufw mm6, mm6, 0
+ pshufw mm7, mm7, 0
%endmacro
-%macro MMX_QUANT15_1x4 4
-;;; %1 (m64) dct[y][x]
-;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %3 (mmx) i_qbits in the low doubleword
-;;; %4 (mmx) f as doublewords
-;;; trashes mm0-mm2,mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(coeff)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(coeff)
-
- movq mm2, mm0
- pmullw mm0, %2
- pmulhw mm2, %2
-
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
-
- paddd mm0, %4 ; round with f
- paddd mm1, %4
- psrad mm0, %3
- psrad mm1, %3
-
- packssdw mm0, mm1 ; pack
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
+%macro SSE2_QUANT_DC_START 0
+ mov eax, [esp+ 4] ; dct
+ movd xmm6, [esp+ 8] ; mf
+ movd xmm7, [esp+12] ; bias
+ pshuflw xmm6, xmm6, 0
+ pshuflw xmm7, xmm7, 0
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm7, xmm7
%endmacro
-;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core15_mmx
- MMX_QUANT15_DC_START
- MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_mmx
- MMX_QUANT15_DC_START
-
-%rep 4
- MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
- add eax, byte 8
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_mmx
- MMX_QUANT_AC_START
-
-%rep 4
- movq mm5, [ecx]
- packssdw mm5, [ecx+8]
- MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
- add ecx, byte 16
- add eax, byte 8
-%endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_mmx
- MMX_QUANT_AC_START
-
-%rep 16
- movq mm5, [ecx]
- packssdw mm5, [ecx+8]
- MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
- add ecx, byte 16
- add eax, byte 8
-%endrep
-
- ret
-
-; ============================================================================
-
-%macro MMXEXT_QUANT16_DC_START 0
- mov eax, [esp+ 4] ; &dct[0][0]
- movd mm5, [esp+ 8] ; i_qmf
- movd mm6, [esp+12] ; i_qbits
- movd mm7, [esp+16] ; f
- pshufw mm5, mm5, 0 ; i_qmf in each word
- punpckldq mm7, mm7 ; f in each dword
+%macro QUANT_ONE 5
+;;; %1 mov suffix (q for mmx, dqa for sse2)
+;;; %2 register prefix (m for mmx, xm for sse2)
+;;; %3 (m64/m128) dct[y][x]
+;;; %4 (m64/m128 or reg) mf[y][x] or mf[0][0] (as uint16_t)
+;;; %5 (m64/m128 or reg) bias[y][x] or bias[0][0] (as uint16_t)
+
+ mov%1 %2m0, %3 ; load dct coeffs
+ pxor %2m1, %2m1
+ pcmpgtw %2m1, %2m0 ; sign(coeff)
+ pxor %2m0, %2m1
+ psubw %2m0, %2m1 ; abs(coeff)
+ paddusw %2m0, %5 ; round
+ pmulhuw %2m0, %4 ; divide
+ pxor %2m0, %2m1 ; restore sign
+ psubw %2m0, %2m1
+ mov%1 %3, %2m0 ; store
+%endmacro
+%macro MMX_QUANT_1x4 3
+ QUANT_ONE q, m, %1, %2, %3
+%endmacro
+%macro SSE2_QUANT_1x8 3
+ QUANT_ONE dqa, xm, %1, %2, %3
%endmacro
-%macro MMXEXT_QUANT16_1x4 4
-;;; %1 (m64) dct[y][x]
-;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
-;;; %3 (mmx) i_qbits in the low doubleword
-;;; %4 (mmx) f as doublewords
-;;; trashes mm0-mm2,mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(coeff)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(coeff)
-
- movq mm2, mm0
- pmullw mm0, %2
- pmulhuw mm2, %2
-
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
-
- paddd mm0, %4 ; round with f
- paddd mm1, %4
- psrad mm0, %3
- psrad mm1, %3
-
- packssdw mm0, mm1 ; pack
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
+%macro SSSE3_QUANT_1x8 3
+ movdqa xmm0, %1 ; load dct coeffs
+ movdqa xmm1, xmm0 ; save sign
+ pabsw xmm0, xmm0
+ paddusw xmm0, %3 ; round
+ pmulhuw xmm0, %2 ; divide
+ psignw xmm0, xmm1 ; restore sign
+ movdqa %1, xmm0 ; store
%endmacro
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core16_mmxext
- MMXEXT_QUANT16_DC_START
- MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+cglobal x264_quant_2x2_dc_mmxext
+ MMX_QUANT_DC_START
+ MMX_QUANT_1x4 [eax], mm6, mm7
ret
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core16_mmxext
- MMXEXT_QUANT16_DC_START
-
+cglobal x264_quant_4x4_dc_mmxext
+ MMX_QUANT_DC_START
+%assign x 0
%rep 4
- MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
- add eax, byte 8
+ MMX_QUANT_1x4 [eax+x], mm6, mm7
+%assign x (x+8)
%endrep
-
ret
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core16_mmxext
- MMX_QUANT_AC_START
-
+cglobal x264_quant_4x4_mmx
+ QUANT_AC_START
+%assign x 0
%rep 4
- pshufw mm5, [ecx], 10110001b
- paddw mm5, [ecx+8]
- pshufw mm5, mm5, 10001101b
- MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
- add ecx, byte 16
- add eax, byte 8
+ MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
+%assign x (x+8)
%endrep
-
ret
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core16_mmxext
- MMX_QUANT_AC_START
-
+cglobal x264_quant_8x8_mmx
+ QUANT_AC_START
+%assign x 0
%rep 16
- pshufw mm5, [ecx], 10110001b
- paddw mm5, [ecx+8]
- pshufw mm5, mm5, 10001101b
- MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
- add ecx, byte 16
- add eax, byte 8
+ MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
+%assign x (x+8)
%endrep
-
ret
-
-
-%macro MMX_QUANT32_DC_START 0
- mov eax, [esp+ 4] ; &dct[0][0]
- movd mm5, [esp+ 8] ; i_qmf
- movd mm6, [esp+12] ; i_qbits
- movd mm7, [esp+16] ; f
- punpckldq mm5, mm5 ; i_qmf in each dword
- punpckldq mm7, mm7 ; f in each dword
-%endmacro
-
-%macro MMXEXT_QUANT32_1x4 5
-;;; %1 (m64) dct[y][x]
-;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %4 (mmx) i_qbits in the low quadword
-;;; %5 (mmx) f as doublewords
-;;; trashes mm0-mm4
- movq mm0, %1 ; load dct coeffs
- pxor mm4, mm4
- pcmpgtw mm4, mm0 ; sign(mm0)
- pxor mm0, mm4
- psubw mm0, mm4 ; abs(mm0)
- movq mm1, mm0
- punpcklwd mm0, mm0 ; duplicate the words for the upcomming
- punpckhwd mm1, mm1 ; 32 bit multiplication
-
- movq mm2, mm0 ; like in school ...
- movq mm3, mm1
- pmulhuw mm0, %2 ; ... multiply the parts ...
- pmulhuw mm1, %3
- pmullw mm2, %2
- pmullw mm3, %3
- pslld mm0, 16 ; ... shift ...
- pslld mm1, 16
- paddd mm0, mm2 ; ... and add them
- paddd mm1, mm3
-
- paddd mm0, %5 ; round with f
- paddd mm1, %5
- psrad mm0, %4
- psrad mm1, %4
-
- packssdw mm0, mm1 ; pack to int16_t
- pxor mm0, mm4 ; restore sign
- psubw mm0, mm4
- movq %1, mm0 ; store
-%endmacro
-
+%macro QUANT_SSE 1
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
-; int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core32_mmxext
- MMX_QUANT32_DC_START
- MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
-; int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core32_mmxext
- MMX_QUANT32_DC_START
-
-%rep 4
- MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
- add eax, byte 8
+cglobal x264_quant_4x4_dc_%1
+ SSE2_QUANT_DC_START
+%assign x 0
+%rep 2
+ QUANT_1x8 [eax+x], xmm6, xmm7
+%assign x (x+16)
%endrep
-
ret
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
-; int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core32_mmxext
- MMX_QUANT_AC_START
-
-%rep 4
- MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
- add eax, byte 8
- add ecx, byte 16
+cglobal x264_quant_4x4_%1
+ QUANT_AC_START
+%assign x 0
+%rep 2
+ QUANT_1x8 [eax+x], [ecx+x], [edx+x]
+%assign x (x+16)
%endrep
-
ret
;-----------------------------------------------------------------------------
-; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
-; int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core32_mmxext
- MMX_QUANT_AC_START
-
-%rep 16
- MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
- add eax, byte 8
- add ecx, byte 16
+cglobal x264_quant_8x8_%1
+ QUANT_AC_START
+%assign x 0
+%rep 8
+ QUANT_1x8 [eax+x], [ecx+x], [edx+x]
+%assign x (x+16)
%endrep
-
ret
+%endmacro
+
+%define QUANT_1x8 SSE2_QUANT_1x8
+QUANT_SSE sse2
+%ifdef HAVE_SSE3
+%define QUANT_1x8 SSSE3_QUANT_1x8
+QUANT_SSE ssse3
+%endif
;=============================================================================
#ifndef _I386_QUANT_H
#define _I386_QUANT_H 1
-void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
- int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
- int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
- int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
- int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
- int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
- int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
- int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
- int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
- int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
- int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
- int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
- int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
- int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
- int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
- int const i_qmf, int const i_qbits, int const f );
-
+void x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
+void x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
+void x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+void x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
+void x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
+void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
# include "ppc/quant.h"
#endif
-#define QUANT_ONE( coef, mf ) \
+#define QUANT_ONE( coef, mf, f ) \
{ \
if( (coef) > 0 ) \
- (coef) = ( f + (coef) * (mf) ) >> i_qbits; \
+ (coef) = (f + (coef)) * (mf) >> 16; \
else \
- (coef) = - ( ( f - (coef) * (mf) ) >> i_qbits ); \
+ (coef) = - ((f - (coef)) * (mf) >> 16); \
}
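+/* Example of the fixed-point math (illustrative numbers, not from a real CQM):
+ * mf = 8192 corresponds to a quantization step of 2^16/8192 = 8, and f = 4 is
+ * a deadzone offset of half a step, so coef = 100 quantizes to
+ * (4 + 100) * 8192 >> 16 = 13. */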
-static void quant_8x8_core( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
+static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
{
int i;
for( i = 0; i < 64; i++ )
- QUANT_ONE( dct[0][i], quant_mf[0][i] );
+ QUANT_ONE( dct[0][i], mf[i], bias[i] );
}
-static void quant_4x4_core( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
+static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
int i;
for( i = 0; i < 16; i++ )
- QUANT_ONE( dct[0][i], quant_mf[0][i] );
+ QUANT_ONE( dct[0][i], mf[i], bias[i] );
}
-static void quant_4x4_dc_core( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
+static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
{
int i;
for( i = 0; i < 16; i++ )
- QUANT_ONE( dct[0][i], i_quant_mf );
+ QUANT_ONE( dct[0][i], mf, bias );
}
-static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
+static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
{
- QUANT_ONE( dct[0][0], i_quant_mf );
- QUANT_ONE( dct[0][1], i_quant_mf );
- QUANT_ONE( dct[0][2], i_quant_mf );
- QUANT_ONE( dct[0][3], i_quant_mf );
+ QUANT_ONE( dct[0][0], mf, bias );
+ QUANT_ONE( dct[0][1], mf, bias );
+ QUANT_ONE( dct[0][2], mf, bias );
+ QUANT_ONE( dct[0][3], mf, bias );
}
#define DEQUANT_SHL( x ) \
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
- int i, j, maxQ8=0, maxQ4=0, maxQdc=0;
-
- pf->quant_8x8_core = quant_8x8_core;
- pf->quant_4x4_core = quant_4x4_core;
- pf->quant_4x4_dc_core = quant_4x4_dc_core;
- pf->quant_2x2_dc_core = quant_2x2_dc_core;
+ pf->quant_8x8 = quant_8x8;
+ pf->quant_4x4 = quant_4x4;
+ pf->quant_4x4_dc = quant_4x4_dc;
+ pf->quant_2x2_dc = quant_2x2_dc;
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
- /* determine the biggest coefficient in all quant8_mf tables */
- for( j = 0; j < 2; j++ )
- for( i = 0; i < 6*8*8; i++ )
- {
- int q = h->quant8_mf[j][0][0][i];
- if( maxQ8 < q )
- maxQ8 = q;
- }
-
- /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 )
- and the biggest DC coefficient if all quant4_mf tables ( maxQdc ) */
- for( j = 0; j < 4; j++ )
- for( i = 0; i < 6*4*4; i++ )
- {
- int q = h->quant4_mf[j][0][0][i];
- if( maxQ4 < q )
- maxQ4 = q;
- if( maxQdc < q && i%16 == 0 )
- maxQdc = q;
- }
-
#ifdef HAVE_MMX
-
- /* select quant_8x8 based on CPU and maxQ8 */
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
- if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 )
- pf->quant_8x8_core = x264_quant_8x8_core15_ssse3;
- else
+ if( cpu&X264_CPU_MMX )
+ {
+#ifdef ARCH_X86
+ pf->quant_4x4 = x264_quant_4x4_mmx;
+ pf->quant_8x8 = x264_quant_8x8_mmx;
#endif
- if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
- pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
- else
- if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT )
- pf->quant_8x8_core = x264_quant_8x8_core16_mmxext;
- else
- if( cpu&X264_CPU_MMXEXT )
- pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
+ pf->dequant_4x4 = x264_dequant_4x4_mmx;
+ pf->dequant_8x8 = x264_dequant_8x8_mmx;
+ }
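+ /* The ARCH_X86 guards (here and for quant_4x4_dc below) match what the asm
+ * provides: the plain MMX/MMXEXT quant kernels are only built in the 32-bit
+ * version, and x86_64 always has SSE2 anyway. */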
- /* select quant_4x4 based on CPU and maxQ4 */
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
- if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 )
- pf->quant_4x4_core = x264_quant_4x4_core15_ssse3;
- else
-#endif
- if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
- pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
- else
- if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT )
- pf->quant_4x4_core = x264_quant_4x4_core16_mmxext;
- else
if( cpu&X264_CPU_MMXEXT )
- pf->quant_4x4_core = x264_quant_4x4_core32_mmxext;
-
- /* select quant_XxX_dc based on CPU and maxQdc */
- if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT )
- {
- pf->quant_4x4_dc_core = x264_quant_4x4_dc_core16_mmxext;
- pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext;
- }
- else
- if( maxQdc < (1<<15) && cpu&X264_CPU_MMX )
{
- pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx;
- pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx;
+ pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
+#ifdef ARCH_X86
+ pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
+#endif
}
- else
- if( cpu&X264_CPU_MMXEXT )
+
+ if( cpu&X264_CPU_SSE2 )
{
- pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext;
- pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
+ pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+ pf->quant_4x4 = x264_quant_4x4_sse2;
+ pf->quant_8x8 = x264_quant_8x8_sse2;
}
-
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
- if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 )
- pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3;
#endif
- if( cpu&X264_CPU_MMX )
+#ifdef HAVE_SSE3
+ if( cpu&X264_CPU_SSSE3 )
{
- /* dequant is not subject to the above CQM-dependent overflow issues,
- * as long as the inputs are in the range generable by dct+quant.
- * that is not guaranteed by the standard, but is true within x264 */
- pf->dequant_4x4 = x264_dequant_4x4_mmx;
- pf->dequant_8x8 = x264_dequant_8x8_mmx;
- }
-#endif /* HAVE_MMX */
-
-#ifdef ARCH_PPC
- if( cpu&X264_CPU_ALTIVEC ) {
- if( maxQ8 < (1<<16) )
- {
- pf->quant_8x8_core = x264_quant_8x8_altivec;
- }
- if( maxQ4 < (1<<16) )
- {
- pf->quant_4x4_core = x264_quant_4x4_altivec;
- }
- if( maxQdc < (1<<16) )
- {
- pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
- }
+ pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
+ pf->quant_4x4 = x264_quant_4x4_ssse3;
+ pf->quant_8x8 = x264_quant_8x8_ssse3;
}
-#endif /* ARCH_PPC */
+#endif
}
typedef struct
{
- void (*quant_8x8_core)( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
- void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
- void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
- void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
+ void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+ void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+ void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
+ void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
#include <stdio.h>
#include <string.h>
+#define SHIFT(x,s) ((s)<0 ? (x)<<-(s) : (s)==0 ? (x) : ((x)+(1<<((s)-1)))>>(s))
+#define DIV(n,d) (((n) + ((d)>>1)) / (d))
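+/* SHIFT: round-to-nearest right shift (a negative s shifts left);
+ * DIV: round-to-nearest integer division. */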
+
static const int dequant4_scale[6][3] =
{
{ 10, 13, 16 },
{ 7282, 6428, 11570, 6830, 9118, 8640 }
};
-void x264_cqm_init( x264_t *h )
+int x264_cqm_init( x264_t *h )
{
int def_quant4[6][16];
int def_quant8[6][64];
int def_dequant4[6][16];
int def_dequant8[6][64];
+ int quant4_mf[4][6][4][4];
+ int quant8_mf[2][6][8][8];
int q, i, j, i_list;
+ int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
+ 32 - h->param.analyse.i_luma_deadzone[0],
+ 32 - 11, 32 - 21 };
+ int max_qp_err = -1;
for( i = 0; i < 6; i++ )
{
}
else
{
- h-> quant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
+ h-> quant4_mf[i] = x264_malloc(52*size*sizeof(uint16_t) );
h->dequant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
h->unquant4_mf[i] = x264_malloc(52*size*sizeof(int) );
}
+
+ for( j = (i<4 ? 0 : 4); j < i; j++ )
+ if( deadzone[j&3] == deadzone[i&3] &&
+ !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) )
+ break;
+ if( j < i )
+ h->quant4_bias[i] = h->quant4_bias[j];
+ else
+ h->quant4_bias[i] = x264_malloc(52*size*sizeof(uint16_t) );
}
for( q = 0; q < 6; q++ )
for( i = 0; i < 16; i++ )
{
h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
- h-> quant4_mf[i_list][q][0][i] = def_quant4[q][i] * 16 / h->pps->scaling_list[i_list][i];
+ quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
}
for( i_list = 0; i_list < 2; i_list++ )
for( i = 0; i < 64; i++ )
{
h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
- h-> quant8_mf[i_list][q][0][i] = def_quant8[q][i] * 16 / h->pps->scaling_list[4+i_list][i];
+ quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
}
}
for( q = 0; q < 52; q++ )
{
for( i_list = 0; i_list < 4; i_list++ )
for( i = 0; i < 16; i++ )
- h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / h->quant4_mf[i_list][q%6][0][i];
+ {
+ h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i];
+ h-> quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1);
+ // round to nearest, unless that would cause the deadzone to be negative
+ h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+ if( j > 0xffff && q > max_qp_err )
+ max_qp_err = q;
+ }
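+ /* quant4_mf folds the per-QP shift into the multiplier: the old code did
+ * (coef*mf + f) >> (15 + q/6), the new one always shifts by 16, so the
+ * table entry is the old mf scaled by 2^(1 - q/6). The bias is chosen so
+ * that bias*mf =~ deadzone<<10, i.e. the old rounding offset expressed in
+ * coefficient units. */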
+ if( h->param.analyse.b_transform_8x8 )
for( i_list = 0; i_list < 2; i_list++ )
for( i = 0; i < 64; i++ )
- h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / h->quant8_mf[i_list][q%6][0][i];
+ {
+ h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i];
+ h-> quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6);
+ h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+ if( j > 0xffff && q > max_qp_err )
+ max_qp_err = q;
+ }
}
+
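+ /* pmulhuw needs mf to fit in 16 bits; extreme custom matrices can overflow
+ * that at very low QP, so remember the largest offending QP and refuse to
+ * encode if the configured QP range reaches it. */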
+ if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min )
+ {
+ x264_log( h, X264_LOG_ERROR, "Quantization overflow.\n" );
+ x264_log( h, X264_LOG_ERROR, "Your CQM is incompatible with QP < %d, but min QP is set to %d\n",
+ max_qp_err+1, h->param.rc.i_qp_min );
+ return -1;
+ }
+ return 0;
}
void x264_cqm_delete( x264_t *h )
x264_cqm_jvt8i, x264_cqm_jvt8p
};
-void x264_cqm_init( x264_t *h );
+int x264_cqm_init( x264_t *h );
void x264_cqm_delete( x264_t *h );
int x264_cqm_parse_file( x264_t *h, const char *filename );
h->param.analyse.i_noise_reduction = 0;
h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
}
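+ /* In CQP mode, derive the effective QP range from the I/P/B offsets so
+ * that checks against i_qp_min/i_qp_max (such as the CQM overflow check in
+ * x264_cqm_init) see the QPs that will actually be used. */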
+ if( h->param.rc.i_rc_method == X264_RC_CQP )
+ {
+ float qp_p = h->param.rc.i_qp_constant;
+ float qp_i = qp_p - 6*log(h->param.rc.f_ip_factor)/log(2);
+ float qp_b = qp_p + 6*log(h->param.rc.f_pb_factor)/log(2);
+ h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
+ h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
+ }
if( ( h->param.i_width % 16 || h->param.i_height % 16 ) && !h->mb.b_lossless )
{
h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 );
- h->mb.i_luma_deadzone[0] = 32 - h->param.analyse.i_luma_deadzone[0];
- h->mb.i_luma_deadzone[1] = 32 - h->param.analyse.i_luma_deadzone[1];
h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 );
x264_validate_levels( h );
- x264_cqm_init( h );
+ if( x264_cqm_init( h ) < 0 )
+ {
+ x264_free( h );
+ return NULL;
+ }
h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
}
#undef ZIG
-static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
-{
- const int i_qbits = 16 + i_qscale / 6;
- const int i_mf = i_qscale % 6;
- const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6);
- h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
- const int i_qbits = 15 + i_qscale / 6;
- const int i_mf = i_qscale % 6;
- const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6);
- h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4_chroma( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
- const int i_qbits = 15 + i_qscale / 6;
- const int i_mf = i_qscale % 6;
- const int f = ( 1 << (i_qbits + b_intra) ) / 6;
- h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
-{
- const int i_qbits = 16 + i_qscale / 6;
- const int i_mf = i_qscale % 6;
- const int f = h->mb.i_luma_deadzone[1] << (i_qbits-6);
- h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
-}
-static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
- const int i_qbits = 16 + i_qscale / 6;
- const int i_mf = i_qscale % 6;
- const int f = ( 1 << (i_qbits + b_intra) ) / 6;
- h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
-}
-
/* (ref: JVT-B118)
 * x264_mb_decimate_score: given dct coeffs, returns a score indicating whether these coeffs could be
 * emptied to 0 (a low score means set them to null)
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
else
- quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
+ h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
if( h->mb.b_trellis )
x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
else
- quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
+ h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
else
- quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
+ h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
h->zigzagf.scan_4x4ac( h->dct.block[i].residual_ac, dct4x4[1+i] );
h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
}
h->dctf.dct4x4dc( dct4x4[0] );
- quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
+ h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
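+ /* note: the DC transform uses one more shift than AC (the old i_qbits was
+ * 16+qp/6 vs 15+qp/6), so the AC tables are reused with mf halved and bias
+ * doubled; the same applies to the 2x2 chroma DC below. */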
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
/* output samples to fdec */
dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
/* no trellis; it doesn't seem to help chroma noticeably */
- quant_4x4_chroma( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
+ h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
h->zigzagf.scan_4x4ac( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
if( b_decimate )
}
h->dctf.dct2x2dc( dct2x2 );
- quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
+ h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
/* output samples to fdec */
if( h->mb.b_trellis )
x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
else
- quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
+ h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
else
- quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
+ h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4[idx] );
{
const int idx = i8x8 * 4 + i4x4;
- quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
+ h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
dct2x2[1][0] = dct4x4[2][0][0];
dct2x2[1][1] = dct4x4[3][0][0];
h->dctf.dct2x2dc( dct2x2 );
- quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4PC], i_qp, 0 );
+ h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1] )
{
/* can't be */
/* calculate dct coeffs */
for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
- quant_4x4_chroma( h, dct4x4[i4x4], h->quant4_mf[CQM_4PC], i_qp, 0 );
+ h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
h->zigzagf.scan_4x4ac( dctscan, dct4x4[i4x4] );
i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
{
DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
- quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
+ h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
if( b_decimate )
int i4;
DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
- quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
- quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
- quant_4x4( h, dct4x4[2], h->quant4_mf[CQM_4PY], i_qp, 0 );
- quant_4x4( h, dct4x4[3], h->quant4_mf[CQM_4PY], i_qp, 0 );
+ h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
for( i4 = 0; i4 < 4; i4++ )
h->zigzagf.scan_4x4( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] );
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
- quant_4x4_chroma( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
+ h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
h->zigzagf.scan_4x4ac( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
if( array_non_zero( dct4x4 ) )
{
// and uses the dct scaling factors, not the idct ones.
static void quant_trellis_cabac( x264_t *h, int16_t *dct,
- const int *quant_mf, const int *unquant_mf,
+ const uint16_t *quant_mf, const int *unquant_mf,
const int *coef_weight, const int *zigzag,
int i_ctxBlockCat, int i_qbits, int i_lambda2, int b_ac, int i_coefs )
{
uint8_t cabac_state_sig[64];
uint8_t cabac_state_last[64];
const int b_interlaced = h->mb.b_interlaced;
- const int f = 1 << (i_qbits-1); // no deadzone
+ const int f = 1 << 15; // no deadzone
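+ // 1<<15 is half of the 1<<16 quant denominator, i.e. plain round-to-nearest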
int i_last_nnz = -1;
int i, j;
for( i = i_last_nnz; i >= b_ac; i-- )
{
int i_coef = abs_coefs[i];
- int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> i_qbits;
+ int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
int abs_level;
int cost_sig[2], cost_last[2];
trellis_node_t n;
<< (2*i_qbits)) >> LAMBDA_BITS;
quant_trellis_cabac( h, (int16_t*)dct,
- (int*)h->quant4_mf[i_quant_cat][i_mf], h->unquant4_mf[i_quant_cat][i_qp],
+ h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
x264_dct4_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan4[h->mb.b_interlaced],
i_ctxBlockCat, 15+i_qbits, i_lambda2, b_ac, 16 );
<< (2*i_qbits)) >> LAMBDA_BITS;
quant_trellis_cabac( h, (int16_t*)dct,
- (int*)h->quant8_mf[i_quant_cat][i_mf], h->unquant8_mf[i_quant_cat][i_qp],
+ h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
x264_dct8_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan8[h->mb.b_interlaced],
DCT_LUMA_8x8, 16+i_qbits, i_lambda2, 0, 64 );
uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
- int i, i_cqm;
+ int i, i_cqm, qp;
x264_t h_buf;
x264_t *h = &h_buf;
h->pps = h->pps_array;
x264_param_default( &h->param );
+ h->param.rc.i_qp_min = 26;
for( i_cqm = 0; i_cqm < 4; i_cqm++ )
{
} \
}
-#define TEST_QUANT( name, cqm ) \
+#define TEST_QUANT_DC( name, cqm ) \
if( qf_a.name != qf_ref.name ) \
{ \
- used_asms[0] = 1; \
- for( i = 0; i < 64; i++ ) \
- dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
- qf_c.name( (void*)dct1, cqm, 20, (1<<20)/6 ); \
- qf_a.name( (void*)dct2, cqm, 20, (1<<20)/6 ); \
- if( memcmp( dct1, dct2, 64*2 ) ) \
- { \
- oks[0] = 0; \
- fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
- } \
- }
-
-#define TEST_QUANT8( qname, cqm, shift, divider ) \
- if( qf_a.qname != qf_ref.qname ) \
- { \
- int qp; \
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
- INIT_QUANT8() \
- qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- if( memcmp( dct1, dct2, 64*2 ) ) \
+ for( i = 0; i < 16; i++ ) \
+ dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
+ qf_c.name( (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ if( memcmp( dct1, dct2, 16*2 ) ) \
{ \
oks[0] = 0; \
- fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+ fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
break; \
} \
} \
}
-#define TEST_QUANT4( qname, cqm, shift, divider ) \
+#define TEST_QUANT( qname, block, w ) \
if( qf_a.qname != qf_ref.qname ) \
{ \
- int qp; \
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
- INIT_QUANT4() \
- qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- if( memcmp( dct1, dct2, 16*2 ) ) \
+ INIT_QUANT##w() \
+ qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[0] = 0; \
- fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
break; \
} \
} \
}
- TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8IY], 16, 3 );
- TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8PY], 16, 6 );
- TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4IY], 15, 3 );
- TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4PY], 15, 6 );
- TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
- TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
-
-#define TEST_DEQUANT8( qname, dqname, cqm, dqm, shift, divider ) \
- if( qf_a.dqname != qf_ref.dqname ) \
- { \
- int qp; \
- used_asms[1] = 1; \
- for( qp = 51; qp > 0; qp-- ) \
- { \
- INIT_QUANT8() \
- qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- memcpy( dct2, dct1, 64*2 ); \
- qf_c.dqname( (void*)dct1, dqm, qp ); \
- qf_a.dqname( (void*)dct2, dqm, qp ); \
- if( memcmp( dct1, dct2, 64*2 ) ) \
- { \
- oks[1] = 0; \
- fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
- break; \
- } \
- } \
- }
+ TEST_QUANT( quant_8x8, CQM_8IY, 8 );
+ TEST_QUANT( quant_8x8, CQM_8PY, 8 );
+ TEST_QUANT( quant_4x4, CQM_4IY, 4 );
+ TEST_QUANT( quant_4x4, CQM_4PY, 4 );
+ TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
+ TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
-#define TEST_DEQUANT4( qname, dqname, cqm, dqm, shift, divider ) \
+#define TEST_DEQUANT( qname, dqname, block, w ) \
if( qf_a.dqname != qf_ref.dqname ) \
{ \
- int qp; \
used_asms[1] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
- INIT_QUANT4() \
- qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
- memcpy( dct2, dct1, 16*2 ); \
- qf_c.dqname( (void*)dct1, dqm, qp ); \
- qf_a.dqname( (void*)dct2, dqm, qp ); \
- if( memcmp( dct1, dct2, 16*2 ) ) \
+ INIT_QUANT##w() \
+ qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ memcpy( dct2, dct1, w*w*2 ); \
+ qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \
+ qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[1] = 0; \
- fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
break; \
} \
} \
}
- TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8IY], h->dequant8_mf[CQM_8IY], 16, 3 );
- TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8PY], h->dequant8_mf[CQM_8PY], 16, 6 );
- TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4IY], h->dequant4_mf[CQM_4IY], 15, 3 );
- TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4PY], h->dequant4_mf[CQM_4PY], 15, 6 );
+ TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
+ TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
+ TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
+ TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
}
ok = oks[0]; used_asm = used_asms[0];