%include "amd64inc.asm"
-ALIGN 16
+SECTION .rodata
+pw_1: times 4 dw 1 ; four 16-bit words, each 1: seed for the per-word rounding term built in .rshift16
+pd_1: times 2 dd 1 ; two 32-bit dwords, each 1: seed for the per-dword rounding term built in .rshift32
SECTION .text
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
+cglobal x264_dequant_4x4_mmx
+cglobal x264_dequant_8x8_mmx
+
%macro MMX_QUANT_AC_START 0
; mov rdi, rdi ; &dct[0][0]
; mov rsi, rsi ; &quant_mf[0][0]
ret
+
+;=============================================================================
+; dequant
+;=============================================================================
+
+%macro DEQUANT16_L_1x4 3
+;;; Dequantize four coefficients, left-shift variant, entirely in 16-bit lanes:
+;;;   dct[i] = ( dct[i] * dequant_mf[i] ) << i_qbits
+;;; %1      four int16 coefficients (read, then overwritten with the result)
+;;; %2, %3  the four matching 32-bit dequant_mf entries, two per operand
+;;; mm5     i_qbits (shift count), prepared by the caller
+;;; clobbers mm0, mm1, mm2
+
+    movq        mm0, %1         ; coefficients
+    movq        mm1, %2         ; factors 0,1 as dwords
+    movq        mm2, %3         ; factors 2,3 as dwords
+    packssdw    mm1, mm2        ; narrow the four factors to words (signed saturation)
+    pmullw      mm0, mm1        ; low 16 bits of each product
+    psllw       mm0, mm5        ; << i_qbits
+    movq        %1, mm0
+%endmacro
+
+%macro DEQUANT16_R_1x4 3
+;;; Dequantize four coefficients, rounded right-shift variant, 16-bit lanes:
+;;;   dct[i] = ( dct[i] * dequant_mf[i] + f ) >> -i_qbits
+;;; %1      four int16 coefficients (read, then overwritten with the result)
+;;; %2, %3  the four matching 32-bit dequant_mf entries, two per operand
+;;; mm5     -i_qbits (shift count), prepared by the caller
+;;; mm6     rounding term f, replicated in all four words
+;;; clobbers mm0, mm1, mm2
+
+    movq        mm0, %1         ; coefficients
+    movq        mm1, %2         ; factors 0,1 as dwords
+    movq        mm2, %3         ; factors 2,3 as dwords
+    packssdw    mm1, mm2        ; narrow the four factors to words (signed saturation)
+    pmullw      mm0, mm1        ; low 16 bits of each product
+    paddw       mm0, mm6        ; + f
+    psraw       mm0, mm5        ; arithmetic >> -i_qbits
+    movq        %1, mm0
+%endmacro
+
+%macro DEQUANT32_R_1x4 3
+;;; %1 dct[y][x]: four int16 coefficients, read and then overwritten; result is ( dct * dequant_mf + f ) >> -i_qbits computed with 32-bit intermediates
+;;; %2,%3 dequant_mf[i_mf][y][x]: four 32-bit factors, two per operand
+;;; mm5 -i_qbits (right-shift count)
+;;; mm6 f as dwords (rounding term added before the shift)
+;;; mm7 0 -- NOTE(review): mm7 is not referenced anywhere in this macro body
+;;; clobbers mm0-mm3
+ movq mm0, %1 ; mm0 = c0 c1 c2 c3 (words)
+ movq mm1, mm0
+ punpcklwd mm0, mm0 ; mm0 = c0 c0 c1 c1
+ punpckhwd mm1, mm1 ; mm1 = c2 c2 c3 c3
+; each dword lane now holds one coefficient twice, lined up with the low and high word of one 32-bit factor
+ movq mm2, mm0
+ movq mm3, mm1
+ pmulhw mm0, %2 ; high 16 bits of the signed word products
+ pmulhw mm1, %3
+ pmullw mm2, %2 ; low 16 bits of the same products
+ pmullw mm3, %3
+ pslld mm0, 16 ; move hi(c * low word of factor) into the upper half of each dword
+ pslld mm1, 16
+ paddd mm0, mm2 ; dword = c * factor; exact when the factor lies in 0..32767 (high word zero)
+ paddd mm1, mm3
+
+ paddd mm0, mm6 ; + f
+ paddd mm1, mm6
+ psrad mm0, mm5 ; arithmetic >> -i_qbits
+ psrad mm1, mm5
+
+ packssdw mm0, mm1 ; back to four words, with signed saturation
+ movq %1, mm0
+%endmacro
+
+%macro DEQUANT_WxH 3
+ALIGN 16
+;;; void %1( int16_t dct[], int dequant_mf[6][...], int i_qp ) -- %1 = exported label, %2 = number of 4-coefficient groups to process, %3 = log2(ints per dequant_mf matrix), also subtracted from i_qp/6 to form i_qbits
+%1:
+; mov rdi, rdi ; dct (used directly from rdi)
+; mov rsi, rsi ; dequant_mf (used directly from rsi)
+; mov edx, edx ; i_qp (used directly from edx)
+; rdi and rsi are advanced in place below; eax, ecx, edx and MMX registers are scratch; no emms is issued in this routine
+ imul eax, edx, 0x2b ; eax = i_qp * 43; 43/256 stands in for 1/6 (equals i_qp/6 for i_qp in 0..51)
+ shr eax, 8 ; i_qbits = i_qp / 6
+ lea ecx, [eax+eax*2] ; ecx = 3 * (i_qp / 6)
+ sub edx, ecx
+ sub edx, ecx ; i_mf = i_qp % 6
+ shl edx, %3+2 ; byte offset of matrix i_mf: (1 << %3) ints of 4 bytes each
+ movsxd rdx, edx ; widen the offset to 64 bits for the pointer add
+ add rsi, rdx ; dequant_mf[i_mf]
+
+ sub eax, %3 ; i_qbits = i_qp/6 - %3
+ cmp eax, -2
+ jle .rshift32 ; dct * dequant overflows 16bit
+ cmp eax, -1
+ jle .rshift16 ; negative qbits => rightshift
+
+.lshift: ; i_qbits >= 0: dct = ( dct * dequant_mf ) << i_qbits
+ movd mm5, eax ; mm5 = shift count
+
+%rep %2
+ DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8] ; four coefficients per expansion
+ add rsi, byte 16 ; next four dequant_mf ints
+ add rdi, byte 8 ; next four dct words
+%endrep
+
+ ret
+
+.rshift16: ; i_qbits == -1: 16-bit products, rounded right shift
+ neg eax ; eax = -i_qbits
+ movd mm5, eax
+ movq mm6, [pw_1] ; NOTE(review): no explicit RIP-relative qualifier on this load -- confirm it is PIC-safe in the amd64 build
+ pxor mm7, mm7 ; mm7 = 0 (not read by the DEQUANT16_R_1x4 expansion)
+ psllw mm6, mm5
+ psrlw mm6, 1 ; mm6 = f = 1 << (-i_qbits-1) in every word
+
+%rep %2
+ DEQUANT16_R_1x4 [rdi], [rsi], [rsi+8] ; four coefficients per expansion
+ add rsi, byte 16 ; next four dequant_mf ints
+ add rdi, byte 8 ; next four dct words
+%endrep
+
+ ret
+
+.rshift32: ; i_qbits <= -2: 32-bit products, rounded right shift
+ neg eax ; eax = -i_qbits
+ movd mm5, eax
+ movq mm6, [pd_1] ; NOTE(review): same addressing question as for pw_1 above
+ pxor mm7, mm7 ; mm7 = 0, as listed in DEQUANT32_R_1x4's register notes
+ pslld mm6, mm5
+ psrld mm6, 1 ; mm6 = f = 1 << (-i_qbits-1) in both dwords
+
+%rep %2
+ DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8] ; four coefficients per expansion
+ add rsi, byte 16 ; next four dequant_mf ints
+ add rdi, byte 8 ; next four dct words
+%endrep
+
+ ret
+%endmacro
+
+DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4 ; 4 groups of 4 = 16 coefficients, i_qbits = i_qp/6 - 4
+DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6 ; 16 groups of 4 = 64 coefficients, i_qbits = i_qp/6 - 6
%endif
%endmacro
-ALIGN 16
+SECTION .rodata
+pw_1: times 4 dw 1 ; four 16-bit words, each 1: seed for the per-word rounding term built in .rshift16
+pd_1: times 2 dd 1 ; two 32-bit dwords, each 1: seed for the per-dword rounding term built in .rshift32
SECTION .text
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
+cglobal x264_dequant_4x4_mmx
+cglobal x264_dequant_8x8_mmx
+
%macro MMX_QUANT_AC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
mov ecx, [esp+ 8] ; &quant_mf[0][0]
ret
+
+;=============================================================================
+; dequant
+;=============================================================================
+
+%macro DEQUANT16_L_1x4 3
+;;; Dequantize four coefficients, left-shift variant, entirely in 16-bit lanes:
+;;;   dct[i] = ( dct[i] * dequant_mf[i] ) << i_qbits
+;;; %1      four int16 coefficients (read, then overwritten with the result)
+;;; %2, %3  the four matching 32-bit dequant_mf entries, two per operand
+;;; mm5     i_qbits (shift count), prepared by the caller
+;;; clobbers mm0, mm1, mm2
+
+    movq        mm0, %1         ; coefficients
+    movq        mm1, %2         ; factors 0,1 as dwords
+    movq        mm2, %3         ; factors 2,3 as dwords
+    packssdw    mm1, mm2        ; narrow the four factors to words (signed saturation)
+    pmullw      mm0, mm1        ; low 16 bits of each product
+    psllw       mm0, mm5        ; << i_qbits
+    movq        %1, mm0
+%endmacro
+
+%macro DEQUANT16_R_1x4 3
+;;; Dequantize four coefficients, rounded right-shift variant, 16-bit lanes:
+;;;   dct[i] = ( dct[i] * dequant_mf[i] + f ) >> -i_qbits
+;;; %1      four int16 coefficients (read, then overwritten with the result)
+;;; %2, %3  the four matching 32-bit dequant_mf entries, two per operand
+;;; mm5     -i_qbits (shift count), prepared by the caller
+;;; mm6     rounding term f, replicated in all four words
+;;; clobbers mm0, mm1, mm2
+
+    movq        mm0, %1         ; coefficients
+    movq        mm1, %2         ; factors 0,1 as dwords
+    movq        mm2, %3         ; factors 2,3 as dwords
+    packssdw    mm1, mm2        ; narrow the four factors to words (signed saturation)
+    pmullw      mm0, mm1        ; low 16 bits of each product
+    paddw       mm0, mm6        ; + f
+    psraw       mm0, mm5        ; arithmetic >> -i_qbits
+    movq        %1, mm0
+%endmacro
+
+%macro DEQUANT32_R_1x4 3
+;;; %1 dct[y][x]: four int16 coefficients, read and then overwritten; result is ( dct * dequant_mf + f ) >> -i_qbits computed with 32-bit intermediates
+;;; %2,%3 dequant_mf[i_mf][y][x]: four 32-bit factors, two per operand
+;;; mm5 -i_qbits (right-shift count)
+;;; mm6 f as dwords (rounding term added before the shift)
+;;; mm7 0 -- NOTE(review): mm7 is not referenced anywhere in this macro body
+;;; clobbers mm0-mm3
+ movq mm0, %1 ; mm0 = c0 c1 c2 c3 (words)
+ movq mm1, mm0
+ punpcklwd mm0, mm0 ; mm0 = c0 c0 c1 c1
+ punpckhwd mm1, mm1 ; mm1 = c2 c2 c3 c3
+; each dword lane now holds one coefficient twice, lined up with the low and high word of one 32-bit factor
+ movq mm2, mm0
+ movq mm3, mm1
+ pmulhw mm0, %2 ; high 16 bits of the signed word products
+ pmulhw mm1, %3
+ pmullw mm2, %2 ; low 16 bits of the same products
+ pmullw mm3, %3
+ pslld mm0, 16 ; move hi(c * low word of factor) into the upper half of each dword
+ pslld mm1, 16
+ paddd mm0, mm2 ; dword = c * factor; exact when the factor lies in 0..32767 (high word zero)
+ paddd mm1, mm3
+
+ paddd mm0, mm6 ; + f
+ paddd mm1, mm6
+ psrad mm0, mm5 ; arithmetic >> -i_qbits
+ psrad mm1, mm5
+
+ packssdw mm0, mm1 ; back to four words, with signed saturation
+ movq %1, mm0
+%endmacro
+
+%macro DEQUANT_WxH 3
+;;; Emits one in-place dequant routine:
+;;;   void %1( int16_t dct[], int dequant_mf[6][...], int i_qp )
+;;; Arguments are read from the stack: dct at [esp+4], dequant_mf at [esp+8],
+;;; i_qp at [esp+12].
+;;; %1  exported label
+;;; %2  number of 4-coefficient groups to process; must be even, because each
+;;;     loop iteration below handles two groups
+;;; %3  log2 of the number of ints in one dequant_mf matrix; also the bias in
+;;;     i_qbits = i_qp/6 - %3
+;;; Scratch: eax, ecx, edx and MMX registers. No emms is issued here.
+ALIGN 16
+%1:
+    mov     edx, [esp+12]               ; i_qp
+    imul    eax, edx, 0x2b              ; i_qp * 43; 43/256 stands in for 1/6
+    shr     eax, 8                      ; i_qbits = i_qp / 6 (exact for i_qp in 0..51)
+    lea     ecx, [eax+eax*2]            ; 3 * (i_qp / 6)
+    sub     edx, ecx
+    sub     edx, ecx                    ; i_mf = i_qp % 6
+    shl     edx, %3+2                   ; byte offset of matrix i_mf
+    add     edx, [esp+8]                ; edx = dequant_mf[i_mf]
+    mov     ecx, [esp+4]                ; ecx = dct
+
+    sub     eax, %3                     ; i_qbits = i_qp/6 - %3
+    jge     .lshift
+    cmp     eax, byte -1
+    je      .rshift16                   ; negative qbits => rightshift
+    jmp     .rshift32                   ; dct * dequant overflows 16bit
+
+;;; In all three loops eax is the byte offset of the current group inside dct
+;;; (8 bytes per group); the matching dequant_mf ints sit at twice that offset.
+;;; eax counts down from the last group, and the jge tests the flags of the
+;;; second "sub", so the loop ends once the offset has gone below zero.
+
+.lshift:                                ; i_qbits >= 0
+    movd    mm5, eax                    ; mm5 = i_qbits
+
+    mov     eax, 8*(%2-1)
+.loopl16:
+%rep 2
+    DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub     eax, byte 8
+%endrep
+    jge     .loopl16
+
+    nop                                 ; NOTE(review): presumably keeps ret from directly following the jcc -- confirm
+    ret
+
+.rshift16:                              ; i_qbits == -1
+    neg     eax                         ; eax = -i_qbits
+    movq    mm6, [pw_1]
+    movd    mm5, eax                    ; mm5 = shift count
+    pxor    mm7, mm7                    ; mm7 = 0 (not read by DEQUANT16_R_1x4)
+    psllw   mm6, mm5
+    psrlw   mm6, 1                      ; mm6 = f = 1 << (-i_qbits-1) in every word
+
+    mov     eax, 8*(%2-1)
+.loopr16:
+%rep 2
+    DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub     eax, byte 8
+%endrep
+    jge     .loopr16
+
+    nop
+    ret
+
+.rshift32:                              ; i_qbits <= -2
+    neg     eax                         ; eax = -i_qbits
+    movq    mm6, [pd_1]
+    movd    mm5, eax                    ; mm5 = shift count
+    pxor    mm7, mm7                    ; mm7 = 0, as listed in DEQUANT32_R_1x4's register notes
+    pslld   mm6, mm5
+    psrld   mm6, 1                      ; mm6 = f = 1 << (-i_qbits-1) in both dwords
+
+    mov     eax, 8*(%2-1)
+.loopr32:
+%rep 2
+    DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub     eax, byte 8
+%endrep
+    jge     .loopr32
+
+    nop
+    ret
+%endmacro
+
+DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4      ; 4 groups of 4 = 16 coefficients, i_qbits = i_qp/6 - 4
+DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6     ; 16 groups of 4 = 64 coefficients, i_qbits = i_qp/6 - 6
void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
int const i_qmf, int const i_qbits, int const f );
+void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+
#endif
return 1;
}
-/****************************************************************************
- * Scan and Quant functions
- ****************************************************************************/
-void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale )
-{
- const int i_qbits = i_qscale/6 - 5;
-
- if( i_qbits >= 0 )
- {
- const int i_dmf = dequant_mf[i_qscale%6][0][0] << i_qbits;
-
- dct[0][0] *= i_dmf;
- dct[0][1] *= i_dmf;
- dct[1][0] *= i_dmf;
- dct[1][1] *= i_dmf;
- }
- else
- {
- const int i_dmf = dequant_mf[i_qscale%6][0][0];
- // chroma DC is truncated, not rounded
-
- dct[0][0] = ( dct[0][0] * i_dmf ) >> (-i_qbits);
- dct[0][1] = ( dct[0][1] * i_dmf ) >> (-i_qbits);
- dct[1][0] = ( dct[1][0] * i_dmf ) >> (-i_qbits);
- dct[1][1] = ( dct[1][1] * i_dmf ) >> (-i_qbits);
- }
-}
-
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
-{
- const int i_qbits = i_qscale/6 - 6;
- int y;
-
- if( i_qbits >= 0 )
- {
- const int i_dmf = dequant_mf[i_qscale%6][0][0] << i_qbits;
-
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] *= i_dmf;
- dct[y][1] *= i_dmf;
- dct[y][2] *= i_dmf;
- dct[y][3] *= i_dmf;
- }
- }
- else
- {
- const int i_dmf = dequant_mf[i_qscale%6][0][0];
- const int f = 1 << (-i_qbits-1);
-
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
- dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
- dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
- dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
- }
- }
-}
-
-void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
-{
- const int i_mf = i_qscale%6;
- const int i_qbits = i_qscale/6 - 4;
- int y;
-
- if( i_qbits >= 0 )
- {
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] ) << i_qbits;
- dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] ) << i_qbits;
- dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] ) << i_qbits;
- dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] ) << i_qbits;
- }
- }
- else
- {
- const int f = 1 << (-i_qbits-1);
- for( y = 0; y < 4; y++ )
- {
- dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] + f ) >> (-i_qbits);
- dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] + f ) >> (-i_qbits);
- dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] + f ) >> (-i_qbits);
- dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] + f ) >> (-i_qbits);
- }
- }
-}
-
-void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale )
-{
- const int i_mf = i_qscale%6;
- const int i_qbits = i_qscale/6 - 6;
- int y;
-
- if( i_qbits >= 0 )
- {
- for( y = 0; y < 8; y++ )
- {
- dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] ) << i_qbits;
- dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] ) << i_qbits;
- dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] ) << i_qbits;
- dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] ) << i_qbits;
- dct[y][4] = ( dct[y][4] * dequant_mf[i_mf][y][4] ) << i_qbits;
- dct[y][5] = ( dct[y][5] * dequant_mf[i_mf][y][5] ) << i_qbits;
- dct[y][6] = ( dct[y][6] * dequant_mf[i_mf][y][6] ) << i_qbits;
- dct[y][7] = ( dct[y][7] * dequant_mf[i_mf][y][7] ) << i_qbits;
- }
- }
- else
- {
- const int f = 1 << (-i_qbits-1);
- for( y = 0; y < 8; y++ )
- {
- dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] + f ) >> (-i_qbits);
- dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] + f ) >> (-i_qbits);
- dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] + f ) >> (-i_qbits);
- dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] + f ) >> (-i_qbits);
- dct[y][4] = ( dct[y][4] * dequant_mf[i_mf][y][4] + f ) >> (-i_qbits);
- dct[y][5] = ( dct[y][5] * dequant_mf[i_mf][y][5] + f ) >> (-i_qbits);
- dct[y][6] = ( dct[y][6] * dequant_mf[i_mf][y][6] + f ) >> (-i_qbits);
- dct[y][7] = ( dct[y][7] * dequant_mf[i_mf][y][7] + f ) >> (-i_qbits);
- }
- }
-}
-
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
{
const int i8 = x264_scan8[idx];
void x264_macroblock_bipred_init( x264_t *h );
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale );
-
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
QUANT_ONE( dct[0][3], i_quant_mf );
}
+/* Per-coefficient helpers for dequant_4x4() and dequant_8x8().
+ * Each expands to a single assignment without a trailing semicolon and
+ * relies on dct, dequant_mf, y, i_mf and i_qbits (plus f for the SHR form)
+ * being in scope where it is used. */
+
+/* i_qbits >= 0: scale the coefficient, then shift left by i_qbits */
+#define DEQUANT_SHL( col ) \
+    dct[y][col] = ( dct[y][col] * dequant_mf[i_mf][y][col] ) << i_qbits
+
+/* i_qbits < 0: scale, add the rounding term f, then shift right by -i_qbits */
+#define DEQUANT_SHR( col ) \
+    dct[y][col] = ( dct[y][col] * dequant_mf[i_mf][y][col] + f ) >> (-i_qbits)
+
+/* Dequantize a 4x4 block in place.
+ * The matrix is chosen by i_qp%6 and the shift is i_qp/6 - 4: a non-negative
+ * shift is applied to the left, a negative one to the right with rounding. */
+static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{
+    const int i_qbits = i_qp/6 - 4;
+    const int i_mf = i_qp%6;
+    int y, col;
+
+    if( i_qbits < 0 )
+    {
+        /* rounding term: half of the divisor implied by the right shift */
+        const int f = 1 << (-i_qbits-1);
+        for( y = 0; y < 4; y++ )
+            for( col = 0; col < 4; col++ )
+                DEQUANT_SHR( col );
+        return;
+    }
+
+    for( y = 0; y < 4; y++ )
+        for( col = 0; col < 4; col++ )
+            DEQUANT_SHL( col );
+}
+
+/* Dequantize an 8x8 block in place.
+ * Same scheme as the 4x4 routine, with the shift taken as i_qp/6 - 6. */
+static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+{
+    const int i_qbits = i_qp/6 - 6;
+    const int i_mf = i_qp%6;
+    int y, col;
+
+    if( i_qbits >= 0 )
+    {
+        /* multiply, then shift left */
+        for( y = 0; y < 8; y++ )
+        {
+            for( col = 0; col < 8; col++ )
+                DEQUANT_SHL( col );
+        }
+    }
+    else
+    {
+        /* multiply, add half of the divisor, then shift right */
+        const int f = 1 << (-i_qbits-1);
+        for( y = 0; y < 8; y++ )
+        {
+            for( col = 0; col < 8; col++ )
+                DEQUANT_SHR( col );
+        }
+    }
+}
+
+/* Dequantize a 2x2 DC block in place.
+ * Every coefficient is scaled by dequant_mf[i_qp%6][0][0]; the shift is
+ * i_qp/6 - 5. A negative shift is applied to the right without rounding. */
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{
+    const int i_qbits = i_qp/6 - 5;
+    const int i_scale = dequant_mf[i_qp%6][0][0];
+    int i;
+
+    if( i_qbits < 0 )
+    {
+        // chroma DC is truncated, not rounded
+        const int i_shift = -i_qbits;
+        for( i = 0; i < 4; i++ )
+            dct[i>>1][i&1] = ( dct[i>>1][i&1] * i_scale ) >> i_shift;
+    }
+    else
+    {
+        /* fold the left shift into the multiplier once */
+        const int i_dmf = i_scale << i_qbits;
+        for( i = 0; i < 4; i++ )
+            dct[i>>1][i&1] *= i_dmf;
+    }
+}
+
+/* Dequantize a 4x4 DC block in place.
+ * Every coefficient is scaled by dequant_mf[i_qp%6][0][0]; the shift is
+ * i_qp/6 - 6. A negative shift is applied to the right with rounding. */
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{
+    const int i_qbits = i_qp/6 - 6;
+    const int i_scale = dequant_mf[i_qp%6][0][0];
+    int y, x;
+
+    if( i_qbits < 0 )
+    {
+        const int i_shift = -i_qbits;
+        /* rounding term: half of the divisor implied by the right shift */
+        const int f = 1 << (i_shift-1);
+
+        for( y = 0; y < 4; y++ )
+            for( x = 0; x < 4; x++ )
+                dct[y][x] = ( dct[y][x] * i_scale + f ) >> i_shift;
+    }
+    else
+    {
+        /* fold the left shift into the multiplier once */
+        const int i_dmf = i_scale << i_qbits;
+
+        for( y = 0; y < 4; y++ )
+            for( x = 0; x < 4; x++ )
+                dct[y][x] *= i_dmf;
+    }
+}
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_4x4_dc_core = quant_4x4_dc_core;
pf->quant_2x2_dc_core = quant_2x2_dc_core;
+ pf->dequant_4x4 = dequant_4x4;
+ pf->dequant_8x8 = dequant_8x8;
+
#ifdef HAVE_MMXEXT
/* determine the biggest coeffient in all quant8_mf tables */
pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
}
+ if( cpu&X264_CPU_MMXEXT )
+ {
+ /* dequant is not subject to the above CQM-dependent overflow issues,
+ * as long as the inputs are in the range generable by dct+quant.
+ * that is not guaranteed by the standard, but is true within x264 */
+ pf->dequant_4x4 = x264_dequant_4x4_mmx;
+ pf->dequant_8x8 = x264_dequant_8x8_mmx;
+ }
#endif /* HAVE_MMXEXT */
}
void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
+
+ void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+ void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
+/* In-place DC dequant: every coefficient is scaled by dequant_mf[i_qp%6][0][0]. */
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp );
+
#endif
quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
- x264_mb_dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
- x264_mb_dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
}
quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
- x264_mb_dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
}
h->dctf.dct4x4dc( dct4x4[0] );
/* no trellis; it doesn't seem to help chroma noticeably */
quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
- x264_mb_dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
if( b_inter )
{
quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
- x264_mb_dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
+ h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
if( !h->mb.b_trellis )
{
quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
- x264_mb_dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
+ h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
}
x264_quant_function_t qf_a;
int16_t dct1[64], dct2[64];
uint8_t cqm_buf[64];
- int ret = 0, ok = 1, used_asm = 0;
+ int ret = 0, ok, used_asm;
+ int oks[2] = {1,1}, used_asms[2] = {0,0};
int i, i_cqm;
x264_t h_buf;
x264_t *h = &h_buf;
#define TEST_QUANT( name, cqm ) \
if( qf_a.name != qf_ref.name ) \
{ \
- used_asm = 1; \
+ used_asms[0] = 1; \
for( i = 0; i < 64; i++ ) \
- dct1[i] = dct2[i] = rand() & 0xfff; \
+ dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
qf_c.name( (void*)dct1, cqm, 20, (1<<20)/6 ); \
qf_a.name( (void*)dct2, cqm, 20, (1<<20)/6 ); \
if( memcmp( dct1, dct2, 64*2 ) ) \
{ \
- ok = 0; \
+ oks[0] = 0; \
fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
} \
}
TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4PY] );
TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
+
+/* TEST_DEQUANT( name, quant, dqm, cqm, shift )
+ * Runs only when the asm table entry for `name` differs from the reference one.
+ * For every qp from 51 down to 0 (inclusive): fill dct1 with random values,
+ * quantize it with the C `quant` function, copy the result to dct2, then run
+ * the C and asm versions of `name` on the two copies and compare them.
+ * Records the outcome in oks[1] / used_asms[1] and stops at the first
+ * mismatch. Relies on i, i_cqm, dct1, dct2, qf_a, qf_c and qf_ref from the
+ * enclosing scope. */
+#define TEST_DEQUANT( name, quant, dqm, cqm, shift ) \
+    if( qf_a.name != qf_ref.name ) \
+    { \
+        int qp; \
+        used_asms[1] = 1; \
+        for( qp = 51; qp >= 0; qp-- ) \
+        { \
+            for( i = 0; i < 64; i++ ) \
+                dct1[i] = (rand() & 0x1fff) - 0xfff; \
+            qf_c.quant( (void*)dct1, cqm[qp%6], shift+qp/6, 0 ); \
+            memcpy( dct2, dct1, sizeof(dct2) ); \
+            qf_c.name( (void*)dct1, dqm, qp ); \
+            qf_a.name( (void*)dct2, dqm, qp ); \
+            if( memcmp( dct1, dct2, sizeof(dct1) ) ) \
+            { \
+                oks[1] = 0; \
+                fprintf( stderr, #name "(qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); \
+                break; \
+            } \
+        } \
+    }
+
+ TEST_DEQUANT( dequant_8x8, quant_8x8_core, h->dequant8_mf[CQM_8PY], h->quant8_mf[CQM_8PY], 16 );
+ TEST_DEQUANT( dequant_4x4, quant_4x4_core, h->dequant4_mf[CQM_4PY], h->quant4_mf[CQM_4PY], 15 );
}
+ ok = oks[0]; used_asm = used_asms[0];
report( "quant :" );
+ ok = oks[1]; used_asm = used_asms[1];
+ report( "dequant :" );
+
return ret;
}