From: Loren Merritt Date: Fri, 21 Mar 2008 06:04:46 +0000 (-0600) Subject: faster dequant X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f63770aa6540b9ced04043ae474b1d3b99f024f9;p=libx264 faster dequant --- diff --git a/common/quant.c b/common/quant.c index ed1148cb..024dc6e1 100644 --- a/common/quant.c +++ b/common/quant.c @@ -209,9 +209,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #ifdef ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; -#endif pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_8x8 = x264_dequant_8x8_mmx; +#endif } if( cpu&X264_CPU_MMXEXT ) @@ -227,6 +227,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; + pf->dequant_4x4 = x264_dequant_4x4_sse2; + pf->dequant_8x8 = x264_dequant_8x8_sse2; } #endif diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 8071678b..fd87234d 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -29,34 +29,6 @@ pb_a1: times 16 db 0xa1 SECTION .text -%macro INIT_MMX 0 - %undef movq - %define m0 mm0 - %define m1 mm1 - %define m2 mm2 - %define m3 mm3 - %define m4 mm4 - %define m5 mm5 - %define m6 mm6 - %define m7 mm7 - %undef m8 - %undef m9 -%endmacro - -%macro INIT_XMM 0 - %define movq movdqa - %define m0 xmm0 - %define m1 xmm1 - %define m2 xmm2 - %define m3 xmm3 - %define m4 xmm4 - %define m5 xmm5 - %define m6 xmm6 - %define m7 xmm7 - %define m8 xmm8 - %define m9 xmm9 -%endmacro - ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 11e78ea6..886b3400 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -24,7 +24,8 @@ %include "x86inc.asm" SECTION_RODATA -pd_1: times 2 dd 1 +pw_1: times 8 dw 1 +pd_10000: times 4 dd 1<<16 SECTION .text @@ -133,76 +134,74 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16 ; dequant ;============================================================================= -%macro DEQUANT16_L_1x4 3 +%macro DEQUANT16_L 3 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] -;;; mm5 i_qbits - - movq mm1, %2 - movq mm2, %3 - movq mm0, %1 - packssdw mm1, mm2 - pmullw mm0, mm1 - psllw mm0, mm5 - movq %1, mm0 +;;; m5 i_qbits + + movq m1, %2 + movq m2, %3 + movq m0, %1 + packssdw m1, m2 + pmullw m0, m1 + psllw m0, m5 + movq %1, m0 %endmacro -%macro DEQUANT32_R_1x4 3 +%macro DEQUANT32_R 3 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] -;;; mm5 -i_qbits -;;; mm6 f as dwords -;;; mm7 0 - - movq mm0, %1 - movq mm1, mm0 - punpcklwd mm0, mm0 - punpckhwd mm1, mm1 - - movq mm2, mm0 - movq mm3, mm1 - pmulhw mm0, %2 - pmulhw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 - pslld mm1, 16 - paddd mm0, mm2 - paddd mm1, mm3 - - paddd mm0, mm6 - paddd mm1, mm6 - psrad mm0, mm5 - psrad mm1, mm5 - - packssdw mm0, mm1 - movq %1, mm0 +;;; m4 f +;;; m5 -i_qbits +;;; m6 1 +;;; m7 0 + + movq m0, %1 + movq m1, m0 + movq m2, %2 + movq m3, %3 + punpcklwd m0, m4 + punpckhwd m1, m4 + por m2, m6 ; FIXME munge precomputed arrays? + por m3, m6 + pmaddwd m0, m2 + pmaddwd m1, m3 + psrad m0, m5 + psrad m1, m5 + packssdw m0, m1 + movq %1, m0 %endmacro -%macro DEQUANT_LOOP 2 - mov t0d, 8*(%2-2) +%macro DEQUANT_LOOP 3 +%if 8*(%2-2*%3) + mov t0d, 8*(%2-2*%3) %%loop: - %1 [r0+t0+8], [r1+t0*2+16], [r1+t0*2+24] - %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8] - sub t0d, 16 - jge %%loop + %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3] + %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3] + sub t0d, 16*%3 + jge %%loop rep ret +%else + %1 [r0+8*%3], [r1+16*%3], [r1+24*%3] + %1 [r0 ], [r1 ], [r1+ 8*%3] + ret +%endif %endmacro ;----------------------------------------------------------------------------- ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- -%macro DEQUANT_WxH 3 +%macro DEQUANT_WxH 4 cglobal %1, 0,3 %ifdef ARCH_X86_64 %define t0 r4 %define t0d r4d imul r4d, r2d, 0x2b shr r4d, 8 ; i_qbits = i_qp / 6 - lea r3d, [r4d*3] - sub r2, r3 - sub r2, r3 ; i_mf = i_qp % 6 - shl r2, %3+2 + lea r3, [r4*3] + sub r2d, r3d + sub r2d, r3d ; i_mf = i_qp % 6 + shl r2d, %3+2 add r1, r2 ; dequant_mf[i_mf] %else %define t0 r2 @@ -222,19 +221,27 @@ cglobal %1, 0,3 jl .rshift32 ; negative qbits => rightshift .lshift: - movd mm5, t0d - DEQUANT_LOOP DEQUANT16_L_1x4, %2 + movd m5, t0d + DEQUANT_LOOP DEQUANT16_L, %2, %4 .rshift32: neg t0d - movd mm5, t0d + movd m5, t0d picgetgot t0d - movq mm6, [pd_1 GLOBAL] - pxor mm7, mm7 - pslld mm6, mm5 - psrld mm6, 1 - DEQUANT_LOOP DEQUANT32_R_1x4, %2 + movq m4, [pw_1 GLOBAL] + movq m6, [pd_10000 GLOBAL] + psllw m4, m5 + pxor m7, m7 + psrlw m4, 1 + DEQUANT_LOOP DEQUANT32_R, %2, %4 %endmacro -DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4 -DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6 +%ifndef ARCH_X86_64 +INIT_MMX +DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4, 1 +DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6, 1 +%endif +INIT_XMM +DEQUANT_WxH x264_dequant_4x4_sse2, 4, 4, 2 +DEQUANT_WxH x264_dequant_8x8_sse2, 16, 6, 2 + diff --git a/common/x86/quant.h b/common/x86/quant.h index 8532fde9..587286c3 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -35,5 +35,7 @@ void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); +void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); +void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); #endif diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index b8fbe13a..256aa2e4 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -328,3 +328,31 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %assign FENC_STRIDE 16 %assign FDEC_STRIDE 32 +%macro INIT_MMX 0 + %undef movq + %define m0 mm0 + %define m1 mm1 + %define m2 mm2 + %define m3 mm3 + %define m4 mm4 + %define m5 mm5 + %define m6 mm6 + %define m7 mm7 + %undef m8 + %undef m9 +%endmacro + +%macro INIT_XMM 0 + %define movq movdqa + %define m0 xmm0 + %define m1 xmm1 + %define m2 xmm2 + %define m3 xmm3 + %define m4 xmm4 + %define m5 xmm5 + %define m6 xmm6 + %define m7 xmm7 + %define m8 xmm8 + %define m9 xmm9 +%endmacro +