From: Loren Merritt
Date: Wed, 9 Apr 2008 02:16:50 +0000 (-0600)
Subject: dct4 sse2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=539103a59525903e82a013d7a300d2a786d35d22;p=libx264

dct4 sse2
---

diff --git a/common/dct.c b/common/dct.c
index 5822206f..bdc92929 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -420,6 +420,13 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
     }
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+    {
+        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
+        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
+        dctf->add16x16_idct = x264_add16x16_idct_sse2;
+    }
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 5c3a8336..5491b238 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -32,9 +32,9 @@ pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
 SECTION .text
 
 %macro LOAD_DIFF_4P 5
-    movd        %1, %4
+    movh        %1, %4
     punpcklbw   %1, %3
-    movd        %2, %5
+    movh        %2, %5
     punpcklbw   %2, %3
     psubw       %1, %2
 %endmacro
@@ -55,7 +55,7 @@ SECTION .text
 %endmacro
 
 %macro SUMSUB2_AB 3
-    movq    %3, %1
+    mova    %3, %1
     paddw   %1, %1
    paddw   %1, %2
     psubw   %3, %2
@@ -63,8 +63,8 @@ SECTION .text
 %endmacro
 
 %macro SUMSUBD2_AB 4
-    movq    %4, %1
-    movq    %3, %2
+    mova    %4, %1
+    mova    %3, %2
     psraw   %2, 1
     psraw   %4, 1
     paddw   %1, %2
@@ -86,14 +86,22 @@ SECTION .text
     SWAP %2, %3
 %endmacro
 
-%macro STORE_DIFF_4P 5
-    paddw     %1, %3
+%macro TRANSPOSE2x4x4W 5
+    SBUTTERFLY wd,  %1, %2, %5
+    SBUTTERFLY wd,  %3, %4, %5
+    SBUTTERFLY dq,  %1, %3, %5
+    SBUTTERFLY dq,  %2, %4, %5
+    SBUTTERFLY qdq, %1, %2, %5
+    SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro STORE_DIFF_4P 4
     psraw     %1, 6
-    movd      %2, %5
-    punpcklbw %2, %4
+    movh      %2, %4
+    punpcklbw %2, %3
     paddsw    %1, %2
     packuswb  %1, %1
-    movd      %5, %1
+    movh      %4, %1
 %endmacro
 
 %macro HADAMARD4_1D 4
@@ -164,17 +172,20 @@ cglobal x264_idct4x4dc_mmx, 1,1
 ;-----------------------------------------------------------------------------
 cglobal x264_sub4x4_dct_mmx, 3,3
 .skip_prologue:
+%macro SUB_DCT4 1
     LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
     LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
     LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
     LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
     DCT4_1D 0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
+    TRANSPOSE%1 0,1,2,3,4
     DCT4_1D 0,1,2,3,4
     movq [r0+ 0], m0
     movq [r0+ 8], m1
     movq [r0+16], m2
     movq [r0+24], m3
+%endmacro
+    SUB_DCT4 4x4W
     RET
 
 ;-----------------------------------------------------------------------------
@@ -186,18 +197,52 @@ cglobal x264_add4x4_idct_mmx, 2,2,1
     movq  m0, [r1+ 0]
     movq  m1, [r1+ 8]
     movq  m2, [r1+16]
     movq  m3, [r1+24]
+%macro ADD_IDCT4 1
     IDCT4_1D 0,1,2,3,4,5
-    TRANSPOSE4x4W 0,1,2,3,4
+    TRANSPOSE%1 0,1,2,3,4
+    paddw m0, [pw_32 GLOBAL]
     IDCT4_1D 0,1,2,3,4,5
     pxor  m7, m7
-    movq  m6, [pw_32 GLOBAL]
-    STORE_DIFF_4P m0, m4, m6, m7, [r0+0*FDEC_STRIDE]
-    STORE_DIFF_4P m1, m4, m6, m7, [r0+1*FDEC_STRIDE]
-    STORE_DIFF_4P m2, m4, m6, m7, [r0+2*FDEC_STRIDE]
-    STORE_DIFF_4P m3, m4, m6, m7, [r0+3*FDEC_STRIDE]
+    STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
+    STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
+    STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
+    STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
+%endmacro
+    ADD_IDCT4 4x4W
     RET
+INIT_XMM
+cglobal x264_sub8x8_dct_sse2, 3,3
+.skip_prologue:
+    call .8x4
+    add  r0, 64
+    add  r1, 4*FENC_STRIDE
+    add  r2, 4*FDEC_STRIDE
+.8x4:
+    SUB_DCT4 2x4x4W
+    movhps [r0+32], m0
+    movhps [r0+40], m1
+    movhps [r0+48], m2
+    movhps [r0+56], m3
+    ret
+
+cglobal x264_add8x8_idct_sse2, 2,2,1
+.skip_prologue:
+    call .8x4
+    add  r1, 64
+    add  r0, 4*FDEC_STRIDE
+.8x4:
+    movq   m0, [r1+ 0]
+    movq   m1, [r1+ 8]
+    movq   m2, [r1+16]
+    movq   m3, [r1+24]
+    movhps m0, [r1+32]
+    movhps m1, [r1+40]
+    movhps m2, [r1+48]
+    movhps m3, [r1+56]
+    ADD_IDCT4 2x4x4W
+    ret
 
 ;-----------------------------------------------------------------------------
 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
@@ -207,16 +252,16 @@ cglobal %1, 3,3
 .skip_prologue:
     call %2
     add  r0, %3
-    add  r1, %4-%5*FENC_STRIDE
-    add  r2, %4-%5*FDEC_STRIDE
+    add  r1, %4-%5-%6*FENC_STRIDE
+    add  r2, %4-%5-%6*FDEC_STRIDE
     call %2
     add  r0, %3
-    add  r1, %4*FENC_STRIDE-%6
-    add  r2, %4*FDEC_STRIDE-%6
+    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
+    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
     call %2
     add  r0, %3
-    add  r1, %4-%5*FENC_STRIDE
-    add  r2, %4-%5*FDEC_STRIDE
+    add  r1, %4-%5-%6*FENC_STRIDE
+    add  r2, %4-%5-%6*FDEC_STRIDE
     jmp  %2
 %endmacro
 
@@ -227,36 +272,39 @@ cglobal %1, 2,2,1
 .skip_prologue:
     call %2
-    add  r0, %4-%5*FDEC_STRIDE
+    add  r0, %4-%5-%6*FDEC_STRIDE
     add  r1, %3
     call %2
-    add  r0, %4*FDEC_STRIDE-%6
+    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
     add  r1, %3
     call %2
-    add  r0, %4-%5*FDEC_STRIDE
+    add  r0, %4-%5-%6*FDEC_STRIDE
     add  r1, %3
     jmp  %2
 %endmacro
 
-SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx %+ .skip_prologue,  32, 4, 0, 4
-ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 4
+SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx %+ .skip_prologue,  32, 4, 0, 0
+ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
+
+SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx %+ .skip_prologue,  32, 8, 4, 4
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
 
-SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx %+ .skip_prologue,  32, 4, 4, 12
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 4, 4, 12
+SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2 %+ .skip_prologue,  64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
 
 %ifndef ARCH_X86_64
 cextern x264_sub8x8_dct8_mmx.skip_prologue
 cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx %+ .skip_prologue,  128, 8, 0, 8
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 8
+SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx %+ .skip_prologue,  128, 8, 0, 0
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
 %define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
 %endif
 cextern x264_sub8x8_dct8_sse2
 cextern x264_add8x8_idct8_sse2
-SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 8
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
+SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 95c9d60c..7dd60e8e 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -26,23 +26,25 @@
 void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
 void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
 void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
 
 void x264_dct4x4dc_mmx( int16_t d[4][4] );
 void x264_idct4x4dc_mmx( int16_t d[4][4] );
 
 void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
-
 void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
 
+void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
 void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
 void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
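
For reference, the 4x4 butterflies these SSE2 routines vectorize are the H.264 core transform as written in x264's scalar code (common/dct.c): the DCT4_1D / TRANSPOSE / DCT4_1D sequence above is one butterfly pass over rows, a transpose, then the same pass over columns. A minimal C sketch of the two 1-D passes (the names dct4_1d and idct4_1d are illustrative, not x264 API):

    #include <stdint.h>

    /* Forward 1-D pass: the adds/subs DCT4_1D performs with paddw/psubw;
       SUMSUB2_AB above builds the 2*x +/- y terms. */
    static void dct4_1d( int16_t d[4] )
    {
        int s03 = d[0] + d[3], s12 = d[1] + d[2];
        int d03 = d[0] - d[3], d12 = d[1] - d[2];
        d[0] = s03 + s12;
        d[1] = 2*d03 + d12;
        d[2] = s03 - s12;
        d[3] = d03 - 2*d12;
    }

    /* Inverse 1-D pass: the >>1 terms are what SUMSUBD2_AB implements with psraw. */
    static void idct4_1d( int16_t d[4] )
    {
        int s02 = d[0] + d[2],        d02 = d[0] - d[2];
        int s13 = d[1] + (d[3] >> 1), d13 = (d[1] >> 1) - d[3];
        d[0] = s02 + s13;
        d[1] = d02 + d13;
        d[2] = d02 - d13;
        d[3] = s02 - s13;
    }

The paddw m0, [pw_32 GLOBAL] that replaces STORE_DIFF_4P's per-row paddw works because the inverse pass is linear and every idct4_1d output contains d[0] with weight +1: adding the rounding bias of 32 to row 0 between the two passes puts +32 in every sample ahead of the final psraw 6, saving three paddw and the pw_32 load into m6.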
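
TRANSPOSE2x4x4W is what lets one XMM register carry two 4x4 blocks at a time, block A in the low 64 bits and block B in the high 64 bits (the movq/movhps pairs in x264_add8x8_idct_sse2 load exactly that layout); the wd/dq/qdq SBUTTERFLY ladder transposes both halves without ever mixing data between them. A rough C intrinsics equivalent, assuming SBUTTERFLY's usual punpckl/punpckh expansion (transpose2x4x4w is an illustrative name):

    #include <emmintrin.h>

    /* r[i] holds row i of 4x4 int16 block A in its low half and row i of
       block B in its high half; both blocks are transposed in place. */
    static void transpose2x4x4w( __m128i r[4] )
    {
        __m128i a_r01 = _mm_unpacklo_epi16( r[0], r[1] );   /* a00 a10 a01 a11 a02 a12 a03 a13 */
        __m128i b_r01 = _mm_unpackhi_epi16( r[0], r[1] );   /* b00 b10 b01 b11 b02 b12 b03 b13 */
        __m128i a_r23 = _mm_unpacklo_epi16( r[2], r[3] );
        __m128i b_r23 = _mm_unpackhi_epi16( r[2], r[3] );
        __m128i a_c01 = _mm_unpacklo_epi32( a_r01, a_r23 ); /* a00 a10 a20 a30 a01 a11 a21 a31 */
        __m128i a_c23 = _mm_unpackhi_epi32( a_r01, a_r23 ); /* a02 a12 a22 a32 a03 a13 a23 a33 */
        __m128i b_c01 = _mm_unpacklo_epi32( b_r01, b_r23 );
        __m128i b_c23 = _mm_unpackhi_epi32( b_r01, b_r23 );
        r[0] = _mm_unpacklo_epi64( a_c01, b_c01 );          /* column 0 of A | column 0 of B */
        r[1] = _mm_unpackhi_epi64( a_c01, b_c01 );          /* column 1 of A | column 1 of B */
        r[2] = _mm_unpacklo_epi64( a_c23, b_c23 );
        r[3] = _mm_unpackhi_epi64( a_c23, b_c23 );
    }

This pairing is why SUB_DCT4 and ADD_IDCT4 are parameterized only on the transpose: the same DCT4_1D/IDCT4_1D code handles one 4x4 block per call under MMX (TRANSPOSE4x4W) and an 8x4 strip per call under SSE2 (TRANSPOSE2x4x4W), with the sse2 functions calling their .8x4 half twice per 8x8 block.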