From: Loren Merritt Date: Sat, 22 Mar 2008 08:46:31 +0000 (-0600) Subject: faster lossless zigzag X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b437d2d4c90056b1dcb4f3220234d06d03f3e9b4;p=libx264 faster lossless zigzag --- diff --git a/common/dct.c b/common/dct.c index 895306b5..8b57055f 100644 --- a/common/dct.c +++ b/common/dct.c @@ -542,8 +542,12 @@ static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] ) int oe = x+y*FENC_STRIDE;\ int od = x+y*FDEC_STRIDE;\ level[i] = p_src[oe] - p_dst[od];\ - p_dst[od] = p_src[oe];\ } +#define COPY4x4\ + *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\ + *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\ + *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\ + *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);\ static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) { @@ -551,6 +555,7 @@ static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) + COPY4x4 } static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) @@ -559,6 +564,7 @@ static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8 ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1) ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2) ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) + COPY4x4 } static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) @@ -567,6 +573,7 @@ static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uin ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2) ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2) ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3) + COPY4x4 } static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) @@ -575,9 +582,11 @@ static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uin ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1) ZIG( 7,0,2) ZIG( 8,1,2) ZIG( 9,2,2) ZIG(10,3,2) ZIG(11,0,3) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,3) + COPY4x4 } #undef ZIG +#undef COPY4x4 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) { @@ -609,6 +618,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) pf->sub_4x4 = zigzag_sub_4x4_frame; pf->sub_4x4ac = zigzag_sub_4x4ac_frame; +#ifdef HAVE_SSE3 + if( cpu&X264_CPU_SSSE3 ) + pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; +#endif + #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) { diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 0c6d463b..0b21f6b2 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -27,6 +27,7 @@ SECTION_RODATA pw_1: times 8 dw 1 pw_32: times 8 dw 32 +pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 SECTION .text @@ -290,3 +291,43 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 mov [r0+12], r2d RET +%ifdef HAVE_SSE3 +;----------------------------------------------------------------------------- +; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ) +;----------------------------------------------------------------------------- +cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 + movd xmm0, [r1+0*FENC_STRIDE] + movd xmm1, [r1+1*FENC_STRIDE] + movd xmm2, [r1+2*FENC_STRIDE] + movd xmm3, [r1+3*FENC_STRIDE] + movd xmm4, [r2+0*FDEC_STRIDE] + movd xmm5, [r2+1*FDEC_STRIDE] + movd xmm6, [r2+2*FDEC_STRIDE] + movd xmm7, [r2+3*FDEC_STRIDE] + movd [r2+0*FDEC_STRIDE], xmm0 + movd [r2+1*FDEC_STRIDE], xmm1 + movd [r2+2*FDEC_STRIDE], xmm2 + movd [r2+3*FDEC_STRIDE], xmm3 + picgetgot r1 + punpckldq xmm0, xmm1 + punpckldq xmm2, xmm3 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + movlhps xmm0, xmm2 + movlhps xmm4, xmm6 + movdqa xmm7, [pb_zigzag4 GLOBAL] + pshufb xmm0, xmm7 + pshufb xmm4, xmm7 + pxor xmm6, xmm6 + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + punpcklbw xmm0, xmm6 + punpckhbw xmm1, xmm6 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + movdqa [r0], xmm0 + movdqa [r0+16], xmm1 + RET +%endif diff --git a/common/x86/dct.h b/common/x86/dct.h index 5b88dbea..95c9d60c 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -47,5 +47,6 @@ void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] ); void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] ); void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ); +void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ); #endif