int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
- p_dst[od] = p_src[oe];\
}
+#define COPY4x4 /* copy a 4x4 byte block from p_src (row pitch FENC_STRIDE) to p_dst (row pitch FDEC_STRIDE), one 32-bit store per row; assumes uint32_t access to these uint8_t buffers is acceptable (alignment/aliasing) - TODO confirm */\
+ *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE); /* row 0 */\
+ *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE); /* row 1 */\
+ *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE); /* row 2 */\
+ *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE); /* row 3; NOTE(review): the trailing backslash continues the macro onto the next line - confirm that line is blank */\
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) /* frame-scan variant: level[i] = p_src pixel - p_dst pixel for each ZIG entry, then p_dst is overwritten with p_src */
{
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) /* ZIG arguments are presumably (output index, y, x) - confirm against the macro header */
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ COPY4x4 /* must follow every ZIG: ZIG reads p_dst, COPY4x4 replaces its 4x4 block with p_src */
}
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) /* field-scan variant: level[i] = p_src pixel - p_dst pixel for each ZIG entry, then p_dst is overwritten with p_src */
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1) /* ZIG arguments are presumably (output index, y, x) - confirm against the macro header */
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
+ COPY4x4 /* must follow every ZIG: ZIG reads p_dst, COPY4x4 replaces its 4x4 block with p_src */
}
static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) /* 15-entry frame-scan variant: same positions as zigzag_sub_4x4_frame with each output index one lower; p_dst is then overwritten with p_src */
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2) /* ZIG arguments are presumably (output index, y, x) - confirm against the macro header */
ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
+ COPY4x4 /* must follow every ZIG: ZIG reads p_dst, COPY4x4 replaces its 4x4 block with p_src */
}
static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) /* 15-entry field-scan variant: same positions as zigzag_sub_4x4_field with each output index one lower; p_dst is then overwritten with p_src */
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1) /* ZIG arguments are presumably (output index, y, x) - confirm against the macro header */
ZIG( 7,0,2) ZIG( 8,1,2) ZIG( 9,2,2) ZIG(10,3,2)
ZIG(11,0,3) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,3)
+ COPY4x4 /* must follow every ZIG: ZIG reads p_dst, COPY4x4 replaces its 4x4 block with p_src */
}
#undef ZIG
+#undef COPY4x4
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
+#ifdef HAVE_SSE3
+ if( cpu&X264_CPU_SSSE3 )
+ pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+#endif
+
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
SECTION_RODATA
pw_1: times 8 dw 1
pw_32: times 8 dw 32
+pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 ; pshufb control mask: output byte i is taken from input byte pb_zigzag4[i] (raster index x+4*y of a packed 4x4 block); loaded with movdqa, so it must be 16-byte aligned
SECTION .text
mov [r0+12], r2d
RET
+%ifdef HAVE_SSE3 ; NOTE(review): guard is HAVE_SSE3 although the body uses the SSSE3 instruction pshufb - presumably an assembler-capability flag; confirm
+;-----------------------------------------------------------------------------
+; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 ; r0=level, r1=src, r2=dst, assuming arguments map to r0..r2 in prototype order
+ movd xmm0, [r1+0*FENC_STRIDE] ; load 4 bytes of src row 0
+ movd xmm1, [r1+1*FENC_STRIDE] ; src row 1
+ movd xmm2, [r1+2*FENC_STRIDE] ; src row 2
+ movd xmm3, [r1+3*FENC_STRIDE] ; src row 3
+ movd xmm4, [r2+0*FDEC_STRIDE] ; load 4 bytes of dst row 0 (before it is overwritten below)
+ movd xmm5, [r2+1*FDEC_STRIDE] ; dst row 1
+ movd xmm6, [r2+2*FDEC_STRIDE] ; dst row 2
+ movd xmm7, [r2+3*FDEC_STRIDE] ; dst row 3; NOTE(review): xmm6/xmm7 are clobbered without being saved - matters if a Win64-style ABI is ever targeted; confirm
+ movd [r2+0*FDEC_STRIDE], xmm0 ; copy src row 0 into dst (old dst values are already held in xmm4..xmm7)
+ movd [r2+1*FDEC_STRIDE], xmm1 ; copy src row 1 into dst
+ movd [r2+2*FDEC_STRIDE], xmm2 ; copy src row 2 into dst
+ movd [r2+3*FDEC_STRIDE], xmm3 ; copy src row 3 into dst
+ picgetgot r1 ; presumably sets up the GOT base for the GLOBAL reference below; r1 (src) is not read again after this point
+ punpckldq xmm0, xmm1 ; xmm0 low 8 bytes = src rows 0,1
+ punpckldq xmm2, xmm3 ; xmm2 low 8 bytes = src rows 2,3
+ punpckldq xmm4, xmm5 ; xmm4 low 8 bytes = dst rows 0,1
+ punpckldq xmm6, xmm7 ; xmm6 low 8 bytes = dst rows 2,3
+ movlhps xmm0, xmm2 ; xmm0 = all 16 src bytes in raster order
+ movlhps xmm4, xmm6 ; xmm4 = all 16 dst bytes in raster order
+ movdqa xmm7, [pb_zigzag4 GLOBAL] ; byte-permutation mask
+ pshufb xmm0, xmm7 ; reorder src bytes according to pb_zigzag4
+ pshufb xmm4, xmm7 ; reorder dst bytes the same way
+ pxor xmm6, xmm6 ; xmm6 = 0, used to zero-extend bytes to words
+ movdqa xmm1, xmm0 ; keep a copy of src for the high half
+ movdqa xmm5, xmm4 ; keep a copy of dst for the high half
+ punpcklbw xmm0, xmm6 ; src bytes 0..7 -> 16-bit words
+ punpckhbw xmm1, xmm6 ; src bytes 8..15 -> 16-bit words
+ punpcklbw xmm4, xmm6 ; dst bytes 0..7 -> 16-bit words
+ punpckhbw xmm5, xmm6 ; dst bytes 8..15 -> 16-bit words
+ psubw xmm0, xmm4 ; words 0..7 = src - dst
+ psubw xmm1, xmm5 ; words 8..15 = src - dst
+ movdqa [r0], xmm0 ; store level[0..7]; movdqa requires r0 to be 16-byte aligned - verify callers
+ movdqa [r0+16], xmm1 ; store level[8..15]
+ RET
+%endif