}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
SECTION_RODATA
pb_3: times 16 db 3
+; pshufb masks for broadcasting the packed dc0..dc3 words (low byte of each
+; word) into two 8-byte DC prediction rows — see INTRA_SAD_8x8C ssse3 path
+pb_shuf8x8c0: db 0,0,0,0,2,2,2,2 ; 4x dc0, 4x dc1 (top half of the block)
+pb_shuf8x8c1: db 4,4,4,4,6,6,6,6 ; 4x dc2, 4x dc3 (bottom half of the block)
sw_64: dd 64
SECTION .text
SAD_END_SSE2
RET
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+; One step of the H/V cost loop: SADs two fenc rows (%1 and %1+1) against the
+; H prediction (left-neighbour byte broadcast across the row) and the V
+; prediction (top row, held in m6). Accumulates H cost in m0, V cost in m2.
+; %1 = row index (0/2/4/6), %2 = cpu suffix (mmxext or ssse3).
+; Assumes r1 has been biased by +FDEC_STRIDE*4 (see INTRA_SAD_8x8C).
+%macro INTRA_SAD_HV_ITER 2
+%ifidn %2, ssse3
+    ; ssse3: load 4 bytes ending at the left neighbour, then pshufb with
+    ; m7 = pb_3 to splat byte 3 (the neighbour) across all lanes
+    movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
+    movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
+    pshufb m1, m7
+    pshufb m3, m7
+%else
+    ; mmxext: load 8 bytes ending at the left neighbour and splat the top
+    ; byte via punpckhbw + pshufw
+    movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
+    movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
+    punpckhbw m1, m1
+    punpckhbw m3, m3
+    pshufw m1, m1, 0xff
+    pshufw m3, m3, 0xff
+%endif
+    movq m4, [r0 + FENC_STRIDE*(%1+0)]
+    movq m5, [r0 + FENC_STRIDE*(%1+1)]
+    psadbw m1, m4 ; H cost, row %1
+    psadbw m3, m5 ; H cost, row %1+1
+    psadbw m4, m6 ; V cost (vs top row), row %1
+    psadbw m5, m6 ; V cost (vs top row), row %1+1
+    paddw m1, m3
+    paddw m4, m5
+%if %1
+    ; later iterations: add into the running accumulators
+    paddw m0, m1
+    paddw m2, m4
+%else
+    ; first iteration: initialize accumulators by register rename (no movq)
+    SWAP 0,1
+    SWAP 2,4
+%endif
+%endmacro
+
+; x264_intra_sad_x3_8x8c_<cpu>( uint8_t *fenc, uint8_t *fdec, int res[3] )
+; Computes the SADs of the 8x8 chroma block against its three intra
+; predictions: res[0] = DC, res[1] = H, res[2] = V (I_PRED_CHROMA_* order).
+; Both instantiations use MMX registers (INIT_MMX); the ssse3 variant only
+; replaces the broadcast/shuffle sequences with pshufb.
+%macro INTRA_SAD_8x8C 1
+cglobal x264_intra_sad_x3_8x8c_%1, 3,3
+    movq m6, [r1 - FDEC_STRIDE] ; top neighbour row = V prediction
+    add r1, FDEC_STRIDE*4 ; bias r1 so all row offsets fit small disps
+%ifidn %1,ssse3
+    movq m7, [pb_3 GLOBAL] ; pshufb mask: broadcast byte 3
+%endif
+    INTRA_SAD_HV_ITER 0, %1
+    INTRA_SAD_HV_ITER 2, %1
+    INTRA_SAD_HV_ITER 4, %1
+    INTRA_SAD_HV_ITER 6, %1
+    movd [r2+4], m0 ; res[1] = H cost
+    movd [r2+8], m2 ; res[2] = V cost
+    ; --- DC prediction ---
+    ; Needs four partial sums: s0/s1 = left/right halves of the top row,
+    ; s2/s3 = top/bottom halves of the left column.
+    pxor m7, m7
+    ; Interleave row pairs so the left-neighbour bytes (byte 7 of each load)
+    ; end up adjacent, then shift them down and sum them with psadbw vs 0.
+    movq m2, [r1 + FDEC_STRIDE*-4 - 8]
+    movq m4, [r1 + FDEC_STRIDE*-2 - 8]
+    movq m3, [r1 + FDEC_STRIDE* 0 - 8]
+    movq m5, [r1 + FDEC_STRIDE* 2 - 8]
+    punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
+    punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
+    punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
+    punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
+    punpckhbw m2, m4
+    punpckhbw m3, m5
+    psrlq m2, 32
+    psrlq m3, 32
+    psadbw m2, m7 ; s2
+    psadbw m3, m7 ; s3
+    movq m1, m6
+    SWAP 0, 6 ; rename the top row (m6) to m0; m6/m7 are reused below
+    punpckldq m0, m7 ; left half of top row
+    punpckhdq m1, m7 ; right half of top row
+    psadbw m0, m7 ; s0
+    psadbw m1, m7 ; s1
+    punpcklwd m0, m1
+    punpcklwd m2, m3
+    punpckldq m0, m2 ;s0 s1 s2 s3
+    ; Combine into the four quadrant DC values (same formulas as
+    ; predict_8x8c_dc, up to psrlw/pavgw rounding):
+    ; dc0=(s0+s2+4)>>3, dc1=(s1+2)>>2, dc2=(s3+2)>>2, dc3=(s1+s3+4)>>3
+    pshufw m3, m0, 11110110b ;s2,s1,s3,s3
+    pshufw m0, m0, 01110100b ;s0,s1,s3,s1
+    paddw m0, m3
+    psrlw m0, 2
+    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+%ifidn %1, ssse3
+    ; broadcast dc0..dc3 into the two 8-byte DC prediction rows
+    movq m1, m0
+    pshufb m0, [pb_shuf8x8c0 GLOBAL] ; 4x dc0, 4x dc1 (rows 0-3)
+    pshufb m1, [pb_shuf8x8c1 GLOBAL] ; 4x dc2, 4x dc3 (rows 4-7)
+%else
+    packuswb m0, m0
+    punpcklbw m0, m0
+    movq m1, m0
+    punpcklbw m0, m0 ; 4x dc0 4x dc1
+    punpckhbw m1, m1 ; 4x dc2 4x dc3
+%endif
+    ; SAD all 8 fenc rows against the DC prediction (m0 top, m1 bottom)
+    movq m2, [r0+FENC_STRIDE*0]
+    movq m3, [r0+FENC_STRIDE*1]
+    movq m4, [r0+FENC_STRIDE*2]
+    movq m5, [r0+FENC_STRIDE*3]
+    movq m6, [r0+FENC_STRIDE*4]
+    movq m7, [r0+FENC_STRIDE*5]
+    psadbw m2, m0
+    psadbw m3, m0
+    psadbw m4, m0
+    psadbw m5, m0
+    movq m0, [r0+FENC_STRIDE*6]
+    psadbw m6, m1
+    psadbw m7, m1
+    psadbw m0, m1
+    psadbw m1, [r0+FENC_STRIDE*7]
+    paddw m2, m3
+    paddw m4, m5
+    paddw m6, m7
+    paddw m0, m1
+    paddw m2, m4
+    paddw m6, m0
+    paddw m2, m6
+    movd [r2], m2 ; res[0] = DC cost
+    RET
+%endmacro
+
+INIT_MMX
+INTRA_SAD_8x8C mmxext
+INTRA_SAD_8x8C ssse3
+
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
%ifidn %1, ssse3
mova m1, [pb_3 GLOBAL]
%endif
-%assign n 0
+%assign x 0
%rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ movzx r4d, byte [r1-1+FDEC_STRIDE*x]
add r3d, r4d
-%assign n n+1
+%assign x x+1
%endrep
add r3d, 16
shr r3d, 5
int i_max;
int predict_mode[4];
+ int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
uint8_t *p_dstc[2], *p_srcc[2];
predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
a->i_satd_i8x8chroma = COST_MAX;
- if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
+ if( i_max == 4 && b_merged_satd )
{
int satdu[4], satdv[4];
- h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
- h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
+ h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
+ h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
satdu[I_PRED_CHROMA_P] =
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
+ h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+ TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||