%endmacro
%macro SUM4x8_MM 0
; Takes no macro parameters; works on mm0-mm7 and on the `spill` address that
; the including code must define. Only the head of the macro is visible in
; this view (its %endmacro is not shown), so the final use of the sums is not
; documented here.
;
; Removed sequence: save mm7 in the 8-byte slot at [spill] and pass it as the
; second operand of each MMX_ABS on mm0..mm3.
- movq [spill], mm7
- MMX_ABS mm0, mm7
- MMX_ABS mm1, mm7
- MMX_ABS mm2, mm7
- MMX_ABS mm3, mm7
; Added sequence: save both mm6 and mm7 so MMX_ABS_TWO can process two
; registers per invocation. This writes 8 bytes at [spill] and 8 more at
; [spill+8], so `spill` must address at least 16 writable bytes.
+ movq [spill], mm6
+ movq [spill+8], mm7
; NOTE(review): the last two operands look like scratch registers (they are
; saved before use and reloaded afterwards) -- confirm against the
; MMX_ABS_TWO definition, which is not visible here.
+ MMX_ABS_TWO mm0, mm1, mm6, mm7
+ MMX_ABS_TWO mm2, mm3, mm6, mm7
; Fold the second pair into the first: mm0 += mm2, mm1 += mm3 (packed words).
paddw mm0, mm2
paddw mm1, mm3
; Reload the saved register(s). mm2/mm3 were consumed by the adds above and
; are passed as the trailing operands of the remaining MMX_ABS_TWO calls.
- movq mm7, [spill]
+ movq mm6, [spill]
+ movq mm7, [spill+8]
MMX_ABS_TWO mm4, mm5, mm2, mm3
MMX_ABS_TWO mm6, mm7, mm2, mm3
paddw mm4, mm6
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_mmxext:
; NOTE(review): only selected parts of this routine are visible in this view;
; the steps between the shown instructions are not documented here.
SATD_START
; Stack frame: the reservation grows from 0x68 to 0x70 bytes. `spill` stays at
; esp+0x60, so the space from `spill` to the end of the frame grows from 8 to
; 16 bytes. `args` moves by the same 8 bytes (0x6c -> 0x74) and therefore
; keeps its position relative to the stack pointer as it was before the `sub`.
- sub esp, 0x68
-%define args esp+0x6c
+ sub esp, 0x70
+%define args esp+0x74
%define spill esp+0x60
LOAD_DIFF_4x8P 0
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
; mm0 is saved because it is handed to TRANSPOSE4x4 as its fifth operand.
movq [spill], mm0
; The comment on the removed line ("abcd-t -> adtc") indicates that for
; operands (a, b, c, d, t) the results come back in a, d, t, c order; the
; stores that follow use that order.
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 ; abcd-t -> adtc
+ TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
movq [esp+0x00], mm4
movq [esp+0x08], mm7
movq [esp+0x10], mm0
LOAD_DIFF_4x8P 4
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
; The fifth TRANSPOSE4x4 operand changes from mm4 to mm7: mm7 is saved to
; [spill], used in the transpose, stored at esp+0x50, and reloaded from
; [spill] before the transpose of mm4..mm7 further down.
- movq [spill], mm4
- TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+ movq [spill], mm7
+ TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
movq [esp+0x40], mm0
movq [esp+0x48], mm3
- movq [esp+0x50], mm4
+ movq [esp+0x50], mm7
movq [esp+0x58], mm2
- movq mm4, [spill]
+ movq mm7, [spill]
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
movq mm5, [esp+0x00]
movq mm1, [esp+0x08]
mov ecx, eax ; preserve rounding for 16x16
; eax = (eax + 1) >> 1, a halving that rounds up; ecx keeps the value from
; before this step.
add eax, 1
shr eax, 1
; Release the frame; the amount must match the `sub esp` above.
- add esp, 0x68
+ add esp, 0x70
; NOTE(review): ebx is presumably pushed inside SATD_START -- confirm.
pop ebx
ret
%undef args
/* 8x8 prediction selection */
/* NOTE(review): this is an excerpt from inside a larger function; the loops
 * that define idx, i_mode, i_best, p_src_by and p_dst_by are not visible. */
if( flags & X264_ANALYSE_I8x8 )
{
/* Pick the 8x8 metric once: if the first entries of the mbcmp and sad tables
 * are the same function, use sad[PIXEL_8x8]; otherwise use sa8d[PIXEL_8x8]. */
+ x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
/* Threshold is COST_MAX when a->b_mbrd is set, else the smaller of the inter
 * cost and the i16x16 cost. */
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
/* NOTE(review): presumably writes the prediction for i_mode into p_dst_by --
 * confirm against the predict_8x8 implementations. */
h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
- /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
- i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
- p_src_by, FENC_STRIDE )
- + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
/* Cost = selected metric on (p_dst_by, p_src_by) plus a lambda-weighted
 * penalty: weight 1 when i_mode (after the 4x4 "fix" mapping) equals
 * i_pred_mode, weight 4 otherwise. */
+ i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
+ + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
/* NOTE(review): COPY2_IF_LT presumably updates i_best and the stored mode
 * when i_satd is lower -- macro definition not visible here. */
COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
/* The 4x4 path keeps using mbcmp[PIXEL_4x4]; the -/+ pair below appears to
 * differ only in whitespace. */
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
- + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+ + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
}