%include "amd64inc.asm"
+; sad
+
%macro SAD_INC_2x16P 0
movq mm1, [parm1q]
movq mm2, [parm1q+8]
lea parm3q, [parm3q+2*parm4q]
%endmacro
+; sad x3 / x4
+
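+; The x3/x4 macros below compare one block (fenc, implicit stride FENC_STRIDE)
+; against 3 or 4 reference candidates that share a stride: each fenc row is
+; loaded once and reused for every psadbw, and the per-candidate sums
+; accumulate in mm0-mm2 (mm0-mm3 for the x4 variants).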
+%macro SAD_X3_START_1x8P 1
+ mov%1 mm3, [parm1q]
+ mov%1 mm0, [parm2q]
+ mov%1 mm1, [parm3q]
+ mov%1 mm2, [parm4q]
+ psadbw mm0, mm3
+ psadbw mm1, mm3
+ psadbw mm2, mm3
+%endmacro
+
+%macro SAD_X3_1x8P 3
+ mov%1 mm3, [parm1q+%2]
+ mov%1 mm4, [parm2q+%3]
+ mov%1 mm5, [parm3q+%3]
+ mov%1 mm6, [parm4q+%3]
+ psadbw mm4, mm3
+ psadbw mm5, mm3
+ psadbw mm6, mm3
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+%endmacro
+
+%macro SAD_X3_2x16P 1
+%if %1
+ SAD_X3_START_1x8P q
+%else
+ SAD_X3_1x8P q, 0, 0
+%endif
+ SAD_X3_1x8P q, 8, 8
+ SAD_X3_1x8P q, FENC_STRIDE, parm5q
+ SAD_X3_1x8P q, FENC_STRIDE+8, parm5q+8
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm5q]
+ lea parm3q, [parm3q+2*parm5q]
+ lea parm4q, [parm4q+2*parm5q]
+%endmacro
+
+%macro SAD_X3_2x8P 1
+%if %1
+ SAD_X3_START_1x8P q
+%else
+ SAD_X3_1x8P q, 0, 0
+%endif
+ SAD_X3_1x8P q, FENC_STRIDE, parm5q
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm5q]
+ lea parm3q, [parm3q+2*parm5q]
+ lea parm4q, [parm4q+2*parm5q]
+%endmacro
+
+%macro SAD_X3_2x4P 1
+%if %1
+ SAD_X3_START_1x8P d
+%else
+ SAD_X3_1x8P d, 0, 0
+%endif
+ SAD_X3_1x8P d, FENC_STRIDE, parm5q
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm5q]
+ lea parm3q, [parm3q+2*parm5q]
+ lea parm4q, [parm4q+2*parm5q]
+%endmacro
+
+%macro SAD_X4_START_1x8P 1
+ mov%1 mm7, [parm1q]
+ mov%1 mm0, [parm2q]
+ mov%1 mm1, [parm3q]
+ mov%1 mm2, [parm4q]
+ mov%1 mm3, [parm5q]
+ psadbw mm0, mm7
+ psadbw mm1, mm7
+ psadbw mm2, mm7
+ psadbw mm3, mm7
+%endmacro
+
+%macro SAD_X4_1x8P 2
+ movq mm7, [parm1q+%1]
+ movq mm4, [parm2q+%2]
+ movq mm5, [parm3q+%2]
+ movq mm6, [parm4q+%2]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ psadbw mm6, mm7
+ psadbw mm7, [parm5q+%2]
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+ paddw mm3, mm7
+%endmacro
+
+%macro SAD_X4_1x4P 2
+ movd mm7, [parm1q+%1]
+ movd mm4, [parm2q+%2]
+ movd mm5, [parm3q+%2]
+ movd mm6, [parm4q+%2]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ paddw mm0, mm4
+ psadbw mm6, mm7
+ movd mm4, [parm5q+%2]
+ paddw mm1, mm5
+ psadbw mm4, mm7
+ paddw mm2, mm6
+ paddw mm3, mm4
+%endmacro
+
+%macro SAD_X4_2x16P 1
+%if %1
+ SAD_X4_START_1x8P q
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P 8, 8
+ SAD_X4_1x8P FENC_STRIDE, parm6q
+ SAD_X4_1x8P FENC_STRIDE+8, parm6q+8
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm6q]
+ lea parm3q, [parm3q+2*parm6q]
+ lea parm4q, [parm4q+2*parm6q]
+ lea parm5q, [parm5q+2*parm6q]
+%endmacro
+
+%macro SAD_X4_2x8P 1
+%if %1
+ SAD_X4_START_1x8P q
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P FENC_STRIDE, parm6q
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm6q]
+ lea parm3q, [parm3q+2*parm6q]
+ lea parm4q, [parm4q+2*parm6q]
+ lea parm5q, [parm5q+2*parm6q]
+%endmacro
+
+%macro SAD_X4_2x4P 1
+%if %1
+ SAD_X4_START_1x8P d
+%else
+ SAD_X4_1x4P 0, 0
+%endif
+ SAD_X4_1x4P FENC_STRIDE, parm6q
+ add parm1q, 2*FENC_STRIDE
+ lea parm2q, [parm2q+2*parm6q]
+ lea parm3q, [parm3q+2*parm6q]
+ lea parm4q, [parm4q+2*parm6q]
+ lea parm5q, [parm5q+2*parm6q]
+%endmacro
+
+%macro SAD_X3_END 0
+ movd [parm6q+0], mm0
+ movd [parm6q+4], mm1
+ movd [parm6q+8], mm2
+ ret
+%endmacro
+
+%macro SAD_X4_END 0
+ mov rax, parm7q
+ movd [rax+0], mm0
+ movd [rax+4], mm1
+ movd [rax+8], mm2
+ movd [rax+12], mm3
+ ret
+%endmacro
+
+; ssd
+
%macro SSD_INC_1x16P 0
movq mm1, [rax]
movq mm2, [rcx]
SSD_INC_1x4P
%endmacro
+; satd
+
%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
movd %1, %3
movd %2, %4
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
+cglobal x264_pixel_sad_x3_16x16_mmxext
+cglobal x264_pixel_sad_x3_16x8_mmxext
+cglobal x264_pixel_sad_x3_8x16_mmxext
+cglobal x264_pixel_sad_x3_8x8_mmxext
+cglobal x264_pixel_sad_x3_8x4_mmxext
+cglobal x264_pixel_sad_x3_4x8_mmxext
+cglobal x264_pixel_sad_x3_4x4_mmxext
+
+cglobal x264_pixel_sad_x4_16x16_mmxext
+cglobal x264_pixel_sad_x4_16x8_mmxext
+cglobal x264_pixel_sad_x4_8x16_mmxext
+cglobal x264_pixel_sad_x4_8x8_mmxext
+cglobal x264_pixel_sad_x4_8x4_mmxext
+cglobal x264_pixel_sad_x4_4x8_mmxext
+cglobal x264_pixel_sad_x4_4x4_mmxext
+
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
SAD_END
+;-----------------------------------------------------------------------------
+; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
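+; Each SAD_X expansion runs the 2-row macro for its width height/2 times;
+; the first iteration initializes the accumulators, and SAD_X%1_END then
+; writes the 3 or 4 accumulated SADs to scores[].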
+%macro SAD_X 3
+ALIGN 16
+x264_pixel_sad_x%1_%2x%3_mmxext:
+ SAD_X%1_2x%2P 1
+%rep %3/2-1
+ SAD_X%1_2x%2P 0
+%endrep
+ SAD_X%1_END
+%endmacro
+
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+
%macro PDE_CHECK 0
movd eax, mm0
%include "i386inc.asm"
+; sad
+
%macro SAD_INC_2x16P 0
movq mm1, [eax]
movq mm2, [eax+8]
lea ecx, [ecx+2*edx]
%endmacro
+; sad x3 / x4
+
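+; On x86-32 all arguments arrive on the stack: the START macros save the
+; callee-saved registers and load fenc into edi, the reference pointers into
+; eax/ecx/edx (plus ebx for x4) and the stride into esi; the END macros store
+; the results through the scores pointer and restore the saved registers.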
+%macro SAD_X3_START_1x8P 1
+ push edi
+ push esi
+ mov edi, [esp+12]
+ mov eax, [esp+16]
+ mov ecx, [esp+20]
+ mov edx, [esp+24]
+ mov esi, [esp+28]
+ mov%1 mm3, [edi]
+ mov%1 mm0, [eax]
+ mov%1 mm1, [ecx]
+ mov%1 mm2, [edx]
+ psadbw mm0, mm3
+ psadbw mm1, mm3
+ psadbw mm2, mm3
+%endmacro
+
+%macro SAD_X3_1x8P 3
+ mov%1 mm3, [edi+%2]
+ mov%1 mm4, [eax+%3]
+ mov%1 mm5, [ecx+%3]
+ mov%1 mm6, [edx+%3]
+ psadbw mm4, mm3
+ psadbw mm5, mm3
+ psadbw mm6, mm3
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+%endmacro
+
+%macro SAD_X3_2x16P 1
+%if %1
+ SAD_X3_START_1x8P q
+%else
+ SAD_X3_1x8P q, 0, 0
+%endif
+ SAD_X3_1x8P q, 8, 8
+ SAD_X3_1x8P q, FENC_STRIDE, esi
+ SAD_X3_1x8P q, FENC_STRIDE+8, esi+8
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X3_2x8P 1
+%if %1
+ SAD_X3_START_1x8P q
+%else
+ SAD_X3_1x8P q, 0, 0
+%endif
+ SAD_X3_1x8P q, FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X3_2x4P 1
+%if %1
+ SAD_X3_START_1x8P d
+%else
+ SAD_X3_1x8P d, 0, 0
+%endif
+ SAD_X3_1x8P d, FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X4_START_1x8P 1
+ push edi
+ push esi
+ push ebx
+ mov edi, [esp+16]
+ mov eax, [esp+20]
+ mov ebx, [esp+24]
+ mov ecx, [esp+28]
+ mov edx, [esp+32]
+ mov esi, [esp+36]
+ mov%1 mm7, [edi]
+ mov%1 mm0, [eax]
+ mov%1 mm1, [ebx]
+ mov%1 mm2, [ecx]
+ mov%1 mm3, [edx]
+ psadbw mm0, mm7
+ psadbw mm1, mm7
+ psadbw mm2, mm7
+ psadbw mm3, mm7
+%endmacro
+
+%macro SAD_X4_1x8P 2
+ movq mm7, [edi+%1]
+ movq mm4, [eax+%2]
+ movq mm5, [ebx+%2]
+ movq mm6, [ecx+%2]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ psadbw mm6, mm7
+ psadbw mm7, [edx+%2]
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+ paddw mm3, mm7
+%endmacro
+
+%macro SAD_X4_1x4P 2
+ movd mm7, [edi+%1]
+ movd mm4, [eax+%2]
+ movd mm5, [ebx+%2]
+ movd mm6, [ecx+%2]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ paddw mm0, mm4
+ psadbw mm6, mm7
+ movd mm4, [edx+%2]
+ paddw mm1, mm5
+ psadbw mm4, mm7
+ paddw mm2, mm6
+ paddw mm3, mm4
+%endmacro
+
+%macro SAD_X4_2x16P 1
+%if %1
+ SAD_X4_START_1x8P q
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P 8, 8
+ SAD_X4_1x8P FENC_STRIDE, esi
+ SAD_X4_1x8P FENC_STRIDE+8, esi+8
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ebx, [ebx+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X4_2x8P 1
+%if %1
+ SAD_X4_START_1x8P q
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ebx, [ebx+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X4_2x4P 1
+%if %1
+ SAD_X4_START_1x8P d
+%else
+ SAD_X4_1x4P 0, 0
+%endif
+ SAD_X4_1x4P FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ebx, [ebx+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X3_END 0
+ mov eax, [esp+32]
+ movd [eax+0], mm0
+ movd [eax+4], mm1
+ movd [eax+8], mm2
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+%macro SAD_X4_END 0
+ mov eax, [esp+40]
+ movd [eax+0], mm0
+ movd [eax+4], mm1
+ movd [eax+8], mm2
+ movd [eax+12], mm3
+ pop ebx
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+; ssd
+
%macro SSD_INC_1x16P 0
movq mm1, [eax]
movq mm2, [ecx]
SSD_INC_1x4P
%endmacro
+; satd
+
%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
movd %1, %3
movd %2, %4
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
+cglobal x264_pixel_sad_x3_16x16_mmxext
+cglobal x264_pixel_sad_x3_16x8_mmxext
+cglobal x264_pixel_sad_x3_8x16_mmxext
+cglobal x264_pixel_sad_x3_8x8_mmxext
+cglobal x264_pixel_sad_x3_8x4_mmxext
+cglobal x264_pixel_sad_x3_4x8_mmxext
+cglobal x264_pixel_sad_x3_4x4_mmxext
+
+cglobal x264_pixel_sad_x4_16x16_mmxext
+cglobal x264_pixel_sad_x4_16x8_mmxext
+cglobal x264_pixel_sad_x4_8x16_mmxext
+cglobal x264_pixel_sad_x4_8x8_mmxext
+cglobal x264_pixel_sad_x4_8x4_mmxext
+cglobal x264_pixel_sad_x4_4x8_mmxext
+cglobal x264_pixel_sad_x4_4x4_mmxext
+
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
SAD_END
+;-----------------------------------------------------------------------------
+; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
+%macro SAD_X 3
+ALIGN 16
+x264_pixel_sad_x%1_%2x%3_mmxext:
+ SAD_X%1_2x%2P 1
+%rep %3/2-1
+ SAD_X%1_2x%2P 0
+%endrep
+ SAD_X%1_END
+%endmacro
+
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+
%macro PDE_CHECK 0
movd ebx, mm0
cmp ebx, [esp+24] ; prev_score
cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
+cglobal x264_pixel_sad_x3_16x16_sse2
+cglobal x264_pixel_sad_x3_16x8_sse2
+cglobal x264_pixel_sad_x4_16x16_sse2
+cglobal x264_pixel_sad_x4_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
SAD_INC_4x16P_SSE2
SAD_END_SSE2
+
+%macro SAD_X3_START_1x16P 0
+ push edi
+ push esi
+ mov edi, [esp+12]
+ mov eax, [esp+16]
+ mov ecx, [esp+20]
+ mov edx, [esp+24]
+ mov esi, [esp+28]
+ movdqa xmm3, [edi]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [ecx]
+ movdqu xmm2, [edx]
+ psadbw xmm0, xmm3
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm3
+%endmacro
+
+%macro SAD_X3_1x16P 2
+ movdqa xmm3, [edi+%1]
+ movdqu xmm4, [eax+%2]
+ movdqu xmm5, [ecx+%2]
+ movdqu xmm6, [edx+%2]
+ psadbw xmm4, xmm3
+ psadbw xmm5, xmm3
+ psadbw xmm6, xmm3
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+%endmacro
+
+%macro SAD_X3_2x16P 1
+%if %1
+ SAD_X3_START_1x16P
+%else
+ SAD_X3_1x16P 0, 0
+%endif
+ SAD_X3_1x16P FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
+%macro SAD_X4_START_1x16P 0
+ push edi
+ push esi
+ push ebx
+ mov edi, [esp+16]
+ mov eax, [esp+20]
+ mov ebx, [esp+24]
+ mov ecx, [esp+28]
+ mov edx, [esp+32]
+ mov esi, [esp+36]
+ movdqa xmm7, [edi]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [ebx]
+ movdqu xmm2, [ecx]
+ movdqu xmm3, [edx]
+ psadbw xmm0, xmm7
+ psadbw xmm1, xmm7
+ psadbw xmm2, xmm7
+ psadbw xmm3, xmm7
+%endmacro
+
+%macro SAD_X4_1x16P 2
+ movdqa xmm7, [edi+%1]
+ movdqu xmm4, [eax+%2]
+ movdqu xmm5, [ebx+%2]
+ movdqu xmm6, [ecx+%2]
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ paddw xmm0, xmm4
+ psadbw xmm6, xmm7
+ movdqu xmm4, [edx+%2]
+ paddw xmm1, xmm5
+ psadbw xmm4, xmm7
+ paddw xmm2, xmm6
+ paddw xmm3, xmm4
+%endmacro
+
+%macro SAD_X4_2x16P 1
+%if %1
+ SAD_X4_START_1x16P
+%else
+ SAD_X4_1x16P 0, 0
+%endif
+ SAD_X4_1x16P FENC_STRIDE, esi
+ add edi, 2*FENC_STRIDE
+ lea eax, [eax+2*esi]
+ lea ebx, [ebx+2*esi]
+ lea ecx, [ecx+2*esi]
+ lea edx, [edx+2*esi]
+%endmacro
+
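+; psadbw on xmm registers leaves two partial sums, one per 64-bit half; the
+; END macros fold the high qword onto the low one (pshufd + paddw) before
+; storing each score with movd.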
+%macro SAD_X3_END 0
+ mov eax, [esp+32]
+ pshufd xmm4, xmm0, 2
+ pshufd xmm5, xmm1, 2
+ pshufd xmm6, xmm2, 2
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ movd [eax+0], xmm0
+ movd [eax+4], xmm1
+ movd [eax+8], xmm2
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+%macro SAD_X4_END 0
+ mov eax, [esp+40]
+ pshufd xmm4, xmm0, 2
+ pshufd xmm5, xmm1, 2
+ pshufd xmm6, xmm2, 2
+ pshufd xmm7, xmm3, 2
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ paddw xmm3, xmm7
+ movd [eax+0], xmm0
+ movd [eax+4], xmm1
+ movd [eax+8], xmm2
+ movd [eax+12], xmm3
+ pop ebx
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
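+; Only the 16-wide sizes get SSE2 versions: fenc rows are FENC_STRIDE (16)
+; bytes apart and 16-byte aligned, so movdqa is used for them, while the
+; reference rows are loaded with movdqu.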
+%macro SAD_X 3
+ALIGN 16
+x264_pixel_sad_x%1_%2x%3_sse2:
+ SAD_X%1_2x%2P 1
+%rep %3/2-1
+ SAD_X%1_2x%2P 0
+%endrep
+ SAD_X%1_END
+%endmacro
+
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+
+
%macro SSD_INC_2x16P_SSE2 0
movdqu xmm1, [eax]
movdqu xmm2, [ecx]
int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+
int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
+void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
+
int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int );
}
-PIXEL_SAD_C( pixel_sad_16x16, 16, 16 )
-PIXEL_SAD_C( pixel_sad_16x8, 16, 8 )
-PIXEL_SAD_C( pixel_sad_8x16, 8, 16 )
-PIXEL_SAD_C( pixel_sad_8x8, 8, 8 )
-PIXEL_SAD_C( pixel_sad_8x4, 8, 4 )
-PIXEL_SAD_C( pixel_sad_4x8, 4, 8 )
-PIXEL_SAD_C( pixel_sad_4x4, 4, 4 )
+PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
+PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 )
+PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 )
+PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 )
+PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 )
+PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 )
+PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 )
/****************************************************************************
return i_sum; \
}
-PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 )
-PIXEL_SSD_C( pixel_ssd_16x8, 16, 8 )
-PIXEL_SSD_C( pixel_ssd_8x16, 8, 16 )
-PIXEL_SSD_C( pixel_ssd_8x8, 8, 8 )
-PIXEL_SSD_C( pixel_ssd_8x4, 8, 4 )
-PIXEL_SSD_C( pixel_ssd_4x8, 4, 8 )
-PIXEL_SSD_C( pixel_ssd_4x4, 4, 4 )
+PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
+PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 )
+PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 )
+PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 )
+PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 )
+PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 )
+PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 )
int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
{ \
return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
}
-PIXEL_SATD_C( pixel_satd_16x16, 16, 16 )
-PIXEL_SATD_C( pixel_satd_16x8, 16, 8 )
-PIXEL_SATD_C( pixel_satd_8x16, 8, 16 )
-PIXEL_SATD_C( pixel_satd_8x8, 8, 8 )
-PIXEL_SATD_C( pixel_satd_8x4, 8, 4 )
-PIXEL_SATD_C( pixel_satd_4x8, 4, 8 )
-PIXEL_SATD_C( pixel_satd_4x4, 4, 4 )
+PIXEL_SATD_C( x264_pixel_satd_16x16, 16, 16 )
+PIXEL_SATD_C( x264_pixel_satd_16x8, 16, 8 )
+PIXEL_SATD_C( x264_pixel_satd_8x16, 8, 16 )
+PIXEL_SATD_C( x264_pixel_satd_8x8, 8, 8 )
+PIXEL_SATD_C( x264_pixel_satd_8x4, 8, 4 )
+PIXEL_SATD_C( x264_pixel_satd_4x8, 4, 8 )
+PIXEL_SATD_C( x264_pixel_satd_4x4, 4, 4 )
/****************************************************************************
}
#define PIXEL_SA8D_C( width, height ) \
-static int pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2 ) \
+static int x264_pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
+ uint8_t *pix2, int i_stride_pix2 ) \
{ \
return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \
}
PIXEL_SA8D_C( 8, 16 )
PIXEL_SA8D_C( 8, 8 )
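+/* Plain-C fallback for sad_x3/sad_x4: just 3 or 4 independent SAD calls
+ * against the same encoded block (fenc always uses FENC_STRIDE). */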
+#define SAD_X( size ) \
+static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+{\
+ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
+}\
+static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+{\
+ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
+ scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
+}
+
+SAD_X( 16x16 )
+SAD_X( 16x8 )
+SAD_X( 8x16 )
+SAD_X( 8x8 )
+SAD_X( 8x4 )
+SAD_X( 4x8 )
+SAD_X( 4x4 )
+
+#ifdef ARCH_UltraSparc
+SAD_X( 16x16_vis )
+SAD_X( 16x8_vis )
+SAD_X( 8x16_vis )
+SAD_X( 8x8_vis )
+#endif
/****************************************************************************
* x264_pixel_init:
****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
- pixf->sad[PIXEL_16x16] = pixel_sad_16x16;
- pixf->sad[PIXEL_16x8] = pixel_sad_16x8;
- pixf->sad[PIXEL_8x16] = pixel_sad_8x16;
- pixf->sad[PIXEL_8x8] = pixel_sad_8x8;
- pixf->sad[PIXEL_8x4] = pixel_sad_8x4;
- pixf->sad[PIXEL_4x8] = pixel_sad_4x8;
- pixf->sad[PIXEL_4x4] = pixel_sad_4x4;
-
- pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16;
- pixf->ssd[PIXEL_16x8] = pixel_ssd_16x8;
- pixf->ssd[PIXEL_8x16] = pixel_ssd_8x16;
- pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8;
- pixf->ssd[PIXEL_8x4] = pixel_ssd_8x4;
- pixf->ssd[PIXEL_4x8] = pixel_ssd_4x8;
- pixf->ssd[PIXEL_4x4] = pixel_ssd_4x4;
-
- pixf->satd[PIXEL_16x16]= pixel_satd_16x16;
- pixf->satd[PIXEL_16x8] = pixel_satd_16x8;
- pixf->satd[PIXEL_8x16] = pixel_satd_8x16;
- pixf->satd[PIXEL_8x8] = pixel_satd_8x8;
- pixf->satd[PIXEL_8x4] = pixel_satd_8x4;
- pixf->satd[PIXEL_4x8] = pixel_satd_4x8;
- pixf->satd[PIXEL_4x4] = pixel_satd_4x4;
-
- pixf->sa8d[PIXEL_16x16]= pixel_sa8d_16x16;
- pixf->sa8d[PIXEL_16x8] = pixel_sa8d_16x8;
- pixf->sa8d[PIXEL_8x16] = pixel_sa8d_8x16;
- pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8;
+ memset( pixf, 0, sizeof(*pixf) );
+
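+/* INIT fills all seven block sizes of one metric with either the C
+ * functions (empty cpu suffix) or a given asm flavour. */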
+#define INIT( name, cpu ) \
+ pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
+ pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu;\
+ pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\
+ pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu;\
+ pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu;\
+ pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
+ pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+
+ INIT( sad, );
+ INIT( sad_x3, );
+ INIT( sad_x4, );
+ INIT( ssd, );
+ INIT( satd, );
+
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16;
+ pixf->sa8d[PIXEL_16x8] = x264_pixel_sa8d_16x8;
+ pixf->sa8d[PIXEL_8x16] = x264_pixel_sa8d_8x16;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMX )
{
- pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmx;
- pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmx;
- pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmx;
- pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmx;
- pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmx;
- pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmx;
- pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmx;
+ INIT( ssd, _mmx );
}
if( cpu&X264_CPU_MMXEXT )
{
- pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
- pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext;
- pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext;
- pixf->sad[PIXEL_8x8 ] = x264_pixel_sad_8x8_mmxext;
- pixf->sad[PIXEL_8x4 ] = x264_pixel_sad_8x4_mmxext;
- pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext;
- pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext;
+ INIT( sad, _mmxext );
+ INIT( sad_x3, _mmxext );
+ INIT( sad_x4, _mmxext );
+ INIT( satd, _mmxext );
pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
-
- pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext;
- pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext;
- pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext;
- pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_mmxext;
- pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_mmxext;
- pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_mmxext;
- pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_mmxext;
}
#endif
pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2;
pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
+
+#ifndef ARCH_X86_64
+ pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2;
+ pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2;
+
+ pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_sse2;
+ pixf->sad_x4[PIXEL_16x8 ] = x264_pixel_sad_x4_16x8_sse2;
+#endif
}
// these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_vis;
pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_vis;
pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis;
+
+ pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_vis;
+ pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_vis;
+ pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_vis;
+ pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_vis;
+
+ pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_vis;
+ pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_vis;
+ pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis;
+ pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis;
#endif
}
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
typedef int (*x264_pixel_cmp_pde_t) ( uint8_t *, int, uint8_t *, int, int );
+typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
+typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
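+/* A sad_x3/sad_x4 call is equivalent to 3 or 4 plain sad calls on the same
+ * fenc block (stride FENC_STRIDE) against candidates that share one stride.
+ * Sketch of a call (names are illustrative only):
+ *   int scores[3];
+ *   pixf->sad_x3[PIXEL_16x16]( fenc, ref+mv0, ref+mv1, ref+mv2, i_stride, scores );
+ */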
enum
{
* terminate early if partial score is worse than a threshold.
* may be NULL, in which case just use sad instead. */
x264_pixel_cmp_pde_t sad_pde[7];
+
+ /* multiple parallel calls to sad: compare one fenc block against 3 or 4
+  * candidate blocks that share a stride, writing the SADs to scores[]. */
+ x264_pixel_cmp_x3_t sad_x3[7];
+ x264_pixel_cmp_x4_t sad_x4[7];
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
return i_satd / 2;
}
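+/* scalar x3/x4 wrappers around the AltiVec sad kernels, same shape as the
+ * C fallback in pixel.c */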
+#define SAD_X( size ) \
+static void pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+{\
+ scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
+}\
+static void pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+{\
+ scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
+ scores[3] = pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
+}
+
+SAD_X( 16x16_altivec )
+SAD_X( 16x8_altivec )
+SAD_X( 8x16_altivec )
+SAD_X( 8x8_altivec )
+
/****************************************************************************
* x264_pixel_init:
****************************************************************************/
pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec;
pixf->sad[PIXEL_8x8] = pixel_sad_8x8_altivec;
+ pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
+ pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_altivec;
+ pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_altivec;
+ pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_altivec;
+
+ pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
+ pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_altivec;
+ pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_altivec;
+ pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_altivec;
+
pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
pixf->satd[PIXEL_8x16] = pixel_satd_8x16_altivec;
pixf->satd[PIXEL_16x8] = pixel_satd_16x8_altivec;
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
-#define COST_MV_INT( mx, my, bd, d ) \
+#define BITS_MVD( mx, my )\
+ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
+
+#define COST_MV( mx, my ) \
{ \
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, \
&p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \
- + p_cost_mvx[ (mx)<<2 ] \
- + p_cost_mvy[ (my)<<2 ]; \
+ + BITS_MVD(mx,my); \
if( cost < bcost ) \
{ \
bcost = cost; \
bmx = mx; \
bmy = my; \
- if( bd ) \
- dir = d; \
} \
}
-#define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 )
-#define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d )
#define COST_MV_PDE( mx, my ) \
{ \
int cost = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], FENC_STRIDE, \
&p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0], \
bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ); \
- if( cost < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ) \
- { \
- bcost = cost + p_cost_mvx[ (mx)<<2 ] + p_cost_mvy[ (my)<<2 ]; \
- bmx = mx; \
- bmy = my; \
+ if( cost < bcost - BITS_MVD(mx,my) ) \
+ { \
+ bcost = cost + BITS_MVD(mx,my); \
+ bmx = mx; \
+ bmy = my; \
} \
}
-#define DIA1_ITER( mx, my )\
- {\
- omx = mx; omy = my;\
- COST_MV( omx , omy-1 );/* 1 */\
- COST_MV( omx , omy+1 );/* 101 */\
- COST_MV( omx-1, omy );/* 1 */\
- COST_MV( omx+1, omy );\
- }
+#define COPY2_IF_LT(x,y,a,b)\
+if((y)<(x))\
+{\
+ (x)=(y);\
+ (a)=(b);\
+}
-#define DIA2 \
- {\
- COST_MV( omx , omy-2 );\
- COST_MV( omx-1, omy-1 );/* 1 */\
- COST_MV( omx+1, omy-1 );/* 1 1 */\
- COST_MV( omx-2, omy );/* 1 0 1 */\
- COST_MV( omx+2, omy );/* 1 1 */\
- COST_MV( omx-1, omy+1 );/* 1 */\
- COST_MV( omx+1, omy+1 );\
- COST_MV( omx , omy+2 );\
- }\
-
-#define OCT2 \
- {\
- COST_MV( omx-1, omy-2 );\
- COST_MV( omx+1, omy-2 );/* 1 1 */\
- COST_MV( omx-2, omy-1 );/* 1 1 */\
- COST_MV( omx+2, omy-1 );/* 0 */\
- COST_MV( omx-2, omy+1 );/* 1 1 */\
- COST_MV( omx+2, omy+1 );/* 1 1 */\
- COST_MV( omx-1, omy+2 );\
- COST_MV( omx+1, omy+2 );\
- }
+#define COPY3_IF_LT(x,y,a,b,c,d)\
+if((y)<(x))\
+{\
+ (x)=(y);\
+ (a)=(b);\
+ (c)=(d);\
+}
+
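+/* Evaluate 3 candidate MVs at the given offsets from the current best
+ * (bmx,bmy) with a single sad_x3 call and add each MV's rate cost;
+ * the caller decides which, if any, becomes the new best. */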
+#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
+{\
+ uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
+ h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
+ pix_base + (m0x) + (m0y)*m->i_stride[0],\
+ pix_base + (m1x) + (m1y)*m->i_stride[0],\
+ pix_base + (m2x) + (m2y)*m->i_stride[0],\
+ m->i_stride[0], costs );\
+ (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
+ (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
+ (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
+}
+
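+/* Evaluate 4 candidate MVs at the given offsets from (omx,omy) with a single
+ * sad_x4 call, add the MV rate costs, and keep any candidate that beats the
+ * current best (bcost, bmx, bmy). */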
+#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
+{\
+ uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
+ h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+ pix_base + (m0x) + (m0y)*m->i_stride[0],\
+ pix_base + (m1x) + (m1y)*m->i_stride[0],\
+ pix_base + (m2x) + (m2y)*m->i_stride[0],\
+ pix_base + (m3x) + (m3y)*m->i_stride[0],\
+ m->i_stride[0], costs );\
+ costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
+ costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
+ costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
+ costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
+ COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
+ COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
+ COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
+ COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
+}
+
+/* 1 */
+/* 101 */
+/* 1 */
+#define DIA1_ITER( mx, my )\
+{\
+ omx = mx; omy = my;\
+ COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
+}
#define CROSS( start, x_max, y_max ) \
{ \
uint8_t *p_fref = m->p_fref[0];
int i, j;
int dir;
+ int costs[6];
int mv_x_min = h->mb.mv_min_fpel[0];
int mv_y_min = h->mb.mv_min_fpel[1];
bcost = COST_MAX;
COST_MV( pmx, pmy );
/* I don't know why this helps */
- bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ];
+ bcost -= BITS_MVD(bmx,bmy);
/* try extra predictors if provided */
for( i = 0; i < i_mvc; i++ )
if( mx != bmx || my != bmy )
COST_MV( mx, my );
}
-
+
COST_MV( 0, 0 );
mv_x_max += 8;
}
#else
/* equivalent to the above, but eliminates duplicate candidates */
- dir = -1;
- omx = bmx; omy = bmy;
- COST_MV_DIR( omx-2, omy, 0 );
- COST_MV_DIR( omx-1, omy+2, 1 );
- COST_MV_DIR( omx+1, omy+2, 2 );
- COST_MV_DIR( omx+2, omy, 3 );
- COST_MV_DIR( omx+1, omy-2, 4 );
- COST_MV_DIR( omx-1, omy-2, 5 );
- if( dir != -1 )
+ dir = -2;
+
+ /* hexagon */
+ COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
+ COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 );
+ COPY2_IF_LT( bcost, costs[0], dir, 0 );
+ COPY2_IF_LT( bcost, costs[1], dir, 1 );
+ COPY2_IF_LT( bcost, costs[2], dir, 2 );
+ COPY2_IF_LT( bcost, costs[3], dir, 3 );
+ COPY2_IF_LT( bcost, costs[4], dir, 4 );
+ COPY2_IF_LT( bcost, costs[5], dir, 5 );
+
+ if( dir != -2 )
{
+ static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
+ bmx += hex2[dir+1][0];
+ bmy += hex2[dir+1][1];
+ /* half hexagon, not overlapping the previous iteration */
for( i = 1; i < i_me_range/2; i++ )
{
- static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
static const int mod6[8] = {5,0,1,2,3,4,5,0};
const int odir = mod6[dir+1];
- omx = bmx; omy = bmy;
- COST_MV_DIR( omx + hex2[odir+0][0], omy + hex2[odir+0][1], odir-1 );
- COST_MV_DIR( omx + hex2[odir+1][0], omy + hex2[odir+1][1], odir );
- COST_MV_DIR( omx + hex2[odir+2][0], omy + hex2[odir+2][1], odir+1 );
- if( bmx == omx && bmy == omy )
+ COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1],
+ hex2[odir+1][0], hex2[odir+1][1],
+ hex2[odir+2][0], hex2[odir+2][1],
+ costs );
+ dir = -2;
+ COPY2_IF_LT( bcost, costs[0], dir, odir-1 );
+ COPY2_IF_LT( bcost, costs[1], dir, odir );
+ COPY2_IF_LT( bcost, costs[2], dir, odir+1 );
+ if( dir == -2 )
break;
+ bmx += hex2[dir+1][0];
+ bmy += hex2[dir+1][1];
}
}
#endif
/* square refine */
- DIA1_ITER( bmx, bmy );
- COST_MV( omx-1, omy-1 );
- COST_MV( omx-1, omy+1 );
- COST_MV( omx+1, omy-1 );
- COST_MV( omx+1, omy+1 );
+ omx = bmx; omy = bmy;
+ COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );
+ COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 );
break;
case X264_ME_UMH:
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
if( bcost == ucost2 && SAD_THRESH(2000) )
{
- DIA2;
+ COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
+ COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 );
if( bcost == ucost1 && SAD_THRESH(500) )
break;
if( bcost == ucost2 )
{
int range = (i_me_range>>1) | 1;
CROSS( 3, range, range );
- OCT2;
+ COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
+ COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
if( bcost == ucost2 )
break;
cross_start = range + 2;
}
/* adaptive search range */
- if( i_mvc )
+ if( i_mvc )
{
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
/* 5x5 ESA */
omx = bmx; omy = bmy;
- for( i = (bcost == ucost2) ? 4 : 0; i < 24; i++ )
- {
- static const int square2[24][2] = {
- { 1, 0}, { 0, 1}, {-1, 0}, { 0,-1},
- { 1, 1}, {-1, 1}, {-1,-1}, { 1,-1},
- { 2,-1}, { 2, 0}, { 2, 1}, { 2, 2},
- { 1, 2}, { 0, 2}, {-1, 2}, {-2, 2},
- {-2, 1}, {-2, 0}, {-2,-1}, {-2,-2},
- {-1,-2}, { 0,-2}, { 1,-2}, { 2,-2}
- };
- COST_MV( omx + square2[i][0], omy + square2[i][1] );
- }
+ if( bcost != ucost2 )
+ COST_MV_X4( 1, 0, 0, 1, -1, 0, 0,-1 );
+ COST_MV_X4( 1, 1, -1, 1, -1,-1, 1,-1 );
+ COST_MV_X4( 2,-1, 2, 0, 2, 1, 2, 2 );
+ COST_MV_X4( 1, 2, 0, 2, -1, 2, -2, 2 );
+ COST_MV_X4( -2, 1, -2, 0, -2,-1, -2,-2 );
+ COST_MV_X4( -1,-2, 0,-2, 1,-2, 2,-2 );
/* hexagon grid */
omx = bmx; omy = bmy;
{ 2, 3}, { 0, 4}, {-2, 3},
{-2,-3}, { 0,-4}, { 2,-3},
};
- const int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
- if( h->pixf.sad_pde[i_pixel] )
+ if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
+ mv_y_max-omy, omy-mv_y_min ) )
{
for( j = 0; j < 16; j++ )
{
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
- if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
- && my >= mv_y_min && my <= mv_y_max ) )
- COST_MV_PDE( mx, my );
+ if( mx >= mv_x_min && mx <= mv_x_max
+ && my >= mv_y_min && my <= mv_y_max )
+ COST_MV( mx, my );
}
}
else
{
- for( j = 0; j < 16; j++ )
- {
- int mx = omx + hex4[j][0]*i;
- int my = omy + hex4[j][1]*i;
- if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
- && my >= mv_y_min && my <= mv_y_max ) )
- COST_MV( mx, my );
- }
+ COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i );
+ COST_MV_X4( -4*i,-2*i, 4*i,-2*i, 4*i,-1*i, 4*i, 0*i );
+ COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i );
+ COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i );
}
}
goto me_hex2;
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
- const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
+ const int bsad = bcost - BITS_MVD(mx,my);
if( abs( ref_dc - enc_dc ) < bsad )
COST_MV_PDE( mx, my );
}
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
- const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
+ const int bsad = bcost - BITS_MVD(mx,my);
if( abs( ref_dc - enc_dc ) < bsad )
COST_MV( mx, my );
}
TEST_PIXEL( ssd );
TEST_PIXEL( satd );
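+/* compare each asm sad_x3/sad_x4 against N independent C sad calls on the
+   same encoded block and the same candidate offsets */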
+#define TEST_PIXEL_X( N ) \
+ for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
+ { \
+ int res_c[4]={0}, res_asm[4]={0}; \
+ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
+ { \
+ used_asm = 1; \
+ res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 24 ); \
+ res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 24 ); \
+ res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 24 ); \
+ if(N==4) \
+ { \
+ res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 24 ); \
+ pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 24, res_asm ); \
+ } \
+ else \
+ pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 24, res_asm ); \
+ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
+ i, res_c[0], res_c[1], res_c[2], res_c[3], \
+ res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
+ } \
+ } \
+ } \
+ report( "pixel sad_x"#N" :" );
+
+ TEST_PIXEL_X(3);
+ TEST_PIXEL_X(4);
return ret;
}