From: Loren Merritt Date: Mon, 10 Apr 2006 03:03:13 +0000 (+0000) Subject: interleave multiple calls to SAD. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8947b51f35151f821c3718b01c1e93d517d814b5;p=libx264 interleave multiple calls to SAD. 15% faster fullpel motion estimation. git-svn-id: svn://svn.videolan.org/x264/trunk@490 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm index d107a37f..9edb94c4 100644 --- a/common/amd64/pixel-a.asm +++ b/common/amd64/pixel-a.asm @@ -29,6 +29,8 @@ BITS 64 %include "amd64inc.asm" +; sad + %macro SAD_INC_2x16P 0 movq mm1, [parm1q] movq mm2, [parm1q+8] @@ -72,6 +74,177 @@ BITS 64 lea parm3q, [parm3q+2*parm4q] %endmacro +; sad x3 / x4 + +%macro SAD_X3_START_1x8P 1 + mov%1 mm3, [parm1q] + mov%1 mm0, [parm2q] + mov%1 mm1, [parm3q] + mov%1 mm2, [parm4q] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 3 + mov%1 mm3, [parm1q+%2] + mov%1 mm4, [parm2q+%3] + mov%1 mm5, [parm3q+%3] + mov%1 mm6, [parm4q+%3] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, 8, 8 + SAD_X3_1x8P q, FENC_STRIDE, parm5q + SAD_X3_1x8P q, FENC_STRIDE+8, parm5q+8 + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, FENC_STRIDE, parm5q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_1x8P d +%else + SAD_X3_1x8P d, 0, 0 +%endif + SAD_X3_1x8P d, FENC_STRIDE, parm5q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X4_START_1x8P 1 + mov%1 mm7, [parm1q] + mov%1 mm0, [parm2q] + mov%1 mm1, [parm3q] + mov%1 mm2, [parm4q] + mov%1 mm3, [parm5q] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_1x8P 2 + movq mm7, [parm1q+%1] + movq mm4, [parm2q+%2] + movq mm5, [parm3q+%2] + movq mm6, [parm4q+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [parm5q+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_1x4P 2 + movd mm7, [parm1q+%1] + movd mm4, [parm2q+%2] + movd mm5, [parm3q+%2] + movd mm6, [parm4q+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + psadbw mm6, mm7 + movd mm4, [parm5q+%2] + paddw mm1, mm5 + psadbw mm4, mm7 + paddw mm2, mm6 + paddw mm3, mm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, parm6q + SAD_X4_1x8P FENC_STRIDE+8, parm6q+8 + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, parm6q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_1x8P d +%else + SAD_X4_1x4P 0, 0 +%endif + 
SAD_X4_1x4P FENC_STRIDE, parm6q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X3_END 0 + movd [parm6q+0], mm0 + movd [parm6q+4], mm1 + movd [parm6q+8], mm2 + ret +%endmacro + +%macro SAD_X4_END 0 + mov rax, parm7q + movd [rax+0], mm0 + movd [rax+4], mm1 + movd [rax+8], mm2 + movd [rax+12], mm3 + ret +%endmacro + +; ssd + %macro SSD_INC_1x16P 0 movq mm1, [rax] movq mm2, [rcx] @@ -168,6 +341,8 @@ BITS 64 SSD_INC_1x4P %endmacro +; satd + %macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2] movd %1, %3 movd %2, %4 @@ -262,6 +437,22 @@ cglobal x264_pixel_sad_8x4_mmxext cglobal x264_pixel_sad_4x8_mmxext cglobal x264_pixel_sad_4x4_mmxext +cglobal x264_pixel_sad_x3_16x16_mmxext +cglobal x264_pixel_sad_x3_16x8_mmxext +cglobal x264_pixel_sad_x3_8x16_mmxext +cglobal x264_pixel_sad_x3_8x8_mmxext +cglobal x264_pixel_sad_x3_8x4_mmxext +cglobal x264_pixel_sad_x3_4x8_mmxext +cglobal x264_pixel_sad_x3_4x4_mmxext + +cglobal x264_pixel_sad_x4_16x16_mmxext +cglobal x264_pixel_sad_x4_16x8_mmxext +cglobal x264_pixel_sad_x4_8x16_mmxext +cglobal x264_pixel_sad_x4_8x8_mmxext +cglobal x264_pixel_sad_x4_8x4_mmxext +cglobal x264_pixel_sad_x4_4x8_mmxext +cglobal x264_pixel_sad_x4_4x4_mmxext + cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext @@ -380,6 +571,35 @@ x264_pixel_sad_4x4_mmxext: SAD_END +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_mmxext: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + %macro PDE_CHECK 0 movd eax, mm0 diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index 8b635470..019cc3e6 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -29,6 +29,8 @@ BITS 32 %include "i386inc.asm" +; sad + %macro SAD_INC_2x16P 0 movq mm1, [eax] movq mm2, [eax+8] @@ -72,6 +74,199 @@ BITS 32 lea ecx, [ecx+2*edx] %endmacro +; sad x3 / x4 + +%macro SAD_X3_START_1x8P 1 + push edi + push esi + mov edi, [esp+12] + mov eax, [esp+16] + mov ecx, [esp+20] + mov edx, [esp+24] + mov esi, [esp+28] + mov%1 mm3, [edi] + mov%1 mm0, [eax] + mov%1 mm1, [ecx] + mov%1 mm2, [edx] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 3 + mov%1 mm3, [edi+%2] + mov%1 mm4, [eax+%3] + mov%1 mm5, [ecx+%3] + mov%1 mm6, [edx+%3] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, 8, 8 + SAD_X3_1x8P q, FENC_STRIDE, esi + SAD_X3_1x8P q, FENC_STRIDE+8, esi+8 + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] 
+%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_1x8P d +%else + SAD_X3_1x8P d, 0, 0 +%endif + SAD_X3_1x8P d, FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_START_1x8P 1 + push edi + push esi + push ebx + mov edi, [esp+16] + mov eax, [esp+20] + mov ebx, [esp+24] + mov ecx, [esp+28] + mov edx, [esp+32] + mov esi, [esp+36] + mov%1 mm7, [edi] + mov%1 mm0, [eax] + mov%1 mm1, [ebx] + mov%1 mm2, [ecx] + mov%1 mm3, [edx] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_1x8P 2 + movq mm7, [edi+%1] + movq mm4, [eax+%2] + movq mm5, [ebx+%2] + movq mm6, [ecx+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [edx+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_1x4P 2 + movd mm7, [edi+%1] + movd mm4, [eax+%2] + movd mm5, [ebx+%2] + movd mm6, [ecx+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + psadbw mm6, mm7 + movd mm4, [edx+%2] + paddw mm1, mm5 + psadbw mm4, mm7 + paddw mm2, mm6 + paddw mm3, mm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, esi + SAD_X4_1x8P FENC_STRIDE+8, esi+8 + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_1x8P d +%else + SAD_X4_1x4P 0, 0 +%endif + SAD_X4_1x4P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_END 0 + mov eax, [esp+32] + movd [eax+0], mm0 + movd [eax+4], mm1 + movd [eax+8], mm2 + pop esi + pop edi + ret +%endmacro + +%macro SAD_X4_END 0 + mov eax, [esp+40] + movd [eax+0], mm0 + movd [eax+4], mm1 + movd [eax+8], mm2 + movd [eax+12], mm3 + pop ebx + pop esi + pop edi + ret +%endmacro + +; ssd + %macro SSD_INC_1x16P 0 movq mm1, [eax] movq mm2, [ecx] @@ -168,6 +363,8 @@ BITS 32 SSD_INC_1x4P %endmacro +; satd + %macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2] movd %1, %3 movd %2, %4 @@ -262,6 +459,22 @@ cglobal x264_pixel_sad_8x4_mmxext cglobal x264_pixel_sad_4x8_mmxext cglobal x264_pixel_sad_4x4_mmxext +cglobal x264_pixel_sad_x3_16x16_mmxext +cglobal x264_pixel_sad_x3_16x8_mmxext +cglobal x264_pixel_sad_x3_8x16_mmxext +cglobal x264_pixel_sad_x3_8x8_mmxext +cglobal x264_pixel_sad_x3_8x4_mmxext +cglobal x264_pixel_sad_x3_4x8_mmxext +cglobal x264_pixel_sad_x3_4x4_mmxext + +cglobal x264_pixel_sad_x4_16x16_mmxext +cglobal x264_pixel_sad_x4_16x8_mmxext +cglobal x264_pixel_sad_x4_8x16_mmxext +cglobal x264_pixel_sad_x4_8x8_mmxext +cglobal x264_pixel_sad_x4_8x4_mmxext +cglobal x264_pixel_sad_x4_4x8_mmxext +cglobal x264_pixel_sad_x4_4x4_mmxext + cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext @@ -388,6 +601,36 @@ x264_pixel_sad_4x4_mmxext: SAD_END +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) 
+;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_mmxext: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + + %macro PDE_CHECK 0 movd ebx, mm0 cmp ebx, [esp+24] ; prev_score diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index e89c3895..66d5aa0c 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -38,6 +38,10 @@ SECTION .text cglobal x264_pixel_sad_16x16_sse2 cglobal x264_pixel_sad_16x8_sse2 +cglobal x264_pixel_sad_x3_16x16_sse2 +cglobal x264_pixel_sad_x3_16x8_sse2 +cglobal x264_pixel_sad_x4_16x16_sse2 +cglobal x264_pixel_sad_x4_16x8_sse2 cglobal x264_pixel_ssd_16x16_sse2 cglobal x264_pixel_ssd_16x8_sse2 cglobal x264_pixel_satd_8x4_sse2 @@ -164,6 +168,158 @@ x264_pixel_sad_16x8_sse2: SAD_INC_4x16P_SSE2 SAD_END_SSE2 + +%macro SAD_X3_START_1x16P 0 + push edi + push esi + mov edi, [esp+12] + mov eax, [esp+16] + mov ecx, [esp+20] + mov edx, [esp+24] + mov esi, [esp+28] + movdqa xmm3, [edi] + movdqu xmm0, [eax] + movdqu xmm1, [ecx] + movdqu xmm2, [edx] + psadbw xmm0, xmm3 + psadbw xmm1, xmm3 + psadbw xmm2, xmm3 +%endmacro + +%macro SAD_X3_1x16P 2 + movdqa xmm3, [edi+%1] + movdqu xmm4, [eax+%2] + movdqu xmm5, [ecx+%2] + movdqu xmm6, [edx+%2] + psadbw xmm4, xmm3 + psadbw xmm5, xmm3 + psadbw xmm6, xmm3 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x16P +%else + SAD_X3_1x16P 0, 0 +%endif + SAD_X3_1x16P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_START_1x16P 0 + push edi + push esi + push ebx + mov edi, [esp+16] + mov eax, [esp+20] + mov ebx, [esp+24] + mov ecx, [esp+28] + mov edx, [esp+32] + mov esi, [esp+36] + movdqa xmm7, [edi] + movdqu xmm0, [eax] + movdqu xmm1, [ebx] + movdqu xmm2, [ecx] + movdqu xmm3, [edx] + psadbw xmm0, xmm7 + psadbw xmm1, xmm7 + psadbw xmm2, xmm7 + psadbw xmm3, xmm7 +%endmacro + +%macro SAD_X4_1x16P 2 + movdqa xmm7, [edi+%1] + movdqu xmm4, [eax+%2] + movdqu xmm5, [ebx+%2] + movdqu xmm6, [ecx+%2] + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + paddw xmm0, xmm4 + psadbw xmm6, xmm7 + movdqu xmm4, [edx+%2] + paddw xmm1, xmm5 + psadbw xmm4, xmm7 + paddw xmm2, xmm6 + paddw xmm3, xmm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x16P +%else + SAD_X4_1x16P 0, 0 +%endif + SAD_X4_1x16P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_END 0 + mov eax, [esp+32] + pshufd xmm4, xmm0, 2 + pshufd xmm5, xmm1, 2 + pshufd xmm6, xmm2, 2 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + movd [eax+0], xmm0 + movd [eax+4], xmm1 + movd [eax+8], xmm2 + pop esi + pop edi + ret +%endmacro + +%macro SAD_X4_END 0 + mov eax, [esp+40] + pshufd xmm4, xmm0, 2 + pshufd xmm5, xmm1, 2 + pshufd xmm6, xmm2, 2 + pshufd xmm7, xmm3, 2 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + paddw xmm3, xmm7 + movd [eax+0], xmm0 + movd [eax+4], xmm1 + movd [eax+8], xmm2 + movd [eax+12], xmm3 + pop ebx + pop esi + pop edi + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void 
x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_sse2: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 + + %macro SSD_INC_2x16P_SSE2 0 movdqu xmm1, [eax] movdqu xmm2, [ecx] diff --git a/common/i386/pixel.h b/common/i386/pixel.h index df7ea616..c0f9f3e4 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -32,6 +32,21 @@ int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int ); +void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); + int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int ); @@ -55,6 +70,11 @@ int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int ); +void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); + int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int ); diff --git a/common/pixel.c b/common/pixel.c index 9eb4f933..4e3e7870 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -59,13 +59,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ } -PIXEL_SAD_C( pixel_sad_16x16, 16, 16 ) -PIXEL_SAD_C( pixel_sad_16x8, 16, 8 ) -PIXEL_SAD_C( pixel_sad_8x16, 8, 16 ) -PIXEL_SAD_C( pixel_sad_8x8, 8, 8 ) 
-PIXEL_SAD_C( pixel_sad_8x4, 8, 4 ) -PIXEL_SAD_C( pixel_sad_4x8, 4, 8 ) -PIXEL_SAD_C( pixel_sad_4x4, 4, 4 ) +PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 ) +PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) +PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) +PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) +PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) +PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) +PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) /**************************************************************************** @@ -90,13 +90,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ return i_sum; \ } -PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 ) -PIXEL_SSD_C( pixel_ssd_16x8, 16, 8 ) -PIXEL_SSD_C( pixel_ssd_8x16, 8, 16 ) -PIXEL_SSD_C( pixel_ssd_8x8, 8, 8 ) -PIXEL_SSD_C( pixel_ssd_8x4, 8, 4 ) -PIXEL_SSD_C( pixel_ssd_4x8, 4, 8 ) -PIXEL_SSD_C( pixel_ssd_4x4, 4, 4 ) +PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 ) +PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) +PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) +PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) { @@ -207,13 +207,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ { \ return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \ } -PIXEL_SATD_C( pixel_satd_16x16, 16, 16 ) -PIXEL_SATD_C( pixel_satd_16x8, 16, 8 ) -PIXEL_SATD_C( pixel_satd_8x16, 8, 16 ) -PIXEL_SATD_C( pixel_satd_8x8, 8, 8 ) -PIXEL_SATD_C( pixel_satd_8x4, 8, 4 ) -PIXEL_SATD_C( pixel_satd_4x8, 4, 8 ) -PIXEL_SATD_C( pixel_satd_4x4, 4, 4 ) +PIXEL_SATD_C( x264_pixel_satd_16x16, 16, 16 ) +PIXEL_SATD_C( x264_pixel_satd_16x8, 16, 8 ) +PIXEL_SATD_C( x264_pixel_satd_8x16, 8, 16 ) +PIXEL_SATD_C( x264_pixel_satd_8x8, 8, 8 ) +PIXEL_SATD_C( x264_pixel_satd_8x4, 8, 4 ) +PIXEL_SATD_C( x264_pixel_satd_4x8, 4, 8 ) +PIXEL_SATD_C( x264_pixel_satd_4x4, 4, 4 ) /**************************************************************************** @@ -282,8 +282,8 @@ static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int } #define PIXEL_SA8D_C( width, height ) \ -static int pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \ - uint8_t *pix2, int i_stride_pix2 ) \ +static int x264_pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \ + uint8_t *pix2, int i_stride_pix2 ) \ { \ return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \ } @@ -292,74 +292,79 @@ PIXEL_SA8D_C( 16, 8 ) PIXEL_SA8D_C( 8, 16 ) PIXEL_SA8D_C( 8, 8 ) +#define SAD_X( size ) \ +static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +{\ + scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ +}\ +static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +{\ + scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ + scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\ +} + +SAD_X( 16x16 ) +SAD_X( 16x8 ) +SAD_X( 8x16 ) 
+SAD_X( 8x8 ) +SAD_X( 8x4 ) +SAD_X( 4x8 ) +SAD_X( 4x4 ) + +#ifdef ARCH_UltraSparc +SAD_X( 16x16_vis ) +SAD_X( 16x8_vis ) +SAD_X( 8x16_vis ) +SAD_X( 8x8_vis ) +#endif /**************************************************************************** * x264_pixel_init: ****************************************************************************/ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { - pixf->sad[PIXEL_16x16] = pixel_sad_16x16; - pixf->sad[PIXEL_16x8] = pixel_sad_16x8; - pixf->sad[PIXEL_8x16] = pixel_sad_8x16; - pixf->sad[PIXEL_8x8] = pixel_sad_8x8; - pixf->sad[PIXEL_8x4] = pixel_sad_8x4; - pixf->sad[PIXEL_4x8] = pixel_sad_4x8; - pixf->sad[PIXEL_4x4] = pixel_sad_4x4; - - pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16; - pixf->ssd[PIXEL_16x8] = pixel_ssd_16x8; - pixf->ssd[PIXEL_8x16] = pixel_ssd_8x16; - pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8; - pixf->ssd[PIXEL_8x4] = pixel_ssd_8x4; - pixf->ssd[PIXEL_4x8] = pixel_ssd_4x8; - pixf->ssd[PIXEL_4x4] = pixel_ssd_4x4; - - pixf->satd[PIXEL_16x16]= pixel_satd_16x16; - pixf->satd[PIXEL_16x8] = pixel_satd_16x8; - pixf->satd[PIXEL_8x16] = pixel_satd_8x16; - pixf->satd[PIXEL_8x8] = pixel_satd_8x8; - pixf->satd[PIXEL_8x4] = pixel_satd_8x4; - pixf->satd[PIXEL_4x8] = pixel_satd_4x8; - pixf->satd[PIXEL_4x4] = pixel_satd_4x4; - - pixf->sa8d[PIXEL_16x16]= pixel_sa8d_16x16; - pixf->sa8d[PIXEL_16x8] = pixel_sa8d_16x8; - pixf->sa8d[PIXEL_8x16] = pixel_sa8d_8x16; - pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8; + memset( pixf, 0, sizeof(*pixf) ); + +#define INIT( name, cpu ) \ + pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\ + pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu;\ + pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\ + pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu;\ + pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu;\ + pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\ + pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu; + + INIT( sad, ); + INIT( sad_x3, ); + INIT( sad_x4, ); + INIT( ssd, ); + INIT( satd, ); + + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16; + pixf->sa8d[PIXEL_16x8] = x264_pixel_sa8d_16x8; + pixf->sa8d[PIXEL_8x16] = x264_pixel_sa8d_8x16; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMX ) { - pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmx; - pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmx; - pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmx; - pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmx; - pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmx; - pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmx; - pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmx; + INIT( ssd, _mmx ); } if( cpu&X264_CPU_MMXEXT ) { - pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext; - pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext; - pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext; - pixf->sad[PIXEL_8x8 ] = x264_pixel_sad_8x8_mmxext; - pixf->sad[PIXEL_8x4 ] = x264_pixel_sad_8x4_mmxext; - pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext; - pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext; + INIT( sad, _mmxext ); + INIT( sad_x3, _mmxext ); + INIT( sad_x4, _mmxext ); + INIT( satd, _mmxext ); pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext; pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext; pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext; - - pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext; - pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext; - pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext; - pixf->satd[PIXEL_8x8] = 
x264_pixel_satd_8x8_mmxext; - pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_mmxext; - pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_mmxext; - pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_mmxext; } #endif @@ -375,6 +380,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2; pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2; pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2; + +#ifndef ARCH_X86_64 + pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2; + pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2; + + pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_sse2; + pixf->sad_x4[PIXEL_16x8 ] = x264_pixel_sad_x4_16x8_sse2; +#endif } // these are faster on both Intel and AMD if( cpu&X264_CPU_SSE2 ) @@ -395,6 +408,16 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_vis; pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_vis; pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis; + + pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_vis; + pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_vis; + pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_vis; + pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_vis; + + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_vis; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_vis; + pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis; + pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis; #endif } diff --git a/common/pixel.h b/common/pixel.h index 2300bc0c..18a0746f 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -26,6 +26,8 @@ typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int ); typedef int (*x264_pixel_cmp_pde_t) ( uint8_t *, int, uint8_t *, int, int ); +typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] ); +typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] ); enum { @@ -72,6 +74,10 @@ typedef struct * terminate early if partial score is worse than a threshold. * may be NULL, in which case just use sad instead. */ x264_pixel_cmp_pde_t sad_pde[7]; + + /* multiple parallel calls to sad. 
*/ + x264_pixel_cmp_x3_t sad_x3[7]; + x264_pixel_cmp_x4_t sad_x4[7]; } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 6aaf73c4..f1bee711 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -594,6 +594,26 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, return i_satd / 2; } +#define SAD_X( size ) \ +static void pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +{\ + scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ +}\ +static void pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +{\ + scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ + scores[3] = pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\ +} + +SAD_X( 16x16_altivec ) +SAD_X( 16x8_altivec ) +SAD_X( 8x16_altivec ) +SAD_X( 8x8_altivec ) + /**************************************************************************** * x264_pixel_init: ****************************************************************************/ @@ -604,6 +624,16 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf ) pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec; pixf->sad[PIXEL_8x8] = pixel_sad_8x8_altivec; + pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec; + pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_altivec; + pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_altivec; + pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_altivec; + + pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec; + pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_altivec; + pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_altivec; + pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_altivec; + pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec; pixf->satd[PIXEL_8x16] = pixel_satd_8x16_altivec; pixf->satd[PIXEL_16x8] = pixel_satd_16x8_altivec; diff --git a/encoder/me.c b/encoder/me.c index 66bfdfc9..479949b8 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -44,69 +44,90 @@ static const int subpel_iterations[][4] = static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); -#define COST_MV_INT( mx, my, bd, d ) \ +#define BITS_MVD( mx, my )\ + (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) + +#define COST_MV( mx, my ) \ { \ int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \ - + p_cost_mvx[ (mx)<<2 ] \ - + p_cost_mvy[ (my)<<2 ]; \ + + BITS_MVD(mx,my); \ if( cost < bcost ) \ { \ bcost = cost; \ bmx = mx; \ bmy = my; \ - if( bd ) \ - dir = d; \ } \ } -#define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 ) -#define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d ) #define COST_MV_PDE( mx, my ) \ { \ int cost = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], FENC_STRIDE, \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0], \ bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ); \ - if( cost < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ) \ - { \ - bcost = cost + p_cost_mvx[ (mx)<<2 ] + p_cost_mvy[ (my)<<2 ]; \ - bmx = mx; \ - bmy = my; \ + if( cost < bcost - BITS_MVD(mx,my) ) \ + { \ + bcost = cost + 
BITS_MVD(mx,my); \ + bmx = mx; \ + bmy = my; \ } \ } -#define DIA1_ITER( mx, my )\ - {\ - omx = mx; omy = my;\ - COST_MV( omx , omy-1 );/* 1 */\ - COST_MV( omx , omy+1 );/* 101 */\ - COST_MV( omx-1, omy );/* 1 */\ - COST_MV( omx+1, omy );\ - } +#define COPY2_IF_LT(x,y,a,b)\ +if((y)<(x))\ +{\ + (x)=(y);\ + (a)=(b);\ +} -#define DIA2 \ - {\ - COST_MV( omx , omy-2 );\ - COST_MV( omx-1, omy-1 );/* 1 */\ - COST_MV( omx+1, omy-1 );/* 1 1 */\ - COST_MV( omx-2, omy );/* 1 0 1 */\ - COST_MV( omx+2, omy );/* 1 1 */\ - COST_MV( omx-1, omy+1 );/* 1 */\ - COST_MV( omx+1, omy+1 );\ - COST_MV( omx , omy+2 );\ - }\ - -#define OCT2 \ - {\ - COST_MV( omx-1, omy-2 );\ - COST_MV( omx+1, omy-2 );/* 1 1 */\ - COST_MV( omx-2, omy-1 );/* 1 1 */\ - COST_MV( omx+2, omy-1 );/* 0 */\ - COST_MV( omx-2, omy+1 );/* 1 1 */\ - COST_MV( omx+2, omy+1 );/* 1 1 */\ - COST_MV( omx-1, omy+2 );\ - COST_MV( omx+1, omy+2 );\ - } +#define COPY3_IF_LT(x,y,a,b,c,d)\ +if((y)<(x))\ +{\ + (x)=(y);\ + (a)=(b);\ + (c)=(d);\ +} + +#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ +{\ + uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\ + h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\ + pix_base + (m0x) + (m0y)*m->i_stride[0],\ + pix_base + (m1x) + (m1y)*m->i_stride[0],\ + pix_base + (m2x) + (m2y)*m->i_stride[0],\ + m->i_stride[0], costs );\ + (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ + (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ + (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ +} + +#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ +{\ + uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\ + h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\ + pix_base + (m0x) + (m0y)*m->i_stride[0],\ + pix_base + (m1x) + (m1y)*m->i_stride[0],\ + pix_base + (m2x) + (m2y)*m->i_stride[0],\ + pix_base + (m3x) + (m3y)*m->i_stride[0],\ + m->i_stride[0], costs );\ + costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\ + costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\ + costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\ + costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\ + COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\ + COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\ + COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\ + COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\ +} + +/* 1 */ +/* 101 */ +/* 1 */ +#define DIA1_ITER( mx, my )\ +{\ + omx = mx; omy = my;\ + COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\ +} #define CROSS( start, x_max, y_max ) \ { \ @@ -136,6 +157,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int uint8_t *p_fref = m->p_fref[0]; int i, j; int dir; + int costs[6]; int mv_x_min = h->mb.mv_min_fpel[0]; int mv_y_min = h->mb.mv_min_fpel[1]; @@ -157,7 +179,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int bcost = COST_MAX; COST_MV( pmx, pmy ); /* I don't know why this helps */ - bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ]; + bcost -= BITS_MVD(bmx,bmy); /* try extra predictors if provided */ for( i = 0; i < i_mvc; i++ ) @@ -167,7 +189,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int if( mx != bmx || my != bmy ) COST_MV( mx, my ); } - + COST_MV( 0, 0 ); mv_x_max += 8; @@ -205,36 +227,47 @@ me_hex2: } #else /* equivalent to the above, but eliminates duplicate candidates */ - dir = -1; - omx = bmx; omy = bmy; - COST_MV_DIR( omx-2, omy, 0 ); - COST_MV_DIR( omx-1, omy+2, 1 ); - COST_MV_DIR( omx+1, omy+2, 2 ); - COST_MV_DIR( omx+2, omy, 3 ); - COST_MV_DIR( omx+1, omy-2, 4 ); 
- COST_MV_DIR( omx-1, omy-2, 5 ); - if( dir != -1 ) + dir = -2; + + /* hexagon */ + COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs ); + COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 ); + COPY2_IF_LT( bcost, costs[0], dir, 0 ); + COPY2_IF_LT( bcost, costs[1], dir, 1 ); + COPY2_IF_LT( bcost, costs[2], dir, 2 ); + COPY2_IF_LT( bcost, costs[3], dir, 3 ); + COPY2_IF_LT( bcost, costs[4], dir, 4 ); + COPY2_IF_LT( bcost, costs[5], dir, 5 ); + + if( dir != -2 ) { + static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; + bmx += hex2[dir+1][0]; + bmy += hex2[dir+1][1]; + /* half hexagon, not overlapping the previous iteration */ for( i = 1; i < i_me_range/2; i++ ) { - static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; static const int mod6[8] = {5,0,1,2,3,4,5,0}; const int odir = mod6[dir+1]; - omx = bmx; omy = bmy; - COST_MV_DIR( omx + hex2[odir+0][0], omy + hex2[odir+0][1], odir-1 ); - COST_MV_DIR( omx + hex2[odir+1][0], omy + hex2[odir+1][1], odir ); - COST_MV_DIR( omx + hex2[odir+2][0], omy + hex2[odir+2][1], odir+1 ); - if( bmx == omx && bmy == omy ) + COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1], + hex2[odir+1][0], hex2[odir+1][1], + hex2[odir+2][0], hex2[odir+2][1], + costs ); + dir = -2; + COPY2_IF_LT( bcost, costs[0], dir, odir-1 ); + COPY2_IF_LT( bcost, costs[1], dir, odir ); + COPY2_IF_LT( bcost, costs[2], dir, odir+1 ); + if( dir == -2 ) break; + bmx += hex2[dir+1][0]; + bmy += hex2[dir+1][1]; } } #endif /* square refine */ - DIA1_ITER( bmx, bmy ); - COST_MV( omx-1, omy-1 ); - COST_MV( omx-1, omy+1 ); - COST_MV( omx+1, omy-1 ); - COST_MV( omx+1, omy+1 ); + omx = bmx; omy = bmy; + COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 ); + COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 ); break; case X264_ME_UMH: @@ -267,14 +300,16 @@ me_hex2: #define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) ) if( bcost == ucost2 && SAD_THRESH(2000) ) { - DIA2; + COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 ); + COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 ); if( bcost == ucost1 && SAD_THRESH(500) ) break; if( bcost == ucost2 ) { int range = (i_me_range>>1) | 1; CROSS( 3, range, range ); - OCT2; + COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 ); + COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 ); if( bcost == ucost2 ) break; cross_start = range + 2; @@ -282,7 +317,7 @@ me_hex2: } /* adaptive search range */ - if( i_mvc ) + if( i_mvc ) { /* range multipliers based on casual inspection of some statistics of * average distance between current predictor and final mv found by ESA. @@ -342,18 +377,13 @@ me_hex2: /* 5x5 ESA */ omx = bmx; omy = bmy; - for( i = (bcost == ucost2) ? 
4 : 0; i < 24; i++ ) - { - static const int square2[24][2] = { - { 1, 0}, { 0, 1}, {-1, 0}, { 0,-1}, - { 1, 1}, {-1, 1}, {-1,-1}, { 1,-1}, - { 2,-1}, { 2, 0}, { 2, 1}, { 2, 2}, - { 1, 2}, { 0, 2}, {-1, 2}, {-2, 2}, - {-2, 1}, {-2, 0}, {-2,-1}, {-2,-2}, - {-1,-2}, { 0,-2}, { 1,-2}, { 2,-2} - }; - COST_MV( omx + square2[i][0], omy + square2[i][1] ); - } + if( bcost != ucost2 ) + COST_MV_X4( 1, 0, 0, 1, -1, 0, 0,-1 ); + COST_MV_X4( 1, 1, -1, 1, -1,-1, 1,-1 ); + COST_MV_X4( 2,-1, 2, 0, 2, 1, 2, 2 ); + COST_MV_X4( 1, 2, 0, 2, -1, 2, -2, 2 ); + COST_MV_X4( -2, 1, -2, 0, -2,-1, -2,-2 ); + COST_MV_X4( -1,-2, 0,-2, 1,-2, 2,-2 ); /* hexagon grid */ omx = bmx; omy = bmy; @@ -365,29 +395,25 @@ me_hex2: { 2, 3}, { 0, 4}, {-2, 3}, {-2,-3}, { 0,-4}, { 2,-3}, }; - const int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min ); - if( h->pixf.sad_pde[i_pixel] ) + if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min, + mv_y_max-omy, omy-mv_y_min ) ) { for( j = 0; j < 16; j++ ) { int mx = omx + hex4[j][0]*i; int my = omy + hex4[j][1]*i; - if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max - && my >= mv_y_min && my <= mv_y_max ) ) - COST_MV_PDE( mx, my ); + if( mx >= mv_x_min && mx <= mv_x_max + && my >= mv_y_min && my <= mv_y_max ) + COST_MV( mx, my ); } } else { - for( j = 0; j < 16; j++ ) - { - int mx = omx + hex4[j][0]*i; - int my = omy + hex4[j][1]*i; - if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max - && my >= mv_y_min && my <= mv_y_max ) ) - COST_MV( mx, my ); - } + COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i ); + COST_MV_X4( -4*i,-2*i, 4*i,-2*i, 4*i,-1*i, 4*i, 0*i ); + COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i ); + COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i ); } } goto me_hex2; @@ -423,7 +449,7 @@ me_hex2: const uint16_t *integral = &integral_base[ mx + my * stride ]; const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ] - integral[ dw ] - integral[ dh ]; - const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ]; + const int bsad = bcost - BITS_MVD(mx,my); if( abs( ref_dc - enc_dc ) < bsad ) COST_MV_PDE( mx, my ); } @@ -436,7 +462,7 @@ me_hex2: const uint16_t *integral = &integral_base[ mx + my * stride ]; const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ] - integral[ dw ] - integral[ dh ]; - const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ]; + const int bsad = bcost - BITS_MVD(mx,my); if( abs( ref_dc - enc_dc ) < bsad ) COST_MV( mx, my ); } diff --git a/tools/checkasm.c b/tools/checkasm.c index 04674759..680d4706 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -61,6 +61,36 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL( ssd ); TEST_PIXEL( satd ); +#define TEST_PIXEL_X( N ) \ + for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \ + { \ + int res_c[4]={0}, res_asm[4]={0}; \ + if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ + { \ + used_asm = 1; \ + res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 24 ); \ + res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 24 ); \ + res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 24 ); \ + if(N==4) \ + { \ + res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 24 ); \ + pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 24, res_asm ); \ + } \ + else \ + pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 24, res_asm ); \ + if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ + { \ + ok = 0; \ + fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ + i, res_c[0], res_c[1], res_c[2], res_c[3], \ + 
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ + } \ + } \ + } \ + report( "pixel sad_x"#N" :" ); + + TEST_PIXEL_X(3); + TEST_PIXEL_X(4); return ret; }
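
Note (not part of the patch): the new sad_x3/sad_x4 entry points score several candidate blocks against one encoded block in a single call, which is what lets the assembly keep the fenc rows in registers and interleave the psadbw chains instead of reloading them per candidate. Below is a minimal C sketch of that contract, equivalent in behaviour to the SAD_X fallback macro this patch adds to common/pixel.c; the names sad_wxh and sad_x4_16x16_ref are illustrative only, not symbols from the tree.

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16

    /* Plain SAD of a w x h block: the scalar reference that the
     * x3/x4 versions must agree with. */
    static int sad_wxh( uint8_t *pix1, int i_stride1,
                        uint8_t *pix2, int i_stride2, int w, int h )
    {
        int sum = 0, x, y;
        for( y = 0; y < h; y++ )
        {
            for( x = 0; x < w; x++ )
                sum += abs( pix1[x] - pix2[x] );
            pix1 += i_stride1;
            pix2 += i_stride2;
        }
        return sum;
    }

    /* Contract of x264_pixel_sad_x4_16x16: one encoded block (stride
     * FENC_STRIDE) scored against four candidate positions that share a
     * single reference stride; the four SADs come back in scores[].
     * The MMX/SSE2 versions in this patch compute the same four sums,
     * but interleaved two rows at a time. */
    static void sad_x4_16x16_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                                  uint8_t *pix2, uint8_t *pix3,
                                  int i_stride, int scores[4] )
    {
        scores[0] = sad_wxh( fenc, FENC_STRIDE, pix0, i_stride, 16, 16 );
        scores[1] = sad_wxh( fenc, FENC_STRIDE, pix1, i_stride, 16, 16 );
        scores[2] = sad_wxh( fenc, FENC_STRIDE, pix2, i_stride, 16, 16 );
        scores[3] = sad_wxh( fenc, FENC_STRIDE, pix3, i_stride, 16, 16 );
    }

This is the shape of call that the reworked COST_MV_X4 macro in encoder/me.c relies on: one function-pointer call evaluates a whole diamond or square of motion-vector candidates around (omx, omy), and the MV-cost terms and COPY3_IF_LT updates are folded in afterwards, which is where the quoted 15% fullpel speedup comes from.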