From: Loren Merritt Date: Mon, 10 Apr 2006 03:03:13 +0000 (+0000) Subject: interleave multiple calls to SAD. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8947b51f35151f821c3718b01c1e93d517d814b5;p=libx264 interleave multiple calls to SAD. 15% faster fullpel motion estimation. git-svn-id: svn://svn.videolan.org/x264/trunk@490 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm index d107a37f..9edb94c4 100644 --- a/common/amd64/pixel-a.asm +++ b/common/amd64/pixel-a.asm @@ -29,6 +29,8 @@ BITS 64 %include "amd64inc.asm" +; sad + %macro SAD_INC_2x16P 0 movq mm1, [parm1q] movq mm2, [parm1q+8] @@ -72,6 +74,177 @@ BITS 64 lea parm3q, [parm3q+2*parm4q] %endmacro +; sad x3 / x4 + +%macro SAD_X3_START_1x8P 1 + mov%1 mm3, [parm1q] + mov%1 mm0, [parm2q] + mov%1 mm1, [parm3q] + mov%1 mm2, [parm4q] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 3 + mov%1 mm3, [parm1q+%2] + mov%1 mm4, [parm2q+%3] + mov%1 mm5, [parm3q+%3] + mov%1 mm6, [parm4q+%3] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, 8, 8 + SAD_X3_1x8P q, FENC_STRIDE, parm5q + SAD_X3_1x8P q, FENC_STRIDE+8, parm5q+8 + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, FENC_STRIDE, parm5q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_1x8P d +%else + SAD_X3_1x8P d, 0, 0 +%endif + SAD_X3_1x8P d, FENC_STRIDE, parm5q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm5q] + lea parm3q, [parm3q+2*parm5q] + lea parm4q, [parm4q+2*parm5q] +%endmacro + +%macro SAD_X4_START_1x8P 1 + mov%1 mm7, [parm1q] + mov%1 mm0, [parm2q] + mov%1 mm1, [parm3q] + mov%1 mm2, [parm4q] + mov%1 mm3, [parm5q] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_1x8P 2 + movq mm7, [parm1q+%1] + movq mm4, [parm2q+%2] + movq mm5, [parm3q+%2] + movq mm6, [parm4q+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [parm5q+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_1x4P 2 + movd mm7, [parm1q+%1] + movd mm4, [parm2q+%2] + movd mm5, [parm3q+%2] + movd mm6, [parm4q+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + psadbw mm6, mm7 + movd mm4, [parm5q+%2] + paddw mm1, mm5 + psadbw mm4, mm7 + paddw mm2, mm6 + paddw mm3, mm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, parm6q + SAD_X4_1x8P FENC_STRIDE+8, parm6q+8 + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, parm6q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_1x8P d +%else + SAD_X4_1x4P 0, 0 +%endif + 
SAD_X4_1x4P FENC_STRIDE, parm6q + add parm1q, 2*FENC_STRIDE + lea parm2q, [parm2q+2*parm6q] + lea parm3q, [parm3q+2*parm6q] + lea parm4q, [parm4q+2*parm6q] + lea parm5q, [parm5q+2*parm6q] +%endmacro + +%macro SAD_X3_END 0 + movd [parm6q+0], mm0 + movd [parm6q+4], mm1 + movd [parm6q+8], mm2 + ret +%endmacro + +%macro SAD_X4_END 0 + mov rax, parm7q + movd [rax+0], mm0 + movd [rax+4], mm1 + movd [rax+8], mm2 + movd [rax+12], mm3 + ret +%endmacro + +; ssd + %macro SSD_INC_1x16P 0 movq mm1, [rax] movq mm2, [rcx] @@ -168,6 +341,8 @@ BITS 64 SSD_INC_1x4P %endmacro +; satd + %macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2] movd %1, %3 movd %2, %4 @@ -262,6 +437,22 @@ cglobal x264_pixel_sad_8x4_mmxext cglobal x264_pixel_sad_4x8_mmxext cglobal x264_pixel_sad_4x4_mmxext +cglobal x264_pixel_sad_x3_16x16_mmxext +cglobal x264_pixel_sad_x3_16x8_mmxext +cglobal x264_pixel_sad_x3_8x16_mmxext +cglobal x264_pixel_sad_x3_8x8_mmxext +cglobal x264_pixel_sad_x3_8x4_mmxext +cglobal x264_pixel_sad_x3_4x8_mmxext +cglobal x264_pixel_sad_x3_4x4_mmxext + +cglobal x264_pixel_sad_x4_16x16_mmxext +cglobal x264_pixel_sad_x4_16x8_mmxext +cglobal x264_pixel_sad_x4_8x16_mmxext +cglobal x264_pixel_sad_x4_8x8_mmxext +cglobal x264_pixel_sad_x4_8x4_mmxext +cglobal x264_pixel_sad_x4_4x8_mmxext +cglobal x264_pixel_sad_x4_4x4_mmxext + cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext @@ -380,6 +571,35 @@ x264_pixel_sad_4x4_mmxext: SAD_END +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_mmxext: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + %macro PDE_CHECK 0 movd eax, mm0 diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index 8b635470..019cc3e6 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -29,6 +29,8 @@ BITS 32 %include "i386inc.asm" +; sad + %macro SAD_INC_2x16P 0 movq mm1, [eax] movq mm2, [eax+8] @@ -72,6 +74,199 @@ BITS 32 lea ecx, [ecx+2*edx] %endmacro +; sad x3 / x4 + +%macro SAD_X3_START_1x8P 1 + push edi + push esi + mov edi, [esp+12] + mov eax, [esp+16] + mov ecx, [esp+20] + mov edx, [esp+24] + mov esi, [esp+28] + mov%1 mm3, [edi] + mov%1 mm0, [eax] + mov%1 mm1, [ecx] + mov%1 mm2, [edx] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 3 + mov%1 mm3, [edi+%2] + mov%1 mm4, [eax+%3] + mov%1 mm5, [ecx+%3] + mov%1 mm6, [edx+%3] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, 8, 8 + SAD_X3_1x8P q, FENC_STRIDE, esi + SAD_X3_1x8P q, FENC_STRIDE+8, esi+8 + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P q +%else + SAD_X3_1x8P q, 0, 0 +%endif + SAD_X3_1x8P q, FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] 
+%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_1x8P d +%else + SAD_X3_1x8P d, 0, 0 +%endif + SAD_X3_1x8P d, FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_START_1x8P 1 + push edi + push esi + push ebx + mov edi, [esp+16] + mov eax, [esp+20] + mov ebx, [esp+24] + mov ecx, [esp+28] + mov edx, [esp+32] + mov esi, [esp+36] + mov%1 mm7, [edi] + mov%1 mm0, [eax] + mov%1 mm1, [ebx] + mov%1 mm2, [ecx] + mov%1 mm3, [edx] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_1x8P 2 + movq mm7, [edi+%1] + movq mm4, [eax+%2] + movq mm5, [ebx+%2] + movq mm6, [ecx+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [edx+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_1x4P 2 + movd mm7, [edi+%1] + movd mm4, [eax+%2] + movd mm5, [ebx+%2] + movd mm6, [ecx+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + psadbw mm6, mm7 + movd mm4, [edx+%2] + paddw mm1, mm5 + psadbw mm4, mm7 + paddw mm2, mm6 + paddw mm3, mm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, esi + SAD_X4_1x8P FENC_STRIDE+8, esi+8 + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P q +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_1x8P d +%else + SAD_X4_1x4P 0, 0 +%endif + SAD_X4_1x4P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_END 0 + mov eax, [esp+32] + movd [eax+0], mm0 + movd [eax+4], mm1 + movd [eax+8], mm2 + pop esi + pop edi + ret +%endmacro + +%macro SAD_X4_END 0 + mov eax, [esp+40] + movd [eax+0], mm0 + movd [eax+4], mm1 + movd [eax+8], mm2 + movd [eax+12], mm3 + pop ebx + pop esi + pop edi + ret +%endmacro + +; ssd + %macro SSD_INC_1x16P 0 movq mm1, [eax] movq mm2, [ecx] @@ -168,6 +363,8 @@ BITS 32 SSD_INC_1x4P %endmacro +; satd + %macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2] movd %1, %3 movd %2, %4 @@ -262,6 +459,22 @@ cglobal x264_pixel_sad_8x4_mmxext cglobal x264_pixel_sad_4x8_mmxext cglobal x264_pixel_sad_4x4_mmxext +cglobal x264_pixel_sad_x3_16x16_mmxext +cglobal x264_pixel_sad_x3_16x8_mmxext +cglobal x264_pixel_sad_x3_8x16_mmxext +cglobal x264_pixel_sad_x3_8x8_mmxext +cglobal x264_pixel_sad_x3_8x4_mmxext +cglobal x264_pixel_sad_x3_4x8_mmxext +cglobal x264_pixel_sad_x3_4x4_mmxext + +cglobal x264_pixel_sad_x4_16x16_mmxext +cglobal x264_pixel_sad_x4_16x8_mmxext +cglobal x264_pixel_sad_x4_8x16_mmxext +cglobal x264_pixel_sad_x4_8x8_mmxext +cglobal x264_pixel_sad_x4_8x4_mmxext +cglobal x264_pixel_sad_x4_4x8_mmxext +cglobal x264_pixel_sad_x4_4x4_mmxext + cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext @@ -388,6 +601,36 @@ x264_pixel_sad_4x4_mmxext: SAD_END +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) 
+;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_mmxext: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + + %macro PDE_CHECK 0 movd ebx, mm0 cmp ebx, [esp+24] ; prev_score diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index e89c3895..66d5aa0c 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -38,6 +38,10 @@ SECTION .text cglobal x264_pixel_sad_16x16_sse2 cglobal x264_pixel_sad_16x8_sse2 +cglobal x264_pixel_sad_x3_16x16_sse2 +cglobal x264_pixel_sad_x3_16x8_sse2 +cglobal x264_pixel_sad_x4_16x16_sse2 +cglobal x264_pixel_sad_x4_16x8_sse2 cglobal x264_pixel_ssd_16x16_sse2 cglobal x264_pixel_ssd_16x8_sse2 cglobal x264_pixel_satd_8x4_sse2 @@ -164,6 +168,158 @@ x264_pixel_sad_16x8_sse2: SAD_INC_4x16P_SSE2 SAD_END_SSE2 + +%macro SAD_X3_START_1x16P 0 + push edi + push esi + mov edi, [esp+12] + mov eax, [esp+16] + mov ecx, [esp+20] + mov edx, [esp+24] + mov esi, [esp+28] + movdqa xmm3, [edi] + movdqu xmm0, [eax] + movdqu xmm1, [ecx] + movdqu xmm2, [edx] + psadbw xmm0, xmm3 + psadbw xmm1, xmm3 + psadbw xmm2, xmm3 +%endmacro + +%macro SAD_X3_1x16P 2 + movdqa xmm3, [edi+%1] + movdqu xmm4, [eax+%2] + movdqu xmm5, [ecx+%2] + movdqu xmm6, [edx+%2] + psadbw xmm4, xmm3 + psadbw xmm5, xmm3 + psadbw xmm6, xmm3 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x16P +%else + SAD_X3_1x16P 0, 0 +%endif + SAD_X3_1x16P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X4_START_1x16P 0 + push edi + push esi + push ebx + mov edi, [esp+16] + mov eax, [esp+20] + mov ebx, [esp+24] + mov ecx, [esp+28] + mov edx, [esp+32] + mov esi, [esp+36] + movdqa xmm7, [edi] + movdqu xmm0, [eax] + movdqu xmm1, [ebx] + movdqu xmm2, [ecx] + movdqu xmm3, [edx] + psadbw xmm0, xmm7 + psadbw xmm1, xmm7 + psadbw xmm2, xmm7 + psadbw xmm3, xmm7 +%endmacro + +%macro SAD_X4_1x16P 2 + movdqa xmm7, [edi+%1] + movdqu xmm4, [eax+%2] + movdqu xmm5, [ebx+%2] + movdqu xmm6, [ecx+%2] + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + paddw xmm0, xmm4 + psadbw xmm6, xmm7 + movdqu xmm4, [edx+%2] + paddw xmm1, xmm5 + psadbw xmm4, xmm7 + paddw xmm2, xmm6 + paddw xmm3, xmm4 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x16P +%else + SAD_X4_1x16P 0, 0 +%endif + SAD_X4_1x16P FENC_STRIDE, esi + add edi, 2*FENC_STRIDE + lea eax, [eax+2*esi] + lea ebx, [ebx+2*esi] + lea ecx, [ecx+2*esi] + lea edx, [edx+2*esi] +%endmacro + +%macro SAD_X3_END 0 + mov eax, [esp+32] + pshufd xmm4, xmm0, 2 + pshufd xmm5, xmm1, 2 + pshufd xmm6, xmm2, 2 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + movd [eax+0], xmm0 + movd [eax+4], xmm1 + movd [eax+8], xmm2 + pop esi + pop edi + ret +%endmacro + +%macro SAD_X4_END 0 + mov eax, [esp+40] + pshufd xmm4, xmm0, 2 + pshufd xmm5, xmm1, 2 + pshufd xmm6, xmm2, 2 + pshufd xmm7, xmm3, 2 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + paddw xmm3, xmm7 + movd [eax+0], xmm0 + movd [eax+4], xmm1 + movd [eax+8], xmm2 + movd [eax+12], xmm3 + pop ebx + pop esi + pop edi + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void 
x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +ALIGN 16 +x264_pixel_sad_x%1_%2x%3_sse2: + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 + + %macro SSD_INC_2x16P_SSE2 0 movdqu xmm1, [eax] movdqu xmm2, [ecx] diff --git a/common/i386/pixel.h b/common/i386/pixel.h index df7ea616..c0f9f3e4 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -32,6 +32,21 @@ int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int ); +void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); + int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int ); @@ -55,6 +70,11 @@ int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int ); +void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); +void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); + int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int ); diff --git a/common/pixel.c b/common/pixel.c index 9eb4f933..4e3e7870 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -59,13 +59,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ } -PIXEL_SAD_C( pixel_sad_16x16, 16, 16 ) -PIXEL_SAD_C( pixel_sad_16x8, 16, 8 ) -PIXEL_SAD_C( pixel_sad_8x16, 8, 16 ) -PIXEL_SAD_C( pixel_sad_8x8, 8, 8 ) 
-PIXEL_SAD_C( pixel_sad_8x4, 8, 4 ) -PIXEL_SAD_C( pixel_sad_4x8, 4, 8 ) -PIXEL_SAD_C( pixel_sad_4x4, 4, 4 ) +PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 ) +PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) +PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) +PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) +PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) +PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) +PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) /**************************************************************************** @@ -90,13 +90,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ return i_sum; \ } -PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 ) -PIXEL_SSD_C( pixel_ssd_16x8, 16, 8 ) -PIXEL_SSD_C( pixel_ssd_8x16, 8, 16 ) -PIXEL_SSD_C( pixel_ssd_8x8, 8, 8 ) -PIXEL_SSD_C( pixel_ssd_8x4, 8, 4 ) -PIXEL_SSD_C( pixel_ssd_4x8, 4, 8 ) -PIXEL_SSD_C( pixel_ssd_4x4, 4, 4 ) +PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 ) +PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) +PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) +PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) +PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) { @@ -207,13 +207,13 @@ static int name( uint8_t *pix1, int i_stride_pix1, \ { \ return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \ } -PIXEL_SATD_C( pixel_satd_16x16, 16, 16 ) -PIXEL_SATD_C( pixel_satd_16x8, 16, 8 ) -PIXEL_SATD_C( pixel_satd_8x16, 8, 16 ) -PIXEL_SATD_C( pixel_satd_8x8, 8, 8 ) -PIXEL_SATD_C( pixel_satd_8x4, 8, 4 ) -PIXEL_SATD_C( pixel_satd_4x8, 4, 8 ) -PIXEL_SATD_C( pixel_satd_4x4, 4, 4 ) +PIXEL_SATD_C( x264_pixel_satd_16x16, 16, 16 ) +PIXEL_SATD_C( x264_pixel_satd_16x8, 16, 8 ) +PIXEL_SATD_C( x264_pixel_satd_8x16, 8, 16 ) +PIXEL_SATD_C( x264_pixel_satd_8x8, 8, 8 ) +PIXEL_SATD_C( x264_pixel_satd_8x4, 8, 4 ) +PIXEL_SATD_C( x264_pixel_satd_4x8, 4, 8 ) +PIXEL_SATD_C( x264_pixel_satd_4x4, 4, 4 ) /**************************************************************************** @@ -282,8 +282,8 @@ static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int } #define PIXEL_SA8D_C( width, height ) \ -static int pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \ - uint8_t *pix2, int i_stride_pix2 ) \ +static int x264_pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \ + uint8_t *pix2, int i_stride_pix2 ) \ { \ return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \ } @@ -292,74 +292,79 @@ PIXEL_SA8D_C( 16, 8 ) PIXEL_SA8D_C( 8, 16 ) PIXEL_SA8D_C( 8, 8 ) +#define SAD_X( size ) \ +static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +{\ + scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ +}\ +static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +{\ + scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ + scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\ +} + +SAD_X( 16x16 ) +SAD_X( 16x8 ) +SAD_X( 8x16 ) 
+SAD_X( 8x8 ) +SAD_X( 8x4 ) +SAD_X( 4x8 ) +SAD_X( 4x4 ) + +#ifdef ARCH_UltraSparc +SAD_X( 16x16_vis ) +SAD_X( 16x8_vis ) +SAD_X( 8x16_vis ) +SAD_X( 8x8_vis ) +#endif /**************************************************************************** * x264_pixel_init: ****************************************************************************/ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { - pixf->sad[PIXEL_16x16] = pixel_sad_16x16; - pixf->sad[PIXEL_16x8] = pixel_sad_16x8; - pixf->sad[PIXEL_8x16] = pixel_sad_8x16; - pixf->sad[PIXEL_8x8] = pixel_sad_8x8; - pixf->sad[PIXEL_8x4] = pixel_sad_8x4; - pixf->sad[PIXEL_4x8] = pixel_sad_4x8; - pixf->sad[PIXEL_4x4] = pixel_sad_4x4; - - pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16; - pixf->ssd[PIXEL_16x8] = pixel_ssd_16x8; - pixf->ssd[PIXEL_8x16] = pixel_ssd_8x16; - pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8; - pixf->ssd[PIXEL_8x4] = pixel_ssd_8x4; - pixf->ssd[PIXEL_4x8] = pixel_ssd_4x8; - pixf->ssd[PIXEL_4x4] = pixel_ssd_4x4; - - pixf->satd[PIXEL_16x16]= pixel_satd_16x16; - pixf->satd[PIXEL_16x8] = pixel_satd_16x8; - pixf->satd[PIXEL_8x16] = pixel_satd_8x16; - pixf->satd[PIXEL_8x8] = pixel_satd_8x8; - pixf->satd[PIXEL_8x4] = pixel_satd_8x4; - pixf->satd[PIXEL_4x8] = pixel_satd_4x8; - pixf->satd[PIXEL_4x4] = pixel_satd_4x4; - - pixf->sa8d[PIXEL_16x16]= pixel_sa8d_16x16; - pixf->sa8d[PIXEL_16x8] = pixel_sa8d_16x8; - pixf->sa8d[PIXEL_8x16] = pixel_sa8d_8x16; - pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8; + memset( pixf, 0, sizeof(*pixf) ); + +#define INIT( name, cpu ) \ + pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\ + pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu;\ + pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\ + pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu;\ + pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu;\ + pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\ + pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu; + + INIT( sad, ); + INIT( sad_x3, ); + INIT( sad_x4, ); + INIT( ssd, ); + INIT( satd, ); + + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16; + pixf->sa8d[PIXEL_16x8] = x264_pixel_sa8d_16x8; + pixf->sa8d[PIXEL_8x16] = x264_pixel_sa8d_8x16; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMX ) { - pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmx; - pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmx; - pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmx; - pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmx; - pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmx; - pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmx; - pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmx; + INIT( ssd, _mmx ); } if( cpu&X264_CPU_MMXEXT ) { - pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext; - pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext; - pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext; - pixf->sad[PIXEL_8x8 ] = x264_pixel_sad_8x8_mmxext; - pixf->sad[PIXEL_8x4 ] = x264_pixel_sad_8x4_mmxext; - pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext; - pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext; + INIT( sad, _mmxext ); + INIT( sad_x3, _mmxext ); + INIT( sad_x4, _mmxext ); + INIT( satd, _mmxext ); pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext; pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext; pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext; - - pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext; - pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext; - pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext; - pixf->satd[PIXEL_8x8] = 
x264_pixel_satd_8x8_mmxext; - pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_mmxext; - pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_mmxext; - pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_mmxext; } #endif @@ -375,6 +380,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2; pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2; pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2; + +#ifndef ARCH_X86_64 + pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2; + pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2; + + pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_sse2; + pixf->sad_x4[PIXEL_16x8 ] = x264_pixel_sad_x4_16x8_sse2; +#endif } // these are faster on both Intel and AMD if( cpu&X264_CPU_SSE2 ) @@ -395,6 +408,16 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_vis; pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_vis; pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis; + + pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_vis; + pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_vis; + pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_vis; + pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_vis; + + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_vis; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_vis; + pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis; + pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis; #endif } diff --git a/common/pixel.h b/common/pixel.h index 2300bc0c..18a0746f 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -26,6 +26,8 @@ typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int ); typedef int (*x264_pixel_cmp_pde_t) ( uint8_t *, int, uint8_t *, int, int ); +typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] ); +typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] ); enum { @@ -72,6 +74,10 @@ typedef struct * terminate early if partial score is worse than a threshold. * may be NULL, in which case just use sad instead. */ x264_pixel_cmp_pde_t sad_pde[7]; + + /* multiple parallel calls to sad. 
*/ + x264_pixel_cmp_x3_t sad_x3[7]; + x264_pixel_cmp_x4_t sad_x4[7]; } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 6aaf73c4..f1bee711 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -594,6 +594,26 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, return i_satd / 2; } +#define SAD_X( size ) \ +static void pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +{\ + scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ +}\ +static void pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +{\ + scores[0] = pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ + scores[3] = pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\ +} + +SAD_X( 16x16_altivec ) +SAD_X( 16x8_altivec ) +SAD_X( 8x16_altivec ) +SAD_X( 8x8_altivec ) + /**************************************************************************** * x264_pixel_init: ****************************************************************************/ @@ -604,6 +624,16 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf ) pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec; pixf->sad[PIXEL_8x8] = pixel_sad_8x8_altivec; + pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec; + pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_altivec; + pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_altivec; + pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_altivec; + + pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec; + pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_altivec; + pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_altivec; + pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_altivec; + pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec; pixf->satd[PIXEL_8x16] = pixel_satd_8x16_altivec; pixf->satd[PIXEL_16x8] = pixel_satd_16x8_altivec; diff --git a/encoder/me.c b/encoder/me.c index 66bfdfc9..479949b8 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -44,69 +44,90 @@ static const int subpel_iterations[][4] = static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); -#define COST_MV_INT( mx, my, bd, d ) \ +#define BITS_MVD( mx, my )\ + (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) + +#define COST_MV( mx, my ) \ { \ int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \ - + p_cost_mvx[ (mx)<<2 ] \ - + p_cost_mvy[ (my)<<2 ]; \ + + BITS_MVD(mx,my); \ if( cost < bcost ) \ { \ bcost = cost; \ bmx = mx; \ bmy = my; \ - if( bd ) \ - dir = d; \ } \ } -#define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 ) -#define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d ) #define COST_MV_PDE( mx, my ) \ { \ int cost = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], FENC_STRIDE, \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0], \ bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ); \ - if( cost < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ) \ - { \ - bcost = cost + p_cost_mvx[ (mx)<<2 ] + p_cost_mvy[ (my)<<2 ]; \ - bmx = mx; \ - bmy = my; \ + if( cost < bcost - BITS_MVD(mx,my) ) \ + { \ + bcost = cost + 
BITS_MVD(mx,my); \ + bmx = mx; \ + bmy = my; \ } \ } -#define DIA1_ITER( mx, my )\ - {\ - omx = mx; omy = my;\ - COST_MV( omx , omy-1 );/* 1 */\ - COST_MV( omx , omy+1 );/* 101 */\ - COST_MV( omx-1, omy );/* 1 */\ - COST_MV( omx+1, omy );\ - } +#define COPY2_IF_LT(x,y,a,b)\ +if((y)<(x))\ +{\ + (x)=(y);\ + (a)=(b);\ +} -#define DIA2 \ - {\ - COST_MV( omx , omy-2 );\ - COST_MV( omx-1, omy-1 );/* 1 */\ - COST_MV( omx+1, omy-1 );/* 1 1 */\ - COST_MV( omx-2, omy );/* 1 0 1 */\ - COST_MV( omx+2, omy );/* 1 1 */\ - COST_MV( omx-1, omy+1 );/* 1 */\ - COST_MV( omx+1, omy+1 );\ - COST_MV( omx , omy+2 );\ - }\ - -#define OCT2 \ - {\ - COST_MV( omx-1, omy-2 );\ - COST_MV( omx+1, omy-2 );/* 1 1 */\ - COST_MV( omx-2, omy-1 );/* 1 1 */\ - COST_MV( omx+2, omy-1 );/* 0 */\ - COST_MV( omx-2, omy+1 );/* 1 1 */\ - COST_MV( omx+2, omy+1 );/* 1 1 */\ - COST_MV( omx-1, omy+2 );\ - COST_MV( omx+1, omy+2 );\ - } +#define COPY3_IF_LT(x,y,a,b,c,d)\ +if((y)<(x))\ +{\ + (x)=(y);\ + (a)=(b);\ + (c)=(d);\ +} + +#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ +{\ + uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\ + h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\ + pix_base + (m0x) + (m0y)*m->i_stride[0],\ + pix_base + (m1x) + (m1y)*m->i_stride[0],\ + pix_base + (m2x) + (m2y)*m->i_stride[0],\ + m->i_stride[0], costs );\ + (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ + (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ + (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ +} + +#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ +{\ + uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\ + h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\ + pix_base + (m0x) + (m0y)*m->i_stride[0],\ + pix_base + (m1x) + (m1y)*m->i_stride[0],\ + pix_base + (m2x) + (m2y)*m->i_stride[0],\ + pix_base + (m3x) + (m3y)*m->i_stride[0],\ + m->i_stride[0], costs );\ + costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\ + costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\ + costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\ + costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\ + COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\ + COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\ + COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\ + COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\ +} + +/* 1 */ +/* 101 */ +/* 1 */ +#define DIA1_ITER( mx, my )\ +{\ + omx = mx; omy = my;\ + COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\ +} #define CROSS( start, x_max, y_max ) \ { \ @@ -136,6 +157,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int uint8_t *p_fref = m->p_fref[0]; int i, j; int dir; + int costs[6]; int mv_x_min = h->mb.mv_min_fpel[0]; int mv_y_min = h->mb.mv_min_fpel[1]; @@ -157,7 +179,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int bcost = COST_MAX; COST_MV( pmx, pmy ); /* I don't know why this helps */ - bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ]; + bcost -= BITS_MVD(bmx,bmy); /* try extra predictors if provided */ for( i = 0; i < i_mvc; i++ ) @@ -167,7 +189,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int if( mx != bmx || my != bmy ) COST_MV( mx, my ); } - + COST_MV( 0, 0 ); mv_x_max += 8; @@ -205,36 +227,47 @@ me_hex2: } #else /* equivalent to the above, but eliminates duplicate candidates */ - dir = -1; - omx = bmx; omy = bmy; - COST_MV_DIR( omx-2, omy, 0 ); - COST_MV_DIR( omx-1, omy+2, 1 ); - COST_MV_DIR( omx+1, omy+2, 2 ); - COST_MV_DIR( omx+2, omy, 3 ); - COST_MV_DIR( omx+1, omy-2, 4 ); 
- COST_MV_DIR( omx-1, omy-2, 5 ); - if( dir != -1 ) + dir = -2; + + /* hexagon */ + COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs ); + COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 ); + COPY2_IF_LT( bcost, costs[0], dir, 0 ); + COPY2_IF_LT( bcost, costs[1], dir, 1 ); + COPY2_IF_LT( bcost, costs[2], dir, 2 ); + COPY2_IF_LT( bcost, costs[3], dir, 3 ); + COPY2_IF_LT( bcost, costs[4], dir, 4 ); + COPY2_IF_LT( bcost, costs[5], dir, 5 ); + + if( dir != -2 ) { + static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; + bmx += hex2[dir+1][0]; + bmy += hex2[dir+1][1]; + /* half hexagon, not overlapping the previous iteration */ for( i = 1; i < i_me_range/2; i++ ) { - static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; static const int mod6[8] = {5,0,1,2,3,4,5,0}; const int odir = mod6[dir+1]; - omx = bmx; omy = bmy; - COST_MV_DIR( omx + hex2[odir+0][0], omy + hex2[odir+0][1], odir-1 ); - COST_MV_DIR( omx + hex2[odir+1][0], omy + hex2[odir+1][1], odir ); - COST_MV_DIR( omx + hex2[odir+2][0], omy + hex2[odir+2][1], odir+1 ); - if( bmx == omx && bmy == omy ) + COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1], + hex2[odir+1][0], hex2[odir+1][1], + hex2[odir+2][0], hex2[odir+2][1], + costs ); + dir = -2; + COPY2_IF_LT( bcost, costs[0], dir, odir-1 ); + COPY2_IF_LT( bcost, costs[1], dir, odir ); + COPY2_IF_LT( bcost, costs[2], dir, odir+1 ); + if( dir == -2 ) break; + bmx += hex2[dir+1][0]; + bmy += hex2[dir+1][1]; } } #endif /* square refine */ - DIA1_ITER( bmx, bmy ); - COST_MV( omx-1, omy-1 ); - COST_MV( omx-1, omy+1 ); - COST_MV( omx+1, omy-1 ); - COST_MV( omx+1, omy+1 ); + omx = bmx; omy = bmy; + COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 ); + COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 ); break; case X264_ME_UMH: @@ -267,14 +300,16 @@ me_hex2: #define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) ) if( bcost == ucost2 && SAD_THRESH(2000) ) { - DIA2; + COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 ); + COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 ); if( bcost == ucost1 && SAD_THRESH(500) ) break; if( bcost == ucost2 ) { int range = (i_me_range>>1) | 1; CROSS( 3, range, range ); - OCT2; + COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 ); + COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 ); if( bcost == ucost2 ) break; cross_start = range + 2; @@ -282,7 +317,7 @@ me_hex2: } /* adaptive search range */ - if( i_mvc ) + if( i_mvc ) { /* range multipliers based on casual inspection of some statistics of * average distance between current predictor and final mv found by ESA. @@ -342,18 +377,13 @@ me_hex2: /* 5x5 ESA */ omx = bmx; omy = bmy; - for( i = (bcost == ucost2) ? 
4 : 0; i < 24; i++ ) - { - static const int square2[24][2] = { - { 1, 0}, { 0, 1}, {-1, 0}, { 0,-1}, - { 1, 1}, {-1, 1}, {-1,-1}, { 1,-1}, - { 2,-1}, { 2, 0}, { 2, 1}, { 2, 2}, - { 1, 2}, { 0, 2}, {-1, 2}, {-2, 2}, - {-2, 1}, {-2, 0}, {-2,-1}, {-2,-2}, - {-1,-2}, { 0,-2}, { 1,-2}, { 2,-2} - }; - COST_MV( omx + square2[i][0], omy + square2[i][1] ); - } + if( bcost != ucost2 ) + COST_MV_X4( 1, 0, 0, 1, -1, 0, 0,-1 ); + COST_MV_X4( 1, 1, -1, 1, -1,-1, 1,-1 ); + COST_MV_X4( 2,-1, 2, 0, 2, 1, 2, 2 ); + COST_MV_X4( 1, 2, 0, 2, -1, 2, -2, 2 ); + COST_MV_X4( -2, 1, -2, 0, -2,-1, -2,-2 ); + COST_MV_X4( -1,-2, 0,-2, 1,-2, 2,-2 ); /* hexagon grid */ omx = bmx; omy = bmy; @@ -365,29 +395,25 @@ me_hex2: { 2, 3}, { 0, 4}, {-2, 3}, {-2,-3}, { 0,-4}, { 2,-3}, }; - const int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min ); - if( h->pixf.sad_pde[i_pixel] ) + if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min, + mv_y_max-omy, omy-mv_y_min ) ) { for( j = 0; j < 16; j++ ) { int mx = omx + hex4[j][0]*i; int my = omy + hex4[j][1]*i; - if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max - && my >= mv_y_min && my <= mv_y_max ) ) - COST_MV_PDE( mx, my ); + if( mx >= mv_x_min && mx <= mv_x_max + && my >= mv_y_min && my <= mv_y_max ) + COST_MV( mx, my ); } } else { - for( j = 0; j < 16; j++ ) - { - int mx = omx + hex4[j][0]*i; - int my = omy + hex4[j][1]*i; - if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max - && my >= mv_y_min && my <= mv_y_max ) ) - COST_MV( mx, my ); - } + COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i ); + COST_MV_X4( -4*i,-2*i, 4*i,-2*i, 4*i,-1*i, 4*i, 0*i ); + COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i ); + COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i ); } } goto me_hex2; @@ -423,7 +449,7 @@ me_hex2: const uint16_t *integral = &integral_base[ mx + my * stride ]; const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ] - integral[ dw ] - integral[ dh ]; - const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ]; + const int bsad = bcost - BITS_MVD(mx,my); if( abs( ref_dc - enc_dc ) < bsad ) COST_MV_PDE( mx, my ); } @@ -436,7 +462,7 @@ me_hex2: const uint16_t *integral = &integral_base[ mx + my * stride ]; const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ] - integral[ dw ] - integral[ dh ]; - const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ]; + const int bsad = bcost - BITS_MVD(mx,my); if( abs( ref_dc - enc_dc ) < bsad ) COST_MV( mx, my ); } diff --git a/tools/checkasm.c b/tools/checkasm.c index 04674759..680d4706 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -61,6 +61,36 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL( ssd ); TEST_PIXEL( satd ); +#define TEST_PIXEL_X( N ) \ + for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \ + { \ + int res_c[4]={0}, res_asm[4]={0}; \ + if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ + { \ + used_asm = 1; \ + res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 24 ); \ + res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 24 ); \ + res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 24 ); \ + if(N==4) \ + { \ + res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 24 ); \ + pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 24, res_asm ); \ + } \ + else \ + pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 24, res_asm ); \ + if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ + { \ + ok = 0; \ + fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ + i, res_c[0], res_c[1], res_c[2], res_c[3], \ + 
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ + } \ + } \ + } \ + report( "pixel sad_x"#N" :" ); + + TEST_PIXEL_X(3); + TEST_PIXEL_X(4); return ret; }
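
Note (not part of the patch): the new sad_x3/sad_x4 entry points score several candidate blocks against one encoded block in a single call, which is what lets the assembly keep the fenc rows in registers and interleave the psadbw chains instead of reloading them per candidate. Below is a minimal C sketch of that contract, equivalent in behaviour to the SAD_X fallback macro this patch adds to common/pixel.c; the names sad_wxh and sad_x4_16x16_ref are illustrative only, not symbols from the tree.

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16

    /* Plain SAD of a w x h block: the scalar reference that the
     * x3/x4 versions must agree with. */
    static int sad_wxh( uint8_t *pix1, int i_stride1,
                        uint8_t *pix2, int i_stride2, int w, int h )
    {
        int sum = 0, x, y;
        for( y = 0; y < h; y++ )
        {
            for( x = 0; x < w; x++ )
                sum += abs( pix1[x] - pix2[x] );
            pix1 += i_stride1;
            pix2 += i_stride2;
        }
        return sum;
    }

    /* Contract of x264_pixel_sad_x4_16x16: one encoded block (stride
     * FENC_STRIDE) scored against four candidate positions that share a
     * single reference stride; the four SADs come back in scores[].
     * The MMX/SSE2 versions in this patch compute the same four sums,
     * but interleaved two rows at a time. */
    static void sad_x4_16x16_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                                  uint8_t *pix2, uint8_t *pix3,
                                  int i_stride, int scores[4] )
    {
        scores[0] = sad_wxh( fenc, FENC_STRIDE, pix0, i_stride, 16, 16 );
        scores[1] = sad_wxh( fenc, FENC_STRIDE, pix1, i_stride, 16, 16 );
        scores[2] = sad_wxh( fenc, FENC_STRIDE, pix2, i_stride, 16, 16 );
        scores[3] = sad_wxh( fenc, FENC_STRIDE, pix3, i_stride, 16, 16 );
    }

This is the shape of call that the reworked COST_MV_X4 macro in encoder/me.c relies on: one function-pointer call evaluates a whole diamond or square of motion-vector candidates around (omx, omy), and the MV-cost terms and COPY3_IF_LT updates are folded in afterwards, which is where the quoted 15% fullpel speedup comes from.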