From: Loren Merritt Date: Fri, 10 Feb 2006 21:58:43 +0000 (+0000) Subject: amd64 mmx for some intra pred functions X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=681b394485671f977a1a19d2279ace4c22eb0177;p=libx264 amd64 mmx for some intra pred functions git-svn-id: svn://svn.videolan.org/x264/trunk@429 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/Makefile b/Makefile index deefe36f..7580617f 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ endif # MMX/SSE optims ifeq ($(ARCH),X86_64) -SRCS += common/i386/mc-c.c common/i386/dct-c.c common/amd64/predict.c +SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \ common/amd64/pixel-a.asm common/amd64/mc-a.asm \ common/amd64/mc-a2.asm common/amd64/predict-a.asm \ diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm index a6bec9bd..0bb7fb07 100644 --- a/common/amd64/predict-a.asm +++ b/common/amd64/predict-a.asm @@ -3,6 +3,8 @@ ;***************************************************************************** ;* Copyright (C) 2005 x264 project ;* +;* Authors: Loren Merritt +;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or @@ -26,23 +28,139 @@ BITS 64 %include "amd64inc.asm" -;============================================================================= -; Macros -;============================================================================= - %macro SAVE_0_1 1 movq [%1] , mm0 movq [%1 + 8] , mm1 %endmacro +%macro SAVE_0_0 1 + movq [%1] , mm0 + movq [%1 + 8] , mm0 +%endmacro + + +SECTION .rodata align=16 + +ALIGN 8 +pw_2: times 4 dw 2 +pw_8: times 4 dw 8 +pb_1: times 8 db 1 +pw_3210: + dw 0 + dw 1 + dw 2 + dw 3 + ;============================================================================= ; Code ;============================================================================= SECTION .text +cglobal predict_8x8_v_mmxext +cglobal predict_8x8_dc_core_mmxext cglobal predict_8x8c_v_mmx +cglobal predict_8x8c_dc_core_mmxext +cglobal predict_8x8c_p_core_mmx +cglobal predict_16x16_p_core_mmx cglobal predict_16x16_v_mmx +cglobal predict_16x16_dc_core_mmxext +cglobal predict_16x16_dc_top_mmxext + + + +%macro PRED8x8_LOWPASS 2 + movq mm3, mm1 + pavgb mm1, mm2 + pxor mm2, mm3 + movq %1 , %2 + pand mm2, [pb_1 GLOBAL] + psubusb mm1, mm2 + pavgb %1 , mm1 ; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 +%endmacro + +%macro PRED8x8_LOAD_TOP 0 + sub parm1q, parm2q + + and parm3d, 12 + movq mm1, [parm1q-1] + movq mm2, [parm1q+1] + + cmp parm3d, byte 8 + jge .have_topleft + mov al, [parm1q] + mov ah, [parm1q] + pinsrw mm1, eax, 0 +.have_topleft: + + and parm3d, byte 4 + jne .have_topright + mov al, [parm1q+7] + mov ah, [parm1q+7] + pinsrw mm2, eax, 3 +.have_topright: + + PRED8x8_LOWPASS mm0, [parm1q] +%endmacro + +;----------------------------------------------------------------------------- +; +; void predict_8x8_v_mmxext( uint8_t *src, int i_stride, int i_neighbors ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8_v_mmxext: + PRED8x8_LOAD_TOP + + lea rax, [parm2q + 2*parm2q] + movq [parm1q + parm2q], mm0 ; 0 + movq [parm1q + 2*parm2q], mm0 ; 1 + movq [parm1q + 4*parm2q], mm0 ; 3 + movq [parm1q + 8*parm2q], mm0 ; 7 + add parm1q, rax + movq [parm1q], mm0 ; 2 + movq [parm1q + 2*parm2q], mm0 ; 4 + movq [parm1q + rax ], mm0 ; 5 + movq 
[parm1q + 4*parm2q], mm0 ; 6 + + ret + +;----------------------------------------------------------------------------- +; +; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, uint8_t *pix_left ); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8_dc_core_mmxext: + movq mm1, [parm4q-1] + movq mm2, [parm4q+1] + PRED8x8_LOWPASS mm4, [parm4q] + + PRED8x8_LOAD_TOP + + pxor mm1, mm1 + psadbw mm0, mm1 + psadbw mm4, mm1 + paddw mm0, [pw_8 GLOBAL] + paddw mm0, mm4 + psrlw mm0, 4 + pshufw mm0, mm0, 0 + packuswb mm0, mm0 + + lea rax, [parm2q + 2*parm2q] + movq [parm1q + parm2q], mm0 ; 0 + movq [parm1q + 2*parm2q], mm0 ; 1 + movq [parm1q + 4*parm2q], mm0 ; 3 + movq [parm1q + 8*parm2q], mm0 ; 7 + add parm1q, rax + movq [parm1q], mm0 ; 2 + movq [parm1q + 2*parm2q], mm0 ; 4 + movq [parm1q + rax ], mm0 ; 5 + movq [parm1q + 4*parm2q], mm0 ; 6 + + ret ;----------------------------------------------------------------------------- ; @@ -68,6 +186,154 @@ predict_8x8c_v_mmx : ret +;----------------------------------------------------------------------------- +; +; void predict_8x8c_dc_core_mmxext( uint8_t *src, int i_stride, int s2, int s3 ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8c_dc_core_mmxext: + sub parm1q, parm2q + lea rax, [parm2q + 2*parm2q] + + movq mm0, [parm1q] + pxor mm1, mm1 + pxor mm2, mm2 + punpckhbw mm1, mm0 + punpcklbw mm0, mm2 + psadbw mm1, mm2 ; s1 + psadbw mm0, mm2 ; s0 + + movd mm4, parm3d + movd mm5, parm4d + paddw mm0, mm4 + pshufw mm2, mm5, 0 + psrlw mm0, 3 + paddw mm1, [pw_2 GLOBAL] + movq mm3, mm2 + pshufw mm1, mm1, 0 + pshufw mm0, mm0, 0 ; dc0 (w) + paddw mm3, mm1 + psrlw mm3, 3 ; dc3 (w) + psrlw mm2, 2 ; dc2 (w) + psrlw mm1, 2 ; dc1 (w) + + packuswb mm0, mm1 ; dc0,dc1 (b) + packuswb mm2, mm3 ; dc2,dc3 (b) + + movq [parm1q + parm2q], mm0 ; 0 + movq [parm1q + 2*parm2q], mm0 ; 1 + movq [parm1q + rax ], mm0 ; 2 + movq [parm1q + 4*parm2q], mm0 ; 3 + lea parm1q, [parm1q + 4*parm2q] + movq [parm1q + parm2q], mm2 ; 4 + movq [parm1q + 2*parm2q], mm2 ; 5 + movq [parm1q + rax ], mm2 ; 6 + movq [parm1q + 4*parm2q], mm2 ; 7 + + ret + +;----------------------------------------------------------------------------- +; +; void predict_8x8c_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8c_p_core_mmx: + movd mm0, parm3d + movd mm2, parm4d + movd mm4, parm5d + pshufw mm0, mm0, 0 + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 + movq mm1, mm2 + pmullw mm2, [pw_3210 GLOBAL] + psllw mm1, 2 + paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} + paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} + pxor mm5, mm5 + + mov eax, 8 +ALIGN 4 +.loop: + movq mm6, mm0 + movq mm7, mm1 + psraw mm6, 5 + psraw mm7, 5 + pmaxsw mm6, mm5 + pmaxsw mm7, mm5 + packuswb mm6, mm7 + movq [parm1q], mm6 + + paddsw mm0, mm4 + paddsw mm1, mm4 + add parm1q, parm2q + dec eax + jg .loop + + nop + ret + +;----------------------------------------------------------------------------- +; +; void predict_16x16_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_16x16_p_core_mmx: + movd mm0, parm3d + movd mm2, parm4d + movd mm4, parm5d + pshufw mm0, mm0, 0 + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 + movq mm5, mm2 + movq mm1, mm2 + pmullw mm5, [pw_3210 GLOBAL] + 
psllw mm2, 3 + psllw mm1, 2 + movq mm3, mm2 + paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} + paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} + paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} + paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} + pxor mm5, mm5 + + mov eax, 16 +ALIGN 4 +.loop: + movq mm6, mm0 + movq mm7, mm1 + psraw mm6, 5 + psraw mm7, 5 + pmaxsw mm6, mm5 + pmaxsw mm7, mm5 + packuswb mm6, mm7 + movq [parm1q], mm6 + + movq mm6, mm2 + movq mm7, mm3 + psraw mm6, 5 + psraw mm7, 5 + pmaxsw mm6, mm5 + pmaxsw mm7, mm5 + packuswb mm6, mm7 + movq [parm1q+8], mm6 + + paddsw mm0, mm4 + paddsw mm1, mm4 + paddsw mm2, mm4 + paddsw mm3, mm4 + add parm1q, parm2q + dec eax + jg .loop + + nop + ret + ;----------------------------------------------------------------------------- ; ; void predict_16x16_v_mmx( uint8_t *src, int i_stride ) @@ -103,3 +369,48 @@ predict_16x16_v_mmx : SAVE_0_1 (parm1q + rax) ; 15 ret + +;----------------------------------------------------------------------------- +; +; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_stride, int i_dc_left ) +; +;----------------------------------------------------------------------------- + +%macro PRED16x16_DC 2 + sub parm1q, parm2q ; parm1q <-- line -1 + + pxor mm0, mm0 + pxor mm1, mm1 + psadbw mm0, [parm1q] + psadbw mm1, [parm1q + 8] + paddusw mm0, mm1 + paddusw mm0, %1 + psrlw mm0, %2 ; dc + pshufw mm0, mm0, 0 + lea r8, [parm2q + 2*parm2q] ; eax <-- 3* stride + packuswb mm0, mm0 ; dc in bytes + + mov eax, 4 +ALIGN 4 +.loop: + SAVE_0_0 (parm1q + parm2q) ; 0 + SAVE_0_0 (parm1q + 2 * parm2q) ; 1 + SAVE_0_0 (parm1q + r8 ) ; 2 + SAVE_0_0 (parm1q + 4 * parm2q) ; 3 + dec eax + lea parm1q, [parm1q + 4 * parm2q] + jg .loop + nop +%endmacro + +ALIGN 16 +predict_16x16_dc_core_mmxext: + movd mm2, parm3d + PRED16x16_DC mm2, 5 + ret + +ALIGN 16 +predict_16x16_dc_top_mmxext: + PRED16x16_DC [pw_8 GLOBAL], 4 + ret + diff --git a/common/amd64/predict.c b/common/amd64/predict.c deleted file mode 100644 index 5384134d..00000000 --- a/common/amd64/predict.c +++ /dev/null @@ -1,175 +0,0 @@ -/***************************************************************************** - * predict.c: h264 encoder - ***************************************************************************** - * Copyright (C) 2006 x264 project - * - * Authors: Loren Merritt - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110 USA - *****************************************************************************/ - -#ifdef HAVE_STDINT_H -#include -#else -#include -#endif - -#include "common/predict.h" -#include "common/i386/predict.h" - -extern void predict_16x16_v_mmx( uint8_t *src, int i_stride ); -extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride ); - -/**************************************************************************** - * 16x16 prediction for intra luma block - ****************************************************************************/ - -#define PREDICT_16x16_DC(v) \ - for( i = 0; i < 16; i++ )\ - {\ - uint64_t *p = (uint64_t*)src;\ - *p++ = v;\ - *p++ = v;\ - src += i_stride;\ - } - -static void predict_16x16_dc( uint8_t *src, int i_stride ) -{ - uint32_t s = 0; - uint64_t dc; - int i; - - /* calculate DC value */ - for( i = 0; i < 16; i++ ) - { - s += src[-1 + i * i_stride]; - s += src[i - i_stride]; - } - dc = (( s + 16 ) >> 5) * 0x0101010101010101ULL; - - PREDICT_16x16_DC(dc); -} -static void predict_16x16_dc_left( uint8_t *src, int i_stride ) -{ - uint32_t s = 0; - uint64_t dc; - int i; - - for( i = 0; i < 16; i++ ) - { - s += src[-1 + i * i_stride]; - } - dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL; - - PREDICT_16x16_DC(dc); -} -static void predict_16x16_h( uint8_t *src, int i_stride ) -{ - int i; - for( i = 0; i < 16; i++ ) - { - const uint64_t v = 0x0101010101010101ULL * src[-1]; - uint64_t *p = (uint64_t*)src; - *p++ = v; - *p++ = v; - src += i_stride; - } -} - - -/**************************************************************************** - * 8x8 prediction for intra chroma block - ****************************************************************************/ - -static void predict_8x8c_dc_left( uint8_t *src, int i_stride ) -{ - int y; - uint32_t s0 = 0, s1 = 0; - uint64_t dc0, dc1; - - for( y = 0; y < 4; y++ ) - { - s0 += src[y * i_stride - 1]; - s1 += src[(y+4) * i_stride - 1]; - } - dc0 = (( s0 + 2 ) >> 2)*0x0101010101010101ULL; - dc1 = (( s1 + 2 ) >> 2)*0x0101010101010101ULL; - - for( y = 0; y < 4; y++ ) - { - *(uint64_t*)src = dc0; - src += i_stride; - } - for( y = 0; y < 4; y++ ) - { - *(uint64_t*)src = dc1; - src += i_stride; - } - -} -static void predict_8x8c_dc_top( uint8_t *src, int i_stride ) -{ - int y, x; - uint32_t s0 = 0, s1 = 0; - uint64_t dc; - - for( x = 0; x < 4; x++ ) - { - s0 += src[x - i_stride]; - s1 += src[x + 4 - i_stride]; - } - dc = (( s0 + 2 ) >> 2)*0x01010101 - + (( s1 + 2 ) >> 2)*0x0101010100000000ULL; - - for( y = 0; y < 8; y++ ) - { - *(uint64_t*)src = dc; - src += i_stride; - } -} -static void predict_8x8c_h( uint8_t *src, int i_stride ) -{ - int i; - for( i = 0; i < 8; i++ ) - { - *(uint64_t*)src = 0x0101010101010101ULL * src[-1]; - src += i_stride; - } -} - - -/**************************************************************************** - * Exported functions: - ****************************************************************************/ -void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) -{ - pf[I_PRED_16x16_V ] = predict_16x16_v_mmx; - pf[I_PRED_16x16_H ] = predict_16x16_h; - pf[I_PRED_16x16_DC] = predict_16x16_dc; - pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left; -} - -void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] ) -{ - pf[I_PRED_CHROMA_V ] = predict_8x8c_v_mmx; - pf[I_PRED_CHROMA_H ] = predict_8x8c_h; - 
pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left; - pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top; -} - -void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] ) -{ -} - diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm index 53a16275..beee70bb 100644 --- a/common/i386/predict-a.asm +++ b/common/i386/predict-a.asm @@ -91,7 +91,7 @@ cglobal predict_16x16_dc_top_mmxext jge .have_topleft mov al, [edx] mov ah, [edx] - pinsrw mm1, ax, 0 + pinsrw mm1, eax, 0 mov eax, [picesp + 12] .have_topleft: @@ -99,7 +99,7 @@ cglobal predict_16x16_dc_top_mmxext jne .have_topright mov al, [edx+7] mov ah, [edx+7] - pinsrw mm2, ax, 3 + pinsrw mm2, eax, 3 .have_topright: PRED8x8_LOWPASS mm0, [edx] @@ -133,7 +133,7 @@ predict_8x8_v_mmxext: ;----------------------------------------------------------------------------- ; -; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, int i_dc_left ); +; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, uint8_t *pix_left ); ; ;----------------------------------------------------------------------------- @@ -264,13 +264,9 @@ predict_8x8c_p_core_mmx: mov edx, [picesp + 4] mov ecx, [picesp + 8] - - movd mm0, [picesp +12] - movd mm2, [picesp +16] - movd mm4, [picesp +20] - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 + pshufw mm0, [picesp +12], 0 + pshufw mm2, [picesp +16], 0 + pshufw mm4, [picesp +20], 0 movq mm1, mm2 pmullw mm2, [pw_3210 GLOBAL] psllw mm1, 2 @@ -314,13 +310,9 @@ predict_16x16_p_core_mmx: mov edx, [picesp + 4] mov ecx, [picesp + 8] - - movd mm0, [picesp +12] - movd mm2, [picesp +16] - movd mm4, [picesp +20] - pshufw mm0, mm0, 0 ; FIXME shuf these directly from memory - pshufw mm2, mm2, 0 ; if there is stack alignment? - pshufw mm4, mm4, 0 + pshufw mm0, [picesp +12], 0 + pshufw mm2, [picesp +16], 0 + pshufw mm4, [picesp +20], 0 movq mm5, mm2 movq mm1, mm2 pmullw mm5, [pw_3210 GLOBAL] diff --git a/common/i386/predict.c b/common/i386/predict.c index b6bc9c94..ed067734 100644 --- a/common/i386/predict.c +++ b/common/i386/predict.c @@ -21,14 +21,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
*****************************************************************************/ -#ifdef HAVE_STDINT_H -#include -#else -#include -#endif - -#include "common/clip1.h" #include "common/common.h" +#include "common/clip1.h" #include "predict.h" extern void predict_16x16_v_mmx( uint8_t *src, int i_stride ); @@ -132,27 +126,130 @@ static void predict_8x8_dc( uint8_t *src, int i_stride, int i_neighbor ) predict_8x8_dc_core_mmxext( src, i_stride, i_neighbor, l+1 ); } +#ifdef ARCH_X86_64 +static void predict_16x16_h( uint8_t *src, int i_stride ) +{ + int y; + for( y = 0; y < 16; y++ ) + { + const uint64_t v = 0x0101010101010101ULL * src[-1]; + uint64_t *p = (uint64_t*)src; + p[0] = p[1] = v; + src += i_stride; + } +} + +static void predict_8x8c_h( uint8_t *src, int i_stride ) +{ + int y; + for( y = 0; y < 8; y++ ) + { + *(uint64_t*)src = 0x0101010101010101ULL * src[-1]; + src += i_stride; + } +} + +static void predict_16x16_dc_left( uint8_t *src, int i_stride ) +{ + uint32_t s = 0; + uint64_t dc; + int y; + + for( y = 0; y < 16; y++ ) + { + s += src[-1 + y * i_stride]; + } + dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL; + + for( y = 0; y < 16; y++ ) + { + uint64_t *p = (uint64_t*)src; + p[0] = p[1] = dc; + src += i_stride; + } +} + +static void predict_8x8c_dc_left( uint8_t *src, int i_stride ) +{ + int y; + uint32_t s0 = 0, s1 = 0; + uint64_t dc0, dc1; + + for( y = 0; y < 4; y++ ) + { + s0 += src[y * i_stride - 1]; + s1 += src[(y+4) * i_stride - 1]; + } + dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL; + dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL; + + for( y = 0; y < 4; y++ ) + { + *(uint64_t*)src = dc0; + src += i_stride; + } + for( y = 0; y < 4; y++ ) + { + *(uint64_t*)src = dc1; + src += i_stride; + } + +} + +static void predict_8x8c_dc_top( uint8_t *src, int i_stride ) +{ + int y, x; + uint32_t s0 = 0, s1 = 0; + uint64_t dc; + + for( x = 0; x < 4; x++ ) + { + s0 += src[x - i_stride]; + s1 += src[x + 4 - i_stride]; + } + dc = (( s0 + 2 ) >> 2) * 0x01010101 + + (( s1 + 2 ) >> 2) * 0x0101010100000000ULL; + + for( y = 0; y < 8; y++ ) + { + *(uint64_t*)src = dc; + src += i_stride; + } +} +#endif + /**************************************************************************** * Exported functions: ****************************************************************************/ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) { - pf[I_PRED_16x16_V] = predict_16x16_v_mmx; - pf[I_PRED_16x16_DC] = predict_16x16_dc; - pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext; - pf[I_PRED_16x16_P] = predict_16x16_p; + pf[I_PRED_16x16_V] = predict_16x16_v_mmx; + pf[I_PRED_16x16_DC] = predict_16x16_dc; + pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext; + pf[I_PRED_16x16_P] = predict_16x16_p; + +#ifdef ARCH_X86_64 + pf[I_PRED_16x16_H] = predict_16x16_h; + pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left; +#endif } void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] ) { - pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx; - pf[I_PRED_CHROMA_P] = predict_8x8c_p; - pf[I_PRED_CHROMA_DC] = predict_8x8c_dc; + pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx; + pf[I_PRED_CHROMA_P] = predict_8x8c_p; + pf[I_PRED_CHROMA_DC] = predict_8x8c_dc; + +#ifdef ARCH_X86_64 + pf[I_PRED_CHROMA_H] = predict_8x8c_h; + pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left; + pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top; +#endif } void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] ) { - pf[I_PRED_8x8_V] = predict_8x8_v_mmxext; + pf[I_PRED_8x8_V] = predict_8x8_v_mmxext; pf[I_PRED_8x8_DC] = predict_8x8_dc; }
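
The PRED8x8_LOWPASS macro added above is the usual H.264 3-tap smoothing filter; its own comment gives the formula (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2. A rough C sketch of the same computation, for readers not fluent in MMX (the function name and the three-pointer signature are illustrative only, not part of the commit; the asm reads the three taps from src-1, src and src+1):

#include <stdint.h>

/* Plain C equivalent of PRED8x8_LOWPASS: each output byte is the 3-tap
 * filtered value (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2.  The MMX code gets
 * the same result without widening to 16 bits: pavgb rounds the average
 * of the two outer taps up, ((l ^ r) & 1) subtracts that rounding bit
 * back out, and a second pavgb against the centre tap supplies the
 * remaining rounding and shift. */
static void pred8x8_lowpass_ref( uint8_t *dst, const uint8_t *left,
                                 const uint8_t *center, const uint8_t *right )
{
    int i;
    for( i = 0; i < 8; i++ )
        dst[i] = ( left[i] + 2*center[i] + right[i] + 2 ) >> 2;
}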
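
The new *_dc_core_mmxext functions only finish a DC computation that the C wrapper starts. A sketch of the 16x16 case, assuming the wrapper passes the sum of the 16 left neighbours plus the rounding constant 16 in i_dc_left (that wrapper is not part of this hunk, so the assumption is based on the paddusw/psrlw 5 sequence in the asm):

#include <stdint.h>

/* Rough C equivalent of predict_16x16_dc_core_mmxext: add the 16 top
 * neighbours (psadbw in the MMX code) to the precomputed left sum, divide
 * by 32 and splat the resulting DC byte over the whole 16x16 block. */
static void predict_16x16_dc_core_ref( uint8_t *src, int i_stride, int i_dc_left )
{
    uint32_t s = i_dc_left;
    uint64_t dc;
    int x, y;

    for( x = 0; x < 16; x++ )
        s += src[x - i_stride];
    dc = (uint64_t)( s >> 5 ) * 0x0101010101010101ULL;

    for( y = 0; y < 16; y++ )
    {
        uint64_t *p = (uint64_t*)src;
        p[0] = p[1] = dc;
        src += i_stride;
    }
}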
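
Likewise, predict_16x16_p_core_mmx and predict_8x8c_p_core_mmx evaluate the plane predictor after the caller has reduced it to three integers. A C sketch of the 16x16 core, glossing over the saturating-arithmetic corner cases of the MMX version, and assuming i00 is roughly 32 times the predicted top-left sample (rounding term folded in), b the horizontal gradient and c the vertical gradient:

#include <stdint.h>

static inline uint8_t clip_uint8( int v )
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Rough C equivalent of predict_16x16_p_core_mmx: pred[y][x] is
 * (i00 + x*b + y*c) >> 5, clipped to [0,255].  The MMX code keeps four
 * word vectors of running sums (x, x+4, x+8, x+12) and adds c once per
 * row instead of recomputing the plane from scratch. */
static void predict_16x16_p_core_ref( uint8_t *src, int i_stride,
                                      int i00, int b, int c )
{
    int x, y;
    for( y = 0; y < 16; y++ )
    {
        int v = i00;
        for( x = 0; x < 16; x++ )
        {
            src[x] = clip_uint8( v >> 5 );
            v += b;
        }
        src += i_stride;
        i00 += c;
    }
}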