From 413d8fa90917044e0ffaffb7009ccbc8059c61b0 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Tue, 19 Apr 2005 18:35:45 +0000 Subject: [PATCH] amd64 asm patch, part1. git-svn-id: svn://svn.videolan.org/x264/trunk@212 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/cpu-a.asm | 117 ++++ common/amd64/dct-a.asm | 313 ++++++++++ common/amd64/dct-c.c | 299 ++++++++++ common/amd64/dct.h | 38 ++ common/amd64/mc-a.asm | 489 +++++++++++++++ common/amd64/mc-a2.asm | 402 +++++++++++++ common/amd64/mc-c.c | 1161 ++++++++++++++++++++++++++++++++++++ common/amd64/mc.h | 34 ++ common/amd64/pixel-a.asm | 811 +++++++++++++++++++++++++ common/amd64/pixel.h | 51 ++ common/amd64/predict-a.asm | 141 +++++ common/amd64/predict.c | 444 ++++++++++++++ common/amd64/predict.h | 31 + 13 files changed, 4331 insertions(+) create mode 100644 common/amd64/cpu-a.asm create mode 100644 common/amd64/dct-a.asm create mode 100644 common/amd64/dct-c.c create mode 100644 common/amd64/dct.h create mode 100644 common/amd64/mc-a.asm create mode 100644 common/amd64/mc-a2.asm create mode 100644 common/amd64/mc-c.c create mode 100644 common/amd64/mc.h create mode 100644 common/amd64/pixel-a.asm create mode 100644 common/amd64/pixel.h create mode 100644 common/amd64/predict-a.asm create mode 100644 common/amd64/predict.c create mode 100644 common/amd64/predict.h diff --git a/common/amd64/cpu-a.asm b/common/amd64/cpu-a.asm new file mode 100644 index 00000000..729ece64 --- /dev/null +++ b/common/amd64/cpu-a.asm @@ -0,0 +1,117 @@ +;***************************************************************************** +;* cpu.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Laurent Aimar +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_cpu_cpuid_test +cglobal x264_cpu_cpuid +cglobal x264_emms + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported +;----------------------------------------------------------------------------- +x264_cpu_cpuid_test: + pushfd + push ebx + push ebp + push esi + push edi + + pushfd + pop eax + mov ebx, eax + xor eax, 0x200000 + push eax + popfd + pushfd + pop eax + xor eax, ebx + + pop edi + pop esi + pop ebp + pop ebx + popfd + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +;----------------------------------------------------------------------------- +x264_cpu_cpuid: + + push ebp + mov ebp, esp + push ebx + push esi + push edi + + mov eax, [ebp + 8] + cpuid + + mov esi, [ebp + 12] + mov [esi], eax + + mov esi, [ebp + 16] + mov [esi], ebx + + mov esi, [ebp + 20] + mov [esi], ecx + + mov esi, [ebp + 24] + mov [esi], edx + + pop edi + pop esi + pop ebx + pop ebp + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_emms( void ) +;----------------------------------------------------------------------------- +x264_emms: + emms + ret + diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm new file mode 100644 index 00000000..92dbc5ae --- /dev/null +++ b/common/amd64/dct-a.asm @@ -0,0 +1,313 @@ +;***************************************************************************** +;* dct.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Min Chen (converted to nasm) +;* Laurent Aimar (initial version) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2004.04.28 portab all 4x4 function to nasm (CM) * +;* * +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +%macro MMX_ZERO 1 + pxor %1, %1 +%endmacro + +%macro MMX_LOAD_DIFF_4P 5 + movd %1, %4 + punpcklbw %1, %3 + movd %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro MMX_SUMSUB_BA 2 + paddw %1, %2 + paddw %2, %2 + psubw %2, %1 +%endmacro + +%macro MMX_SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro MMX_SUMSUB2_AB 3 + movq %3, %1 + paddw %1, %1 + paddw %1, %2 + psubw %3, %2 + psubw %3, %2 +%endmacro + +%macro MMX_SUMSUBD2_AB 4 + movq %4, %1 + movq %3, %2 + psraw %2, $1 + psraw %4, $1 + paddw %1, %2 + psubw %4, %3 +%endmacro + +%macro SBUTTERFLYwd 3 + movq %3, %1 + punpcklwd %1, %2 + punpckhwd %3, %2 +%endmacro + +%macro SBUTTERFLYdq 3 + movq %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +;----------------------------------------------------------------------------- +; input ABCD output ADTC +;----------------------------------------------------------------------------- +%macro MMX_TRANSPOSE 5 + SBUTTERFLYwd %1, %2, %5 + SBUTTERFLYwd %3, %4, %2 + SBUTTERFLYdq %1, %3, %4 + SBUTTERFLYdq %5, %2, %3 +%endmacro + +%macro MMX_STORE_DIFF_4P 5 + paddw %1, %3 + psraw %1, $6 + movd %2, %5 + punpcklbw %2, %4 + paddsw %1, %2 + packuswb %1, %1 + movd %5, %1 +%endmacro + +;%macro +;%endmacro + +;============================================================================= +; Local Data (Read Only) +;============================================================================= + +%ifdef FORMAT_COFF +SECTION .rodata data +%else +SECTION .rodata data align=16 +%endif + +;----------------------------------------------------------------------------- +; Various memory constants (trigonometric values or rounding values) +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_mmx_1: + dw 1, 1, 1, 1 + +x264_mmx_32: + dw 32, 32, 32, 32 + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_dct4x4dc_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl dct4x4dc( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +x264_dct4x4dc_mmxext: + mov eax, [esp+ 4] + movq mm0, [eax+ 0] + movq mm1, [eax+ 8] + movq mm2, [eax+16] + movq mm3, [eax+24] + + MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + MMX_TRANSPOSE mm0, mm2, mm3, mm4, 
mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 + + movq mm6, [x264_mmx_1] + paddw mm0, mm6 + paddw mm4, mm6 + psraw mm0, 1 + movq [eax+ 0], mm0 + psraw mm4, 1 + movq [eax+ 8], mm4 + paddw mm1, mm6 + paddw mm3, mm6 + psraw mm1, 1 + movq [eax+16], mm1 + psraw mm3, 1 + movq [eax+24], mm3 + ret + +cglobal x264_idct4x4dc_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +x264_idct4x4dc_mmxext: + mov eax, [esp+ 4] + movq mm0, [eax+ 0] + movq mm1, [eax+ 8] + movq mm2, [eax+16] + movq mm3, [eax+24] + + MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 + + movq [eax+ 0], mm0 + movq [eax+ 8], mm4 + movq [eax+16], mm1 + movq [eax+24], mm3 + ret + +cglobal x264_sub4x4_dct_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +;----------------------------------------------------------------------------- +x264_sub4x4_dct_mmxext: + push ebx + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; i_pix1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; i_pix2 + + MMX_ZERO mm7 + + ; Load 4 lines + MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax ], [ecx] + MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx ], [ecx+edx] + MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2] + add eax, ebx + add ecx, edx + MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2] + + MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 + + MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 + MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 + + ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 + MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1 + + MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 + + MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 + MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 + + ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 + MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4 + + mov eax, [esp+ 8] ; dct + movq [eax+ 0], mm1 + movq [eax+ 8], mm0 + movq [eax+16], mm4 + movq [eax+24], mm3 + + pop ebx + ret + +cglobal x264_add4x4_idct_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +;----------------------------------------------------------------------------- +x264_add4x4_idct_mmxext: + + ; Load dct coeffs + mov eax, [esp+12] ; dct + movq mm0, [eax+ 0] + movq mm4, [eax+ 8] + movq mm3, [eax+16] + movq mm1, [eax+24] + + mov eax, [esp+ 4] ; p_dst + mov ecx, [esp+ 8] ; i_dst + lea edx, [ecx+ecx*2] + + ; out:mm0, mm1, mm2, mm3 + MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2 + + MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 + MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 
mm4=d02+d13 mm0=d02-d13 + + ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 + MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3 + + MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 + MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 + + MMX_ZERO mm7 + movq mm6, [x264_mmx_32] + + MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax] + MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx] + MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2] + MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx] + + ret + diff --git a/common/amd64/dct-c.c b/common/amd64/dct-c.c new file mode 100644 index 00000000..45279035 --- /dev/null +++ b/common/amd64/dct-c.c @@ -0,0 +1,299 @@ +/***************************************************************************** + * dct.c: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include + +#include "x264.h" + +#include "common/dct.h" +#include "dct.h" + + +#if 0 +#define MMX_ZERO( MMZ ) \ + asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: ) + +/* MMP : diff, MMT: temp */ +#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \ + asm volatile( "movd (%0), " #MMP "\n" \ + "punpcklbw " #MMZ ", " #MMP "\n" \ + "movd (%1), " #MMT "\n" \ + "punpcklbw " #MMZ ", " #MMT "\n" \ + "psubw " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) ) + +/* in: out: mma=mma+mmb, mmb=mmb-mma */ +#define MMX_SUMSUB_BA( MMA, MMB ) \ + asm volatile( "paddw " #MMB ", " #MMA "\n"\ + "paddw " #MMB ", " #MMB "\n"\ + "psubw " #MMA ", " #MMB "\n" :: ) + +#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \ + asm volatile( "paddw " #MMB ", " #MMA "\n"\ + "paddw " #MMD ", " #MMC "\n"\ + "paddw " #MMB ", " #MMB "\n"\ + "paddw " #MMD ", " #MMD "\n"\ + "psubw " #MMA ", " #MMB "\n"\ + "psubw " #MMC ", " #MMD "\n" :: ) + +/* inputs MMA, MMB output MMA MMT */ +#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \ + asm volatile( "movq " #MMA ", " #MMT "\n" \ + "paddw " #MMA ", " #MMA "\n" \ + "paddw " #MMB ", " #MMA "\n" \ + "psubw " #MMB ", " #MMT "\n" \ + "psubw " #MMB ", " #MMT "\n" :: ) + +/* inputs MMA, MMB output MMA MMS */ +#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \ + asm volatile( "movq " #MMA ", " #MMS "\n" \ + "movq " #MMB ", " #MMT "\n" \ + "psraw $1 , " #MMB "\n" \ + "psraw $1 , " #MMS "\n" \ + "paddw " #MMB ", " #MMA "\n" \ + "psubw " #MMT ", " #MMS "\n" :: ) + +#define SBUTTERFLYwd(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpcklwd " #b ", " #a " \n\t" \ + "punpckhwd " #b ", " #t " \n\t" :: ) + +#define SBUTTERFLYdq(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpckldq " #b ", " #a " \n\t" \ + "punpckhdq " #b ", " #t " \n\t" :: ) + +/* input ABCD output ADTC */ +#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \ + SBUTTERFLYwd( MMA, MMB, MMT ); \ + SBUTTERFLYwd( MMC, MMD, MMB ); \ + SBUTTERFLYdq( MMA, MMC, MMD ); \ + SBUTTERFLYdq( MMT, MMB, MMC ) + +#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \ + asm volatile( "paddw " #MM32 "," #MMP "\n" \ + "psraw $6, " #MMP "\n" \ + "movd (%0), " #MMT "\n" \ + "punpcklbw " #MMZ ", " #MMT "\n" \ + "paddsw " #MMT ", " #MMP "\n" \ + "packuswb " #MMZ ", " #MMP "\n" \ + "movd " #MMP ", (%0)\n" :: "r"(dst) ) + +#define UNUSED_LONGLONG( foo ) \ + static const unsigned long long foo __asm__ (#foo) __attribute__((unused)) __attribute__((aligned(16))) + +UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL; +UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL; + + +/* + * XXX For all dct dc : input could be equal to output so ... 
+ */ +void x264_dct4x4dc_mmxext( int16_t d[4][4] ) +{ + /* load DCT */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(d) ); + + MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */ + MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */ + + /* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */ + MMX_TRANSPOSE ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ); + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */ + MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */ + + /* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */ + MMX_TRANSPOSE ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 ); + + + asm volatile( "movq x264_mmx_1, %%mm6" :: ); + + /* Store back */ + asm volatile( + "paddw %%mm6, %%mm0\n" + "paddw %%mm6, %%mm4\n" + + "psraw $1, %%mm0\n" + "movq %%mm0, (%0)\n" + "psraw $1, %%mm4\n" + "movq %%mm4, 8(%0)\n" + + "paddw %%mm6, %%mm1\n" + "paddw %%mm6, %%mm3\n" + + "psraw $1, %%mm1\n" + "movq %%mm1, 16(%0)\n" + "psraw $1, %%mm3\n" + "movq %%mm3, 24(%0)\n" :: "r"(d) ); +} + +void x264_idct4x4dc_mmxext( int16_t d[4][4] ) +{ + /* load DCT */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(d) ); + + MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */ + MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */ + + /* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */ + MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ); + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */ + MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */ + + /* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */ + MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 ); + + /* Store back */ + asm volatile( + "movq %%mm0, (%0)\n" + "movq %%mm4, 8(%0)\n" + "movq %%mm1, 16(%0)\n" + "movq %%mm3, 24(%0)\n" :: "r"(d) ); +} + +/**************************************************************************** + * subXxX_dct: + ****************************************************************************/ +inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + /* Reset mm7 */ + MMX_ZERO( %%mm7 ); + + /* Load 4 lines */ + MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] ); + + MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 ); /* mm3=s03 mm0=d03 mm2=s12 mm1=d12 */ + + MMX_SUMSUB_BA( %%mm2, %%mm3 ); /* mm2=s03+s12 mm3=s03-s12 */ + MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 ); /* mm0=2.d03+d12 mm4=d03-2.d12 */ + + /* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */ + MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 ); + + MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 ); /* mm3=s03 mm2=d03 mm1=s12 mm4=d12 */ + + MMX_SUMSUB_BA( %%mm1, %%mm3 ); /* mm1=s03+s12 mm3=s03-s12 */ + MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 ); /* mm2=2.d03+d12 mm0=d03-2.d12 */ + + /* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */ + MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 ); + + /* Store back */ + asm volatile( + 
"movq %%mm1, (%0)\n" + "movq %%mm0, 8(%0)\n" + "movq %%mm4, 16(%0)\n" + "movq %%mm3, 24(%0)\n" :: "r"(dct) ); +} +#endif + +void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + x264_sub4x4_dct_mmxext( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 ); +} + +void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + x264_sub8x8_dct_mmxext( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ); +} + + + +/**************************************************************************** + * addXxX_idct: + ****************************************************************************/ +#if 0 +inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +{ + /* Load dct coeffs */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(dct) ); + + MMX_SUMSUB_BA ( %%mm2, %%mm0 ); /* mm2=s02 mm0=d02 */ + MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 ); /* mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */ + + MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 ); /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ + + /* in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 */ + MMX_TRANSPOSE ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 ); + + MMX_SUMSUB_BA ( %%mm3, %%mm1 ); /* mm3=s02 mm1=d02 */ + MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 ); /* mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */ + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 ); /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ + + /* in: mm2, mm4, mm1, mm3 out: mm2, mm3, mm0, mm1 */ + MMX_TRANSPOSE ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 ); + + MMX_ZERO( %%mm7 ); + asm volatile( "movq x264_mmx_32, %%mm6\n" :: ); + + MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] ); + MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] ); + MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] ); + MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] ); +} +#endif + +void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ) +{ + x264_add4x4_idct_mmxext( p_dst, i_dst, dct[0] ); + x264_add4x4_idct_mmxext( &p_dst[4], i_dst, dct[1] ); + x264_add4x4_idct_mmxext( &p_dst[4*i_dst+0], i_dst, dct[2] ); + x264_add4x4_idct_mmxext( &p_dst[4*i_dst+4], i_dst, dct[3] ); +} + +void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ) +{ + x264_add8x8_idct_mmxext( &p_dst[0], i_dst, &dct[0] ); + x264_add8x8_idct_mmxext( &p_dst[8], i_dst, &dct[4] ); + x264_add8x8_idct_mmxext( &p_dst[8*i_dst], i_dst, &dct[8] ); + x264_add8x8_idct_mmxext( &p_dst[8*i_dst+8], i_dst, &dct[12] ); +} diff --git a/common/amd64/dct.h b/common/amd64/dct.h new file mode 100644 index 00000000..23601e5e --- /dev/null +++ b/common/amd64/dct.h @@ -0,0 +1,38 @@ +/***************************************************************************** + * dct.h: h264 encoder library + ***************************************************************************** + * 
Copyright (C) 2003 Laurent Aimar + * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef _I386_DCT_H +#define _I386_DCT_H 1 + +void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); + +void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ); +void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ); +void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ); + +void x264_dct4x4dc_mmxext( int16_t d[4][4] ); +void x264_idct4x4dc_mmxext( int16_t d[4][4] ); + +#endif diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm new file mode 100644 index 00000000..ebc68d85 --- /dev/null +++ b/common/amd64/mc-a.asm @@ -0,0 +1,489 @@ +;***************************************************************************** +;* mc.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $ +;* +;* Authors: Min Chen (converted to nasm) +;* Laurent Aimar (init algorithm) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2004.05.17 portab mc_copy_w4/8/16 (CM) * +;* * +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Local Data (Read Only) +;============================================================================= + +%ifdef FORMAT_COFF +SECTION .rodata data +%else +SECTION .rodata data align=16 +%endif + +;----------------------------------------------------------------------------- +; Various memory constants (trigonometric values or rounding values) +;----------------------------------------------------------------------------- + +ALIGN 16 + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_pixel_avg_w4_mmxext +cglobal x264_pixel_avg_w8_mmxext +cglobal x264_pixel_avg_w16_mmxext +cglobal x264_pixel_avg_w16_sse2 + +cglobal x264_mc_copy_w4_mmxext +cglobal x264_mc_copy_w8_mmxext +cglobal x264_mc_copy_w16_mmxext +cglobal x264_mc_copy_w16_sse2 + +cglobal x264_mc_chroma_sse + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w4_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movd mm0, [ebx] + pavgb mm0, [ecx] + movd mm1, [ebx+eax] + pavgb mm1, [ecx+edx] + movd [edi], mm0 + movd [edi+esi], mm1 + dec ebp + dec ebp + lea ebx, [ebx+eax*2] + lea ecx, [ecx+edx*2] + lea edi, [edi+esi*2] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w8_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movq mm0, [ebx] + pavgb mm0, [ecx] + movq [edi], mm0 + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void 
x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w16_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movq mm0, [ebx ] + movq mm1, [ebx+8] + pavgb mm0, [ecx ] + pavgb mm1, [ecx+8] + movq [edi ], mm0 + movq [edi+8], mm1 + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w16_sse2: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movdqu xmm0, [ebx] + pavgb xmm0, [ecx] + movdqu [edi], xmm0 + + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w4_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height +ALIGN 4 +.height_loop + mov eax, [esi] + mov [edi], eax + mov eax, [esi+ebx] + mov [edi+edx], eax + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + dec ecx + dec ecx + jne .height_loop + + pop edi + pop esi + pop ebx + ret + +cglobal mc_copy_w8 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w8_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height +ALIGN 4 +.height_loop + movq mm0, [esi] + movq [edi], mm0 + movq mm1, [esi+ebx] + movq [edi+edx], mm1 + movq mm2, [esi+ebx*2] + movq [edi+edx*2], mm2 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + movq mm3, [esi+ebx] + movq [edi+edx], mm3 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + + sub ecx, byte 4 + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + +cglobal mc_copy_w16 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) 
+;----------------------------------------------------------------------------- +x264_mc_copy_w16_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height + +ALIGN 4 +.height_loop + movq mm0, [esi] + movq mm1, [esi+8] + movq [edi], mm0 + movq [edi+8], mm1 + movq mm2, [esi+ebx] + movq mm3, [esi+ebx+8] + movq [edi+edx], mm2 + movq [edi+edx+8], mm3 + movq mm4, [esi+ebx*2] + movq mm5, [esi+ebx*2+8] + movq [edi+edx*2], mm4 + movq [edi+edx*2+8], mm5 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + movq mm6, [esi+ebx] + movq mm7, [esi+ebx+8] + movq [edi+edx], mm6 + movq [edi+edx+8], mm7 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + sub ecx, byte 4 + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w16_sse2: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height + +ALIGN 4 +.height_loop + movdqu xmm0, [esi] + movdqu xmm1, [esi+ebx] + movdqu [edi], xmm0 + movdqu [edi+edx], xmm1 + dec ecx + dec ecx + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + + +SECTION .rodata + +ALIGN 16 +eights times 4 dw 8 +thirty2s times 4 dw 32 + +SECTION .text + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_chroma_sse( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, +; int dx, int dy, +; int i_height, int i_width ) +;----------------------------------------------------------------------------- + +x264_mc_chroma_sse: + + pxor mm3, mm3 + + pshufw mm5, [esp+20], 0 ; mm5 - dx + pshufw mm6, [esp+24], 0 ; mm6 - dy + + movq mm4, [eights] + movq mm0, mm4 + + psubw mm4, mm5 ; mm4 - 8-dx + psubw mm0, mm6 ; mm0 - 8-dy + + movq mm7, mm5 + pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB + pmullw mm7, mm6 ; mm7 = dx*dy = cD + pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC + pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA + + push edi + + mov eax, [esp+4+4] ; src + mov edi, [esp+4+12] ; dst + mov ecx, [esp+4+8] ; i_src_stride + mov edx, [esp+4+28] ; i_height + +ALIGN 4 +.height_loop + + movd mm1, [eax+ecx] + movd mm0, [eax] + punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 + punpcklbw mm0, mm3 + pmullw mm1, mm6 ; 2nd line * cC + pmullw mm0, mm4 ; 1st line * cA + + paddw mm0, mm1 ; mm0 <- result + + movd mm2, [eax+1] + movd mm1, [eax+ecx+1] + punpcklbw mm2, mm3 + punpcklbw mm1, mm3 + + paddw mm0, [thirty2s] + + pmullw mm2, mm5 ; line * cB + pmullw mm1, mm7 ; line * cD + paddw mm0, mm2 + paddw mm0, mm1 + + psrlw mm0, 6 + packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4 + movd [edi], mm0 + + add eax, ecx + add edi, [esp+4+16] + + dec edx + jnz .height_loop + + mov eax, [esp+4+32] + sub eax, 8 + jnz .finish ; width != 8 so assume 4 + + mov [esp+4+32], eax + mov edi, [esp+4+12] ; dst + mov eax, [esp+4+4] ; src + mov edx, [esp+4+28] ; i_height + add edi, 4 + add eax, 4 + jmp .height_loop + +.finish + pop edi + ret diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm new file mode 100644 index 00000000..aaab2c1b --- /dev/null +++ 
b/common/amd64/mc-a2.asm @@ -0,0 +1,402 @@ +;***************************************************************************** +;* mc-a2.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Read only data +;============================================================================= + +SECTION .rodata data align=16 + +ALIGN 16 +mmx_dw_one: + times 4 dw 16 +mmx_dd_one: + times 2 dd 512 +mmx_dw_20: + times 4 dw 20 +mmx_dw_5: + times 4 dw -5 + +SECTION .data + +width: + dd 0 +height: + dd 0 +dstp1: + dd 0 +dstp2: + dd 0 +buffer: + dd 0 +dst1: + dd 0 +dst2: + dd 0 +src: + dd 0 + + +;============================================================================= +; Macros +;============================================================================= + +%macro LOAD_4 9 + movd %1, %5 + movd %2, %6 + movd %3, %7 + movd %4, %8 + punpcklbw %1, %9 + punpcklbw %2, %9 + punpcklbw %3, %9 + punpcklbw %4, %9 +%endmacro + +%macro FILT_2 2 + psubw %1, %2 + psllw %2, 2 + psubw %1, %2 +%endmacro + +%macro FILT_4 3 + paddw %2, %3 + psllw %2, 2 + paddw %1, %2 + psllw %2, 2 + paddw %1, %2 +%endmacro + +%macro FILT_6 4 + psubw %1, %2 + psllw %2, 2 + psubw %1, %2 + paddw %1, %3 + paddw %1, %4 + psraw %1, 5 +%endmacro + +%macro FILT_ALL 1 + LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0 + FILT_2 mm1, mm2 + movd mm5, [%1 + 4 * ecx] + movd mm6, [%1 + edx] + FILT_4 mm1, mm3, mm4 + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + psubw mm1, mm5 + psllw mm5, 2 + psubw mm1, mm5 + paddw mm1, mm6 +%endmacro + + + + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_vertical_filter_mmxext +cglobal x264_horizontal_filter_mmxext +cglobal x264_center_filter_mmxext + +;----------------------------------------------------------------------------- +; +; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride, +; uint8_t *dst2, int i_dst2_stride, +; uint8_t *src, int i_src_stride, +; int i_width, int i_height ); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_center_filter_mmxext : + + push edi + push esi + push ebx + push ebp + 
+ mov esi, [esp + 36] ; src + + mov edx, [esp + 20] ; dst1 + mov [dst1], edx + + mov edi, [esp + 28] ; dst2 + mov [dst2], edi + + mov eax, [esp + 44] ; width + mov [width], eax + + mov eax, [esp + 48] ; height + mov [height], eax + + mov eax, [esp + 24] ; dst1_stride + mov [dstp1], eax + + mov eax, [esp + 32] ; dst2_stride + mov [dstp2], eax + + mov ecx, [esp + 40] ; src_stride + + sub esp, ecx + sub esp, ecx ; esp is now at the beginning of the buffer + mov [buffer], esp + + ;sub esi, 2 + sub esi, ecx + sub esi, ecx ; esi - 2 - 2 * stride + mov [src], esi + + ;sub edi, 2 + + mov ebx, ecx + shl ebx, 1 + add ebx, ecx ; 3 * src_stride + + mov edx, ecx + shl edx, 1 + add edx, ebx ; 5 * src_stride + + pxor mm0, mm0 ; 0 ---> mm0 + movq mm7, [mmx_dd_one] ; for rounding + + mov ebp, [height] + +loopcy: + + dec ebp + mov eax, [width] + mov edi, [dst1] + mov esp, [buffer] + mov esi, [src] + + FILT_ALL esi + + pshufw mm2, mm1, 0 + movq [esp], mm2 + add esp, 8 + movq [esp], mm1 + add esp, 8 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + + packuswb mm1, mm1 + movd [edi], mm1 + + sub eax, 8 + add edi, 4 + add esi, 4 + +loopcx1: + + sub eax, 4 + + FILT_ALL esi + + movq [esp], mm1 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + packuswb mm1, mm1 + movd [edi], mm1 + + add esp, 8 + add esi, 4 + add edi, 4 + test eax, eax + jnz loopcx1 + + FILT_ALL esi + + pshufw mm2, mm1, 7 + movq [esp], mm1 + add esp, 8 + movq [esp], mm2 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + packuswb mm1, mm1 + movd [edi], mm1 + + mov esi, [src] + add esi, ecx + mov [src], esi + + mov edi, [dst1] + add edi, [dstp1] + mov [dst1], edi + + mov eax, [width] + mov edi, [dst2] + mov esp, [buffer] + add esp, 4 + +loopcx2: + + sub eax, 4 + + movq mm2, [esp + 2 * eax + 2] + movq mm3, [esp + 2 * eax + 4] + movq mm4, [esp + 2 * eax + 6] + movq mm5, [esp + 2 * eax + 8] + movq mm1, [esp + 2 * eax] + movq mm6, [esp + 2 * eax + 10] + paddw mm2, mm5 + paddw mm3, mm4 + paddw mm1, mm6 + + movq mm5, [mmx_dw_20] + movq mm4, [mmx_dw_5] + movq mm6, mm1 + pxor mm7, mm7 + + punpckhwd mm5, mm2 + punpcklwd mm4, mm3 + punpcklwd mm2, [mmx_dw_20] + punpckhwd mm3, [mmx_dw_5] + + pcmpgtw mm7, mm1 + + pmaddwd mm2, mm4 + pmaddwd mm3, mm5 + + punpcklwd mm1, mm7 + punpckhwd mm6, mm7 + + paddd mm2, mm1 + paddd mm3, mm6 + + paddd mm2, [mmx_dd_one] + paddd mm3, [mmx_dd_one] + + psrad mm2, 10 + psrad mm3, 10 + + packssdw mm2, mm3 + packuswb mm2, mm0 + + movd [edi + eax], mm2 + + test eax, eax + jnz loopcx2 + + add edi, [dstp2] + mov [dst2], edi + + test ebp, ebp + jnz loopcy + + mov esp, [buffer] + shl ecx, 1 + add esp, ecx + + pop ebp + pop ebx + pop esi + pop edi + + ret + +;----------------------------------------------------------------------------- +; +; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, +; int i_width, int i_height ); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_horizontal_filter_mmxext : + push edi + push esi + + mov edi, [esp + 12] ; dst + mov esi, [esp + 20] ; src + + pxor mm0, mm0 + movq mm7, [mmx_dw_one] + + mov ecx, [esp + 32] ; height + + sub esi, 2 + +loophy: + + dec ecx + mov eax, [esp + 28] ; width + +loophx: + + sub eax, 8 + + LOAD_4 mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0 + FILT_2 mm1, mm2 + movd mm5, [esi + eax + 4] + movd mm6, [esi + eax + 5] + FILT_4 mm1, mm3, mm4 + movd mm2, [esi + eax + 4] + movd mm3, [esi + eax + 6] + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + FILT_6 mm1, mm5, mm6, mm7 + 
movd mm4, [esi + eax + 7] + movd mm5, [esi + eax + 8] + punpcklbw mm2, mm0 + punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready + FILT_2 mm2, mm6 + movd mm6, [esi + eax + 9] + punpcklbw mm4, mm0 + punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready + FILT_4 mm2, mm3, mm4 + punpcklbw mm6, mm0 + FILT_6 mm2, mm5, mm6, mm7 + + packuswb mm1, mm2 + movq [edi + eax], mm1 + + test eax, eax + jnz loophx + + add esi, [esp + 24] ; src_pitch + add edi, [esp + 16] ; dst_pitch + + test ecx, ecx + jnz loophy + + pop esi + pop edi + + ret diff --git a/common/amd64/mc-c.c b/common/amd64/mc-c.c new file mode 100644 index 00000000..b5b3c3cf --- /dev/null +++ b/common/amd64/mc-c.c @@ -0,0 +1,1161 @@ +/***************************************************************************** + * mc.c: h264 encoder library (Motion Compensation) + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include +#include +#include + +#include "x264.h" /* DECLARE_ALIGNED */ +#include "common/mc.h" +#include "common/clip1.h" +#include "mc.h" + +#if 0 + +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) +#define USED_UINT64(foo) \ + static const uint64_t foo __asm__ (#foo) __attribute__((used)) +#else +#define USED_UINT64(foo) \ + static const uint64_t foo __asm__ (#foo) __attribute__((unused)) +#endif + +USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL; + + +#define MMX_ZERO( MMZ ) \ + asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: ) + +#define MMX_INIT( MMV, NAME ) \ + asm volatile( "movq " #NAME ", " #MMV "\n" :: ) + +#define MMX_SAVE_4P( MMP, MMZ, dst ) \ + asm volatile( "packuswb " #MMZ "," #MMP "\n" \ + "movd " #MMP ", (%0)" :: "r"(dst) ) + +#define MMX_LOAD_4P( MMP, MMZ, pix ) \ + asm volatile( "movd (%0), " #MMP "\n" \ + "punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) ) + +#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\ + MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \ + MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \ + MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] ) + +#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\ + MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ) + +#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \ + asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \ + "movq " #MMP1 ", (%0)\n" :: "r"(dst) ) + + +#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \ + asm volatile( "movq (%0) , " #MMP1 "\n" \ + "movq " #MMP1 ", " #MMP2 "\n" \ + "punpcklbw " #MMZ ", " #MMP1 "\n" \ + "punpckhbw " #MMZ 
", " #MMP2 "\n" : : "r"(pix) ) + +#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\ + MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] ) + +#define SBUTTERFLYwd(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpcklwd " #b ", " #a " \n\t" \ + "punpckhwd " #b ", " #t " \n\t" :: ) + +#define SBUTTERFLYdq(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpckldq " #b ", " #a " \n\t" \ + "punpckhdq " #b ", " #t " \n\t" :: ) + +/* input ABCD output ADTC ( or 0?31-2->0123 ) */ +#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \ + SBUTTERFLYwd( MMA, MMB, MMT ); \ + SBUTTERFLYwd( MMC, MMD, MMB ); \ + SBUTTERFLYdq( MMA, MMC, MMD ); \ + SBUTTERFLYdq( MMT, MMB, MMC ) + +/* first pass MM0 = MM0 -5*MM1 */ +#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" :: ) + \ +/* second pass MM0 = MM0 + 20*(MM2+MM3) */ +#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \ + asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \ + \ + "psllw $2, " #MMP2 "\n" \ + "paddw " #MMP2 "," #MMP0 "\n" \ + "psllw $2, " #MMP2 "\n" \ + "paddw " #MMP2 "," #MMP0 "\n" :: ) + +/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5 */ +#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" \ + \ + "paddw " #MMP2 "," #MMP0 "\n" \ + "paddw " #MMV "," #MMP0 "\n" \ + "psraw $5, " #MMP0 "\n" :: ) + +#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psubw " #MMP3 "," #MMP2 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP3 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" \ + "psubw " #MMP3 "," #MMP2 "\n" :: ) + +/* second pass MM0 = MM0 + 20*(MM1+MM2) */ +#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \ + asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \ + "paddw " #MMP5 "," #MMP4 "\n" \ + \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP4 "\n" \ + "paddw " #MMP1 "," #MMP0 "\n" \ + "paddw " #MMP4 "," #MMP3 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP4 "\n" \ + "paddw " #MMP1 "," #MMP0 "\n" \ + "paddw " #MMP4 "," #MMP3 "\n" :: ) + +#define MMX_LOAD_1r( m1, dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \ + +#define MMX_SAVE_1r( m1, dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \ + +#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ) + +#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ) + +#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \ + asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \ + asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) ) + +#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \ + asm volatile( "movq (%0), " 
#m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) ) + + +static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) +{ + return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next]; +} +static inline int x264_tapfilter1( uint8_t *pix ) +{ + return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; +} + +typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); + +/* NASM functions */ +extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); + +/* Macro to define NxM functions */ +/* mc I+H */ +#define MC_IH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ +} + +/* mc I+V */ +#define MC_IV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ +} + +/* mc H+V */ +#define MC_HV( name, cpu, width, height, off1, off2 ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+H */ +#define MC_CH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+V */ +#define MC_CV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + + +/***************************************************************************** + * MC with 
width == 4 (height <= 8) + *****************************************************************************/ + +extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + const int h4 = i_height / 4; + uint8_t srct[4*8*3]; + uint64_t tmp[4]; + int y; + + src -= 2; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < h4; y++ ) + { + int i; + + /* Preload data and transpose them */ + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 ); + + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 ); + + /* we read 2 more bytes that needed */ + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 ); + + /* tap filter */ + for( i = 0; i < 4; i++ ) + { + MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 ); + MMX_FILTERTAP_P1( %%mm0, %%mm1 ); + MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 ); + + MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 ); + MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 ); + + MMX_SAVE_1r( %%mm0, &tmp[i] ); + } + + MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] ); + MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] ); + MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] ); + MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] ); + + src += 4 * i_src; + dst += 4 * i_dst; + } +} +static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + int y; + + src -= 2 * i_src; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < i_height; y++ ) + { + MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src ); + MMX_FILTERTAP_P1( %%mm0, %%mm1 ); + MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 ); + + MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src ); + MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 ); + MMX_SAVE_4P( %%mm0, %%mm7, dst ); + + src += i_src; + dst += i_dst; + } +} + +static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + int i, x, y; + + for( y = 0; y < i_height; y++ ) + { + int16_t tap[5+4]; + + for( i = 0; i < 5+4; i++ ) + { + tap[i] = x264_tapfilter( &src[-2+i], i_src_stride ); + } + + for( x = 0; x < 4; x++ ) + { + dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 ); + } + + src += i_src_stride; + dst += i_dst_stride; + } +} + +MC_IH( mc_xy10, mmxext, 4, 8, 0 ) +MC_IH( mc_xy30, mmxext, 4, 8, 1 ) + +MC_IV( mc_xy01, mmxext, 4, 8, 0 ) +MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 ) +MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 ) +MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 4, 8, 0 ) +MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 4, 8, 0 ) +MC_CV( mc_xy32, mmxext, 4, 8, 1 ) + +#if 0 +static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hh_w4( src, i_src_stride, 
tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height ); +} +static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hh_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height ); +} + +static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height ); +} +static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height ); +} + +static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} + +static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} + +static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +#endif + 
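+/* For reference, the scalar arithmetic that the MMX paths above implement.
+ * mc_hh_w4() and mc_hv_w4() run the H.264 6-tap filter (1,-5,20,20,-5,1)
+ * in one direction and round with +16 and a shift by 5, while mc_hc_w4()
+ * first filters vertically into 16-bit taps and then horizontally,
+ * rounding the wider intermediate with +512 and a shift by 10.  A minimal
+ * sketch of one centre half-pel sample; tapfilter_v() and clip1() are
+ * illustrative stand-ins for the x264_tapfilter() / x264_mc_clip1()
+ * helpers used by the C fallbacks above. */
+#if 0
+static inline int tapfilter_v( const uint8_t *pix, int i_stride )
+{
+    return pix[-2*i_stride] - 5*pix[-1*i_stride] + 20*pix[0]
+         + 20*pix[1*i_stride] - 5*pix[2*i_stride] + pix[3*i_stride];
+}
+static inline uint8_t clip1( int x )
+{
+    return x < 0 ? 0 : x > 255 ? 255 : x;
+}
+/* half-pel in both x and y: 6 vertical taps around column x, then the
+ * same 6-tap kernel across them, with 10-bit rounding as in mc_hc_w4() */
+static inline uint8_t mc_hc_point( const uint8_t *src, int i_stride, int x )
+{
+    int16_t tap[6];
+    int i;
+    for( i = 0; i < 6; i++ )
+        tap[i] = tapfilter_v( &src[x-2+i], i_stride );
+    return clip1( ( tap[0] - 5*tap[1] + 20*tap[2] + 20*tap[3]
+                  - 5*tap[4] + tap[5] + 512 ) >> 10 );
+}
+#endif
+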
+/***************************************************************************** + * MC with width == 8 (height <= 16) + *****************************************************************************/ +extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + mc_hh_w4( &src[0], i_src, &dst[0], i_dst, i_height ); + mc_hh_w4( &src[4], i_src, &dst[4], i_dst, i_height ); +} +static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + int y; + + src -= 2 * i_src; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < i_height; y++ ) + { + MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7, &src[0*i_src], i_src ); + MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 ); + + + MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[2*i_src], i_src ); + MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 ); + + MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[4*i_src], i_src ); + MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 ); + MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 ); + + MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst ); + + src += i_src; + dst += i_dst; + } +} + +static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + int x, y; + + asm volatile( "pxor %%mm7, %%mm7\n" : : ); + + for( y = 0; y < i_height; y++ ) + { + int16_t tap[5+8]; + + /* first 8 */ + asm volatile( + "leal (%0, %1), %%eax\n" + + "movq (%0), %%mm0\n" /* load pix-2 */ + "movq %%mm0, %%mm2\n" + "punpcklbw %%mm7, %%mm0\n" + "punpckhbw %%mm7, %%mm2\n" + + "movq (%%eax),%%mm1\n" /* load pix-1 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "psubw %%mm3, %%mm2\n" + + "movq (%%eax,%1),%%mm1\n" /* load pix */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + + "movq (%%eax,%1,2),%%mm1\n" /* load pix+1 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + + "movq (%0,%1,4),%%mm1\n" /* load pix+2 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "psubw %%mm3, %%mm2\n" + + "movq (%%eax,%1,4),%%mm1\n" /* load pix+3 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "paddw %%mm1, %%mm0\n" + "paddw %%mm3, %%mm2\n" + + "movq %%mm0, (%2)\n" + "movq %%mm2, 8(%2)\n" + + + "addl $8, %%eax\n" + "addl $8, %0\n" + + + "movd (%0), %%mm0\n" /* load pix-2 */ + "punpcklbw %%mm7, %%mm0\n" + + "movd (%%eax),%%mm1\n" /* load pix-1 */ + "punpcklbw %%mm7, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + + "movd (%%eax,%1),%%mm1\n" /* load pix */ + "punpcklbw %%mm7, %%mm1\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movd (%%eax,%1,2),%%mm1\n" /* 
load pix+1 */ + "punpcklbw %%mm7, %%mm1\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movd (%0,%1,4),%%mm1\n" /* load pix+2 */ + "punpcklbw %%mm7, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + + "movd (%%eax,%1,4),%%mm1\n" /* load pix+3 */ + "punpcklbw %%mm7, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movq %%mm0, 16(%2)\n" + : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" ); + + /* last one */ + tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride ); + + for( x = 0; x < 8; x++ ) + { + dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 ); + } + + src += i_src_stride; + dst += i_dst_stride; + } +} + +MC_IH( mc_xy10, mmxext, 8, 16, 0 ) +MC_IH( mc_xy30, mmxext, 8, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 8, 16, 0 ) +MC_IV( mc_xy03, mmxext, 8, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 8, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 8, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 8, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 8, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 8, 16, 0 ) +MC_CH( mc_xy23, mmxext, 8, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 8, 16, 0 ) +MC_CV( mc_xy32, mmxext, 8, 16, 1 ) + +#if 0 +/* mc I+H */ +static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hh_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height ); +} +static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hh_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height ); +} +/* mc I+V */ +static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hv_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height ); +} +static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hv_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height ); +} +/* H+V */ +static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src+1, i_src_stride, tmp1, 8, 
i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hv_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +#endif + +/***************************************************************************** + * MC with width == 16 (height <= 16) + *****************************************************************************/ + +extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + mc_hh_w4( &src[ 0], i_src, &dst[ 0], i_dst, i_height ); + mc_hh_w4( &src[ 4], i_src, &dst[ 4], i_dst, i_height ); + mc_hh_w4( &src[ 8], i_src, &dst[ 8], i_dst, i_height ); + mc_hh_w4( &src[12], i_src, &dst[12], i_dst, i_height ); +} +static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + mc_hv_w8( src, i_src_stride, dst, i_dst_stride, i_height ); + mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height ); +} + +static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + mc_hc_w8( src, i_src_stride, dst, i_dst_stride, i_height ); + mc_hc_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height ); +} + +/* MMX avg/copy */ +MC_IH( mc_xy10, mmxext, 16, 16, 0 ) +MC_IH( mc_xy30, mmxext, 16, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 16, 16, 0 ) +MC_IV( mc_xy03, mmxext, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 16, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 16, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 16, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 16, 16, 0 ) +MC_CH( mc_xy23, mmxext, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 16, 16, 0 ) +MC_CV( mc_xy32, mmxext, 16, 16, 1 ) + +/* SSE2 avg/copy */ +MC_IH( mc_xy10, sse2, 16, 16, 0 ) +MC_IH( mc_xy30, sse2, 16, 16, 1 ) + +MC_IV( mc_xy01, sse2, 16, 16, 0 ) +MC_IV( mc_xy03, sse2, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, sse2, 16, 16, 0, 0 ) +MC_HV( mc_xy31, sse2, 16, 16, 1, 0 ) +MC_HV( mc_xy13, sse2, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, sse2, 16, 16, 1, i_src_stride ) + +MC_CH( 
mc_xy21, sse2, 16, 16, 0 ) +MC_CH( mc_xy23, sse2, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, sse2, 16, 16, 0 ) +MC_CV( mc_xy32, sse2, 16, 16, 1 ) + + +#if 0 +/* mc I+H */ +static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); +} +static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height ); +} +/* mc I+V */ +static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); +} +static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height ); +} +/* H+V */ +static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hv_w16( src, 
i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +#endif + +#define MOTION_COMPENSATION_LUMA \ + src += (mvy >> 2) * i_src_stride + (mvx >> 2); \ + if( i_width == 4 ) \ + { \ + pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 8 ) \ + { \ + pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 16 ) \ + { \ + pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else \ + { \ + fprintf( stderr, "Error: motion_compensation_luma called with invalid width" ); \ + } + +static void motion_compensation_luma_mmxext( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ + { + { + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, + }, + { + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, + }, + { + { x264_mc_copy_w16_mmxext, mc_xy10_w16_mmxext, mc_hh_w16, mc_xy30_w16_mmxext }, + { mc_xy01_w16_mmxext, mc_xy11_w16_mmxext, mc_xy21_w16_mmxext, mc_xy31_w16_mmxext }, + { mc_hv_w16, mc_xy12_w16_mmxext, mc_hc_w16, mc_xy32_w16_mmxext }, + { mc_xy03_w16_mmxext, mc_xy13_w16_mmxext, mc_xy23_w16_mmxext, mc_xy33_w16_mmxext }, + } + }; + + MOTION_COMPENSATION_LUMA +} + +static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ + { + { + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, + }, + { + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, + }, + { + { 
x264_mc_copy_w16_sse2, mc_xy10_w16_sse2, mc_hh_w16, mc_xy30_w16_sse2 }, + { mc_xy01_w16_sse2, mc_xy11_w16_sse2, mc_xy21_w16_sse2, mc_xy31_w16_sse2 }, + { mc_hv_w16, mc_xy12_w16_sse2, mc_hc_w16, mc_xy32_w16_sse2 }, + { mc_xy03_w16_sse2, mc_xy13_w16_sse2, mc_xy23_w16_sse2, mc_xy33_w16_sse2 }, + } + }; + MOTION_COMPENSATION_LUMA +} + +#endif + +void mc_luma_mmx( uint8_t *src[4], int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + uint8_t *src1, *src2; + + /* todo : fixme... */ + int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0; + + int hpel1x = mvx>>1; + int hpel1y = (mvy+1-correction)>>1; + int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 ); + + + src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1); + + if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */ + { + int hpel2x = (mvx+1)>>1; + int hpel2y = (mvy+correction)>>1; + int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 ); + + src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1); + + switch(i_width) { + case 4: + x264_pixel_avg_w4_mmxext( dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 8: + x264_pixel_avg_w8_mmxext( dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 16: + default: + x264_pixel_avg_w16_mmxext(dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + } + } + else + { + switch(i_width) { + case 4: + x264_mc_copy_w4_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + case 8: + x264_mc_copy_w8_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + case 16: + x264_mc_copy_w16_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + } + + } +} + +uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride, + uint8_t *dst, int *i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + uint8_t *src1, *src2; + + /* todo : fixme... */ + int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 
1:0; + + int hpel1x = mvx>>1; + int hpel1y = (mvy+1-correction)>>1; + int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 ); + + + src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1); + + if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */ + { + int hpel2x = (mvx+1)>>1; + int hpel2y = (mvy+correction)>>1; + int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 ); + + src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1); + + switch(i_width) { + case 4: + x264_pixel_avg_w4_mmxext( dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 8: + x264_pixel_avg_w8_mmxext( dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 16: + default: + x264_pixel_avg_w16_mmxext(dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + } + return dst; + + } + else + { + *i_dst_stride = i_src_stride; + return src1; + } +} + + +void x264_mc_mmxext_init( x264_mc_functions_t *pf ) +{ + pf->mc_luma = mc_luma_mmx; + pf->get_ref = get_ref_mmx; +} +void x264_mc_sse2_init( x264_mc_functions_t *pf ) +{ + /* todo: use sse2 */ + pf->mc_luma = mc_luma_mmx; + pf->get_ref = get_ref_mmx; +} + +#if 0 +void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) +{ + *int_h = mc_hh_w16; + *int_v = mc_hv_w16; + *int_hv = mc_hc_w16; +} + +void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) +{ + *int_h = mc_hh_w16; + *int_v = mc_hv_w16; + *int_hv = mc_hc_w16; +} +#endif diff --git a/common/amd64/mc.h b/common/amd64/mc.h new file mode 100644 index 00000000..69766167 --- /dev/null +++ b/common/amd64/mc.h @@ -0,0 +1,34 @@ +/***************************************************************************** + * mc.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifndef _I386_MC_H +#define _I386_MC_H 1 + +void x264_mc_mmxext_init( x264_mc_functions_t *pf ); +void x264_mc_sse2_init( x264_mc_functions_t *pf ); + +void x264_mc_chroma_sse( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int dx, int dy, + int i_height, int i_width ); +#endif diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm new file mode 100644 index 00000000..9760173f --- /dev/null +++ b/common/amd64/pixel-a.asm @@ -0,0 +1,811 @@ +;***************************************************************************** +;* pixel.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Laurent Aimar +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +%macro SAD_INC_2x16P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+8] + movq mm4, [ecx+8] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + movq mm1, [eax+ebx] + movq mm2, [ecx+edx] + movq mm3, [eax+ebx+8] + movq mm4, [ecx+edx+8] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SAD_INC_2x8P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+ebx] + movq mm4, [ecx+edx] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SAD_INC_2x4P 0 + movd mm1, [eax] + movd mm2, [ecx] + movd mm3, [eax+ebx] + movd mm4, [ecx+edx] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SSD_INC_1x16P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+8] + movq mm4, [ecx+8] + + movq mm5, mm2 + movq mm6, mm4 + psubusb mm2, mm1 + psubusb mm4, mm3 + psubusb mm1, mm5 + psubusb mm3, mm6 + por mm1, mm2 + por mm3, mm4 + + movq mm2, mm1 + movq mm4, mm3 + punpcklbw mm1, mm7 + punpcklbw mm3, mm7 + punpckhbw mm2, mm7 + punpckhbw mm4, mm7 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + pmaddwd mm4, mm4 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 + paddd mm0, mm2 + paddd mm0, mm3 + paddd mm0, mm4 +%endmacro + +%macro SSD_INC_1x8P 0 + movq mm1, [eax] + movq mm2, [ecx] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, 
mm2 ; mm1 = 8bit abs diff + + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 + paddd mm0, mm2 +%endmacro + +%macro SSD_INC_1x4P 0 + movd mm1, [eax] + movd mm2, [ecx] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, mm2 + punpcklbw mm1, mm7 + pmaddwd mm1, mm1 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 +%endmacro + +%macro SSD_INC_8x16P 0 + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P +%endmacro + +%macro SSD_INC_4x8P 0 + SSD_INC_1x8P + SSD_INC_1x8P + SSD_INC_1x8P + SSD_INC_1x8P +%endmacro + +%macro SSD_INC_4x4P 0 + SSD_INC_1x4P + SSD_INC_1x4P + SSD_INC_1x4P + SSD_INC_1x4P +%endmacro + +%macro LOAD_DIFF_4P 5 ; MMP, MMT, MMZ, [pix1], [pix2] + movd %1, %4 + punpcklbw %1, %3 + movd %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset + LOAD_DIFF_4P %1, %5, %6, [%7+%11], [%9+%11] + LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11] + lea %7, [%7+2*%8] + lea %9, [%9+2*%10] + LOAD_DIFF_4P %3, %5, %6, [%7+%11], [%9+%11] + LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11] + lea %7, [%7+2*%8] + lea %9, [%9+2*%10] +%endmacro + +%macro HADAMARD4_SUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro HADAMARD4x4 4 + HADAMARD4_SUB_BADC %1, %2, %3, %4 + HADAMARD4_SUB_BADC %1, %3, %2, %4 +%endmacro + +%macro SBUTTERFLYwd 3 + movq %3, %1 + punpcklwd %1, %2 + punpckhwd %3, %2 +%endmacro + +%macro SBUTTERFLYdq 3 + movq %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +%macro TRANSPOSE4x4 5 ; abcd-t -> adtc + SBUTTERFLYwd %1, %2, %5 + SBUTTERFLYwd %3, %4, %2 + SBUTTERFLYdq %1, %3, %4 + SBUTTERFLYdq %5, %2, %3 +%endmacro + +%macro MMX_ABS 2 ; mma, mmt + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + +%macro MMX_ABS_SUM 3 ; mma, mmt, mms + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 + paddusw %3, %1 +%endmacro + + +%macro MMX_SUM_MM 2 ; mmv, mmt + movq %2, %1 + psrlq %1, 32 + paddusw %1, %2 + movq %2, %1 + psrlq %1, 16 + paddusw %1, %2 + movd eax,%1 + and eax,0xffff + shr eax,1 +%endmacro + +%macro HADAMARD4x4_FIRST 0 + HADAMARD4x4 mm0, mm1, mm2, mm3 + TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 + HADAMARD4x4 mm0, mm3, mm4, mm2 + MMX_ABS mm0, mm7 + MMX_ABS_SUM mm3, mm7, mm0 + MMX_ABS_SUM mm4, mm7, mm0 + MMX_ABS_SUM mm2, mm7, mm0 +%endmacro + +%macro HADAMARD4x4_NEXT 0 + HADAMARD4x4 mm1, mm2, mm3, mm4 + TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5 + HADAMARD4x4 mm1, mm4, mm5, mm3 + MMX_ABS_SUM mm1, mm7, mm0 + MMX_ABS_SUM mm4, mm7, mm0 + MMX_ABS_SUM mm5, mm7, mm0 + MMX_ABS_SUM mm3, mm7, mm0 +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_pixel_sad_16x16_mmxext +cglobal x264_pixel_sad_16x8_mmxext +cglobal x264_pixel_sad_8x16_mmxext +cglobal x264_pixel_sad_8x8_mmxext +cglobal x264_pixel_sad_8x4_mmxext +cglobal x264_pixel_sad_4x8_mmxext +cglobal x264_pixel_sad_4x4_mmxext + +cglobal x264_pixel_ssd_16x16_mmxext +cglobal x264_pixel_ssd_16x8_mmxext +cglobal x264_pixel_ssd_8x16_mmxext +cglobal x264_pixel_ssd_8x8_mmxext +cglobal x264_pixel_ssd_8x4_mmxext +cglobal x264_pixel_ssd_4x8_mmxext +cglobal x264_pixel_ssd_4x4_mmxext + +cglobal x264_pixel_satd_4x4_mmxext +cglobal 
x264_pixel_satd_4x8_mmxext +cglobal x264_pixel_satd_8x4_mmxext +cglobal x264_pixel_satd_8x8_mmxext +cglobal x264_pixel_satd_16x8_mmxext +cglobal x264_pixel_satd_8x16_mmxext +cglobal x264_pixel_satd_16x16_mmxext + +%macro SAD_START 0 + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm0, mm0 +%endmacro +%macro SAD_END 0 + movd eax, mm0 + + pop ebx + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_16x16_mmxext: + SAD_START + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_16x8_mmxext: + SAD_START + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x16_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x8_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x4_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_4x8_mmxext: + SAD_START + SAD_INC_2x4P + SAD_INC_2x4P + SAD_INC_2x4P + SAD_INC_2x4P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_4x4_mmxext: + SAD_START + SAD_INC_2x4P + SAD_INC_2x4P + SAD_END + + + +%macro SSD_START 0 + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 ; zero + pxor mm0, mm0 ; mm0 holds the sum +%endmacro + +%macro SSD_END 0 + movq mm1, mm0 + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + + pop ebx + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) 
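+; SSD kernels: each SSD_INC_* step computes the per-byte absolute
+; difference with two saturating subtractions and a por, widens it to
+; 16 bits against mm7 (zero), and uses pmaddwd to square and add
+; adjacent pairs into the mm0 accumulator; SSD_END then folds the two
+; 32-bit halves of mm0 into eax.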
+;----------------------------------------------------------------------------- +x264_pixel_ssd_16x16_mmxext: + SSD_START + SSD_INC_8x16P + SSD_INC_8x16P + SSD_END + +ALIGN 16 +x264_pixel_ssd_16x8_mmxext: + SSD_START + SSD_INC_8x16P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x16_mmxext: + SSD_START + SSD_INC_4x8P + SSD_INC_4x8P + SSD_INC_4x8P + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x8_mmxext: + SSD_START + SSD_INC_4x8P + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x4_mmxext: + SSD_START + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_4x8_mmxext: + SSD_START + SSD_INC_4x4P + SSD_INC_4x4P + SSD_END + +ALIGN 16 +x264_pixel_ssd_4x4_mmxext: + SSD_START + SSD_INC_4x4P + SSD_END + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_4x4_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_4P mm0, mm6, mm7, [eax], [ecx] + LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx], [ecx+edx] + LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx] + add eax, ebx + add ecx, edx + LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx] + + HADAMARD4x4_FIRST + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_4x8_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x4_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + mov eax, [esp+ 8] ; pix1 + mov ecx, [esp+16] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x8_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + mov eax, [esp+ 8] ; pix1 + mov ecx, [esp+16] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + 
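+    ; fourth and last 4x4 sub-block (rows 4-7 of columns 4-7); mm0 holds
+    ; the running sum of absolute Hadamard coefficients, which MMX_SUM_MM
+    ; folds into eax (and halves) below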
LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_16x8_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x16_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_16x16_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, 
mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + diff --git a/common/amd64/pixel.h b/common/amd64/pixel.h new file mode 100644 index 00000000..43916c0a --- /dev/null +++ b/common/amd64/pixel.h @@ -0,0 +1,51 @@ +/***************************************************************************** + * mc.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifndef _I386_PIXEL_H +#define _I386_PIXEL_H 1 + +int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +#endif diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm new file mode 100644 index 00000000..3237ebb6 --- /dev/null +++ b/common/amd64/predict-a.asm @@ -0,0 +1,141 @@ +;***************************************************************************** +;* predict-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Read only data +;============================================================================= + +SECTION .rodata data align=16 + +SECTION .data + +;============================================================================= +; Macros +;============================================================================= + +%macro SAVE_0_1 1 + movq [%1] , mm0 + movq [%1 + 8] , mm1 +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal predict_8x8_v_mmx +cglobal predict_16x16_v_mmx + +;----------------------------------------------------------------------------- +; +; void predict_8x8_v_mmx( uint8_t *src, int i_stride ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8_v_mmx : + + ;push edi + ;push esi + + mov edx , [esp + 4] + mov ecx , [esp + 8] + sub edx , ecx ; esi <-- line -1 + + movq mm0 , [edx] + movq [edx + ecx] , mm0 ; 0 + movq [edx + 2 * ecx] , mm0 ; 1 + movq [edx + 4 * ecx] , mm0 ; 3 + movq [edx + 8 * ecx] , mm0 ; 7 + add edx , ecx ; esi <-- line 0 + movq [edx + 2 * ecx] , mm0 ; 2 + movq [edx + 4 * ecx] , mm0 ; 4 + lea edx , [edx + 4 * ecx] ; esi <-- line 4 + movq [edx + ecx] , mm0 ; 5 + movq [edx + 2 * ecx] , mm0 ; 6 + + ;pop esi + ;pop edi + + ret + +;----------------------------------------------------------------------------- +; +; void predict_16x16_v_mmx( uint8_t *src, int i_stride ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_16x16_v_mmx : + + ;push edi + ;push esi + + mov edx, [esp + 4] + mov ecx, [esp + 8] + sub edx, ecx ; esi <-- line -1 + + movq mm0, [edx] + movq mm1, [edx + 8] + mov eax, ecx + shl eax, 1 + add eax, ecx ; eax <-- 3* stride + + SAVE_0_1 (edx + ecx) ; 0 + SAVE_0_1 (edx + 2 * ecx) ; 1 + SAVE_0_1 (edx + eax) ; 2 + SAVE_0_1 (edx + 4 * ecx) ; 3 + SAVE_0_1 (edx + 2 * eax) ; 5 + SAVE_0_1 (edx + 8 * ecx) ; 7 + SAVE_0_1 (edx + 4 * eax) ; 11 + add edx, ecx ; esi <-- line 0 + SAVE_0_1 (edx + 4 * ecx) ; 4 + SAVE_0_1 (edx + 2 * eax) ; 6 + SAVE_0_1 (edx + 8 * ecx) ; 8 + SAVE_0_1 (edx + 4 * eax) ; 12 + lea edx, [edx + 8 * ecx] ; esi <-- line 8 + SAVE_0_1 (edx + ecx) ; 9 + SAVE_0_1 (edx + 2 * ecx) ; 10 + lea edx, [edx + 4 * ecx] ; esi <-- line 12 + SAVE_0_1 (edx + ecx) ; 13 + SAVE_0_1 (edx + 2 * ecx) ; 14 + SAVE_0_1 (edx + eax) ; 15 + + + ;pop esi + ;pop edi + + ret diff --git a/common/amd64/predict.c b/common/amd64/predict.c new file mode 100644 index 00000000..5422f15c --- /dev/null +++ b/common/amd64/predict.c @@ -0,0 +1,444 @@ +/***************************************************************************** + * predict.c: h264 encoder + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the 
terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +/* XXX predict4x4 are inspired from ffmpeg h264 decoder + */ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include + +#include "x264.h" /* for keyword inline */ +#include "common/predict.h" +#include "predict.h" + +static inline int clip_uint8( int a ) +{ + if (a&(~255)) + return (-a)>>31; + else + return a; +} + +/**************************************************************************** + * 16x16 prediction for intra block DC, H, V, P + ****************************************************************************/ +static void predict_16x16_dc( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + /* calculate DC value */ + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * i_stride]; + dc += src[i - i_stride]; + } + dc = (( dc + 16 ) >> 5) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_left( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * i_stride]; + } + dc = (( dc + 8 ) >> 4) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_top( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[i - i_stride]; + } + dc = (( dc + 8 ) >> 4) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_128( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = 0x80808080; + *p++ = 0x80808080; + *p++ = 0x80808080; + *p++ = 0x80808080; + + src += i_stride; + } +} +static void predict_16x16_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 16; i++ ) + { + const uint32_t v = 0x01010101 * src[-1]; + uint32_t *p = (uint32_t*)src; + + *p++ = v; + *p++ = v; + *p++ = v; + *p++ = v; + + src += i_stride; + + } +} + +extern predict_16x16_v_mmx( uint8_t *src, int i_stride ); + +#if 0 +static void predict_16x16_v( uint8_t *src, int i_stride ) +{ + int i; + + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" :: "r"(&src[-i_stride]) ); + + for( i = 0; i < 16; i++ ) + { + asm volatile( + "movq %%mm0, (%0)\n" + "movq %%mm1, 8(%0)\n" :: "r"(src) ); + src += i_stride; + } +} +#endif + +/**************************************************************************** + * 8x8 prediction for intra chroma block DC, H, V, P + ****************************************************************************/ +static void predict_8x8_dc_128( uint8_t *src, int i_stride ) +{ + int y; + + 
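+    /* neither the left nor the top neighbour is available here, so the
+     * whole 8x8 chroma block is filled with mid-grey: 128 replicated
+     * into every 32-bit store as 0x80808080 */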
for( y = 0; y < 8; y++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = 0x80808080; + *p++ = 0x80808080; + + src += i_stride; + } +} +static void predict_8x8_dc_left( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc0 = 0, dc1 = 0; + + for( y = 0; y < 4; y++ ) + { + dc0 += src[y * i_stride - 1]; + dc1 += src[(y+4) * i_stride - 1]; + } + dc0 = (( dc0 + 2 ) >> 2)*0x01010101; + dc1 = (( dc1 + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc0; + + src += i_stride; + } + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc1; + *p++ = dc1; + + src += i_stride; + } + +} +static void predict_8x8_dc_top( uint8_t *src, int i_stride ) +{ + int y, x; + uint32_t dc0 = 0, dc1 = 0; + + for( x = 0; x < 4; x++ ) + { + dc0 += src[x - i_stride]; + dc1 += src[x + 4 - i_stride]; + } + dc0 = (( dc0 + 2 ) >> 2)*0x01010101; + dc1 = (( dc1 + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 8; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc1; + + src += i_stride; + } +} +static void predict_8x8_dc( uint8_t *src, int i_stride ) +{ + int y; + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + uint32_t dc0, dc1, dc2, dc3; + int i; + + /* First do : + s0 s1 + s2 + s3 + */ + for( i = 0; i < 4; i++ ) + { + s0 += src[i - i_stride]; + s1 += src[i + 4 - i_stride]; + s2 += src[-1 + i * i_stride]; + s3 += src[-1 + (i+4)*i_stride]; + } + /* now calculate + dc0 dc1 + dc2 dc3 + */ + dc0 = (( s0 + s2 + 4 ) >> 3)*0x01010101; + dc1 = (( s1 + 2 ) >> 2)*0x01010101; + dc2 = (( s3 + 2 ) >> 2)*0x01010101; + dc3 = (( s1 + s3 + 4 ) >> 3)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc1; + + src += i_stride; + } + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc2; + *p++ = dc3; + + src += i_stride; + } +} + +static void predict_8x8_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 8; i++ ) + { + uint32_t v = 0x01010101 * src[-1]; + uint32_t *p = (uint32_t*)src; + + *p++ = v; + *p++ = v; + + src += i_stride; + } +} + +extern void predict_8x8_v_mmx( uint8_t *src, int i_stride ); + +#if 0 +static void predict_8x8_v( uint8_t *src, int i_stride ) +{ + int i; + + asm volatile( "movq (%0), %%mm0\n" :: "r"(&src[-i_stride]) ); + + for( i = 0; i < 8; i++ ) + { + asm volatile( "movq %%mm0, (%0)\n" :: "r"(src) ); + src += i_stride; + } +} +#endif + + +/**************************************************************************** + * 4x4 prediction for intra luma block DC, H, V, P + ****************************************************************************/ +static void predict_4x4_dc_128( uint8_t *src, int i_stride ) +{ + int y; + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = 0x80808080; + + src += i_stride; + } +} +static void predict_4x4_dc_left( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+ + src[-1+2*i_stride] + src[-1+3*i_stride] + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + + src += i_stride; + } +} +static void predict_4x4_dc_top( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[0 - i_stride] + src[1 - i_stride] + + src[2 - i_stride] + src[3 - i_stride] + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + src += i_stride; + } +} +static void predict_4x4_dc( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+ + 
src[-1+2*i_stride] + src[-1+3*i_stride] + + src[0 - i_stride] + src[1 - i_stride] + + src[2 - i_stride] + src[3 - i_stride] + 4 ) >> 3)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + + src += i_stride; + } +} +static void predict_4x4_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 4; i++ ) + { + uint32_t *p = (uint32_t*)src; + *p = 0x01010101*src[-1]; + + src += i_stride; + } +} +static void predict_4x4_v( uint8_t *src, int i_stride ) +{ + uint32_t top = *((uint32_t*)&src[-i_stride]); + int i; + + for( i = 0; i < 4; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p = top; + + src += i_stride; + } +} + +/**************************************************************************** + * Exported functions: + ****************************************************************************/ +void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) +{ + pf[I_PRED_16x16_V ] = predict_16x16_v_mmx; + pf[I_PRED_16x16_H ] = predict_16x16_h; + pf[I_PRED_16x16_DC] = predict_16x16_dc; + pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left; + pf[I_PRED_16x16_DC_TOP ]= predict_16x16_dc_top; + pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128; +} + +void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] ) +{ + pf[I_PRED_CHROMA_V ] = predict_8x8_v_mmx; + pf[I_PRED_CHROMA_H ] = predict_8x8_h; + pf[I_PRED_CHROMA_DC] = predict_8x8_dc; + pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left; + pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top; + pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128; +} + +void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] ) +{ + pf[I_PRED_4x4_V] = predict_4x4_v; + pf[I_PRED_4x4_H] = predict_4x4_h; + pf[I_PRED_4x4_DC] = predict_4x4_dc; + pf[I_PRED_4x4_DC_LEFT]= predict_4x4_dc_left; + pf[I_PRED_4x4_DC_TOP] = predict_4x4_dc_top; + pf[I_PRED_4x4_DC_128] = predict_4x4_dc_128; +} + diff --git a/common/amd64/predict.h b/common/amd64/predict.h new file mode 100644 index 00000000..b00b1e59 --- /dev/null +++ b/common/amd64/predict.h @@ -0,0 +1,31 @@ +/***************************************************************************** + * predict.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef _I386_PREDICT_H +#define _I386_PREDICT_H 1 + +void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] ); +void x264_predict_8x8_init_mmxext ( x264_predict_t pf[7] ); +void x264_predict_4x4_init_mmxext ( x264_predict_t pf[12] ); + +#endif -- 2.40.0
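
A note on the quarter-pel dispatch in mc_luma_mmx() / get_ref_mmx() above: they never filter on the fly but pick from four pre-interpolated planes, src[0] holding full-pel pixels and src[1], src[2], src[3] the horizontal, vertical and centre half-pel planes (the layout the index math implies), via filter = (hpelx & 1) + ((hpely & 1) << 1); an odd quarter-pel component is then produced by averaging the two nearest planes. The scalar sketch below mirrors that selection, including the diagonal-case correction; mc_luma_ref() is an illustrative name, and the (a+b+1)>>1 average is the pavgb-style rounding the x264_pixel_avg_w*_mmxext kernels are expected to perform.

#include <stdint.h>

static void mc_luma_ref( uint8_t *src[4], int i_src, uint8_t *dst, int i_dst,
                         int mvx, int mvy, int i_width, int i_height )
{
    /* same plane/offset selection as mc_luma_mmx() */
    int correction = ( ((mvx&3) == 3 && (mvy&3) == 1) ||
                       ((mvx&3) == 1 && (mvy&3) == 3) ) ? 1 : 0;
    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    uint8_t *src1 = src[(hpel1x&1) + ((hpel1y&1)<<1)]
                  + (hpel1y>>1) * i_src + (hpel1x>>1);
    int x, y;

    if( (mvx|mvy) & 1 )         /* quarter-pel: average two half-pel planes */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        uint8_t *src2 = src[(hpel2x&1) + ((hpel2y&1)<<1)]
                      + (hpel2y>>1) * i_src + (hpel2x>>1);
        for( y = 0; y < i_height; y++, src1 += i_src, src2 += i_src, dst += i_dst )
            for( x = 0; x < i_width; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
    }
    else                        /* integer or half-pel position: plain copy */
    {
        for( y = 0; y < i_height; y++, src1 += i_src, dst += i_dst )
            for( x = 0; x < i_width; x++ )
                dst[x] = src1[x];
    }
}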