From: Loren Merritt <pengvado@videolan.org>
Date: Sun, 27 Mar 2005 20:49:59 +0000 (+0000)
Subject: convert mc's inline asm to nasm (slight speedup and msvc compatibility).
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0df24cf995faf3169fe15d808e4fff00c18ad7dc;p=libx264

convert mc's inline asm to nasm (slight speedup and msvc compatibility).
patch by Mathieu Monnier.


git-svn-id: svn://svn.videolan.org/x264/trunk@180 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/Makefile b/Makefile
index a3305c0a..548380f4 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,8 @@ ifeq ($(ARCH),X86)
 CFLAGS+=-DHAVE_MMXEXT -DHAVE_SSE2
 SRCS+= common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
 ASMSRC= common/i386/dct-a.asm common/i386/cpu-a.asm \
-        common/i386/pixel-a.asm common/i386/mc-a.asm
+        common/i386/pixel-a.asm common/i386/mc-a.asm \
+        common/i386/mc-a2.asm common/i386/predict-a.asm
 OBJASM= $(ASMSRC:%.asm=%.o)
 endif
 
diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile
index 7554f4d5..89357ff1 100644
--- a/build/cygwin/Makefile
+++ b/build/cygwin/Makefile
@@ -22,7 +22,8 @@ SRC_C= common/mc.c common/predict.c common/pixel.c common/macroblock.c \
        encoder/encoder.c encoder/eval.c \
        common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
 
-SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm common/i386/mc-a.asm
+SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm \
+         common/i386/mc-a.asm common/i386/mc-a2.asm common/i386/predict-a.asm
 
 # Alias
 RM= rm -rf
diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm
new file mode 100644
index 00000000..aaab2c1b
--- /dev/null
+++ b/common/i386/mc-a2.asm
@@ -0,0 +1,402 @@
+;*****************************************************************************
+;* mc-a2.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; export a symbol; the single argument is its name
+    %ifdef PREFIX               ; PREFIX builds: export with a leading underscore and
+        global _%1              ; alias the plain name to the underscored one, so the
+        %define %1 _%1          ; label written later in this file gets the same prefix
+    %else                       ; otherwise export the name unchanged
+        global %1
+    %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+ALIGN 16
+mmx_dw_one:                     ; despite the name: four words of 16,
+    times 4 dw 16               ; the rounding term added before each "psraw ..., 5"
+mmx_dd_one:                     ; despite the name: two dwords of 512,
+    times 2 dd 512              ; the rounding term added before each "psrad ..., 10"
+mmx_dw_20:                      ; four words of 20 (pmaddwd multiplier in the center filter)
+    times 4 dw 20
+mmx_dw_5:                       ; note the sign: the stored value is -5, not 5
+    times 4 dw -5
+
+SECTION .data                   ; scratch variables written and read by x264_center_filter_mmxext
+
+width:                          ; copy of i_width
+    dd 0
+height:                         ; copy of i_height
+    dd 0
+dstp1:                          ; copy of i_dst1_stride
+    dd 0
+dstp2:                          ; copy of i_dst2_stride
+    dd 0
+buffer:                         ; address of the scratch row buffer reserved on the stack
+    dd 0
+dst1:                           ; dst1 pointer for the current row
+    dd 0
+dst2:                           ; dst2 pointer for the current row
+    dd 0
+src:                            ; source pointer for the current row (starts at src - 2 * stride)
+    dd 0                        ; NOTE(review): fixed storage shared by every call, so overlapping calls would overwrite each other - confirm callers are serialized
+
+
+;=============================================================================
+; Macros
+;=============================================================================
+
+%macro LOAD_4 9                 ; args 1-4: destination regs, args 5-8: memory operands, arg 9: unpack partner
+    movd %1, %5                 ; fetch 4 bytes from each of the four sources
+    movd %2, %6
+    movd %3, %7
+    movd %4, %8
+    punpcklbw %1, %9            ; interleave with arg 9 -> four 16-bit words per register;
+    punpcklbw %2, %9            ; this is a zero-extension only when arg 9 holds zero
+    punpcklbw %3, %9            ; (every use in this file passes mm0, cleared with pxor)
+    punpcklbw %4, %9
+%endmacro
+
+%macro FILT_2 2                 ; first arg -= 5 * second arg, per 16-bit word
+    psubw %1, %2                ; subtract it once
+    psllw %2, 2                 ; second arg *= 4 (so the second arg is clobbered)
+    psubw %1, %2                ; subtract 4 more times -> 5 in total
+%endmacro
+
+%macro FILT_4 3                 ; first arg += 20 * (second arg + third arg), per 16-bit word
+    paddw %2, %3                ; second arg = sum of the two inputs (clobbered from here on)
+    psllw %2, 2                 ; 4 * sum
+    paddw %1, %2                ; accumulate 4 * sum
+    psllw %2, 2                 ; 16 * sum
+    paddw %1, %2                ; accumulate 16 * sum -> 20 * sum in total
+%endmacro
+
+%macro FILT_6 4                 ; first arg = (first - 5 * second + third + fourth) >> 5, per 16-bit word
+    psubw %1, %2                ; subtract the second arg once
+    psllw %2, 2                 ; second arg *= 4 (clobbered)
+    psubw %1, %2                ; ... and 4 more times -> -5 * second arg
+    paddw %1, %3                ; third arg is only read, never modified
+    paddw %1, %4                ; the uses in this file pass the rounding constant (16) here
+    psraw %1, 5                 ; arithmetic shift, so negative sums stay negative
+%endmacro
+
+%macro FILT_ALL 1               ; vertical 6-tap sum (1,-5,20,20,-5,1) over 6 rows for 4 columns; result in mm1
+    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
+    FILT_2      mm1, mm2                     ; mm1 = row0 - 5 * row1
+    movd        mm5, [%1 + 4 * ecx]          ; row 4
+    movd        mm6, [%1 + edx]              ; row 5 (the caller sets edx = 5 * stride, ebx = 3 * stride, ecx = stride)
+    FILT_4      mm1, mm3, mm4                ; mm1 += 20 * (row2 + row3)
+    punpcklbw   mm5, mm0                     ; widen rows 4 and 5 to words (mm0 must be zero)
+    punpcklbw   mm6, mm0
+    psubw       mm1, mm5                     ; mm1 -= 5 * row4 (same steps as FILT_2)
+    psllw       mm5, 2
+    psubw       mm1, mm5
+    paddw       mm1, mm6                     ; mm1 += row5; no rounding or shift here; mm2-mm6 are clobbered
+%endmacro
+
+
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_vertical_filter_mmxext         ; NOTE(review): exported, but no label with this name is defined in this file - confirm it is intended
+cglobal x264_horizontal_filter_mmxext       ; defined below
+cglobal x264_center_filter_mmxext           ; defined below
+
+;-----------------------------------------------------------------------------
+; Per row: dst1 = clamp((V + 16) >> 5) where V is the vertical 6-tap sum; dst2 = clamp((6-tap across unshifted V + 512) >> 10).
+; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
+;                                 uint8_t *dst2, int i_dst2_stride,
+;                                  uint8_t *src, int i_src_stride,
+;                                  int i_width, int i_height );
+; Stack arguments. i_width must be a multiple of 4 and at least 12, i_height non-zero (loops exit only on exactly 0). Reserves 2 * i_src_stride bytes of stack.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_center_filter_mmxext :
+
+    push        edi                          ; four saved registers + return address
+    push        esi                          ; put the first argument at esp + 20
+    push        ebx
+    push        ebp
+
+    mov         esi,      [esp + 36]         ; src
+
+    mov         edx,      [esp + 20]         ; dst1
+    mov         [dst1],   edx
+
+    mov         edi,      [esp + 28]         ; dst2
+    mov         [dst2],   edi
+
+    mov         eax,      [esp + 44]         ; width
+    mov         [width],  eax
+
+    mov         eax,      [esp + 48]         ; height
+    mov         [height], eax
+
+    mov         eax,      [esp + 24]         ; dst1_stride
+    mov         [dstp1],  eax
+
+    mov         eax,      [esp + 32]         ; dst2_stride
+    mov         [dstp2],  eax
+
+    mov         ecx,      [esp + 40]         ; src_stride
+
+    sub         esp,      ecx                ; reserve 2 * src_stride bytes of stack for the row of 16-bit sums
+    sub         esp,      ecx                ; esp is now at the beginning of the buffer
+    mov         [buffer], esp
+
+    ;sub        esi,      2
+    sub         esi,      ecx
+    sub         esi,      ecx                ; esi = src - 2 * stride (the "- 2" adjustment above is commented out)
+    mov         [src],    esi
+
+    ;sub        edi,      2
+
+    mov         ebx,      ecx
+    shl         ebx,      1
+    add         ebx,      ecx                ; 3 * src_stride
+
+    mov         edx,      ecx
+    shl         edx,      1
+    add         edx,      ebx                ; 5 * src_stride
+
+    pxor        mm0,      mm0                ; 0 ---> mm0
+    movq        mm7,      [mmx_dd_one]       ; for rounding; NOTE(review): mm7 is zeroed in loopcx2 before any read, so this load appears unused
+
+    mov         ebp,      [height]           ; ebp = rows still to do
+
+loopcy:
+
+    dec         ebp
+    mov         eax,    [width]
+    mov         edi,    [dst1]
+    mov         esp,    [buffer]             ; NOTE(review): esp doubles as the buffer write pointer, so already-written buffer bytes end up below esp - confirm nothing (e.g. signal delivery) can write below esp here
+    mov         esi,    [src]
+
+    FILT_ALL    esi                          ; vertical sums for columns 0..3
+
+    pshufw      mm2,    mm1, 0               ; broadcast word 0: four words of left padding
+    movq        [esp],  mm2
+    add         esp,    8
+    movq        [esp],  mm1                  ; unshifted sums go to the buffer
+    add         esp,    8
+    paddw       mm1,    [mmx_dw_one]         ; + 16
+    psraw       mm1,    5
+
+    packuswb    mm1,    mm1                  ; saturate to unsigned bytes
+    movd        [edi],  mm1
+
+    sub         eax,    8                    ; the first and last 4-column groups are handled outside loopcx1
+    add         edi,    4
+    add         esi,    4
+
+loopcx1:                                     ; middle groups, 4 columns per pass
+
+    sub         eax,    4
+
+    FILT_ALL    esi
+
+    movq        [esp],  mm1                  ; keep the unshifted sums for the second pass
+    paddw       mm1,    [mmx_dw_one]
+    psraw       mm1,    5
+    packuswb    mm1,    mm1
+    movd        [edi],  mm1                  ; 4 bytes of dst1
+
+    add         esp,    8
+    add         esi,    4
+    add         edi,    4
+    test        eax,    eax                  ; exits only when eax is exactly 0
+    jnz         loopcx1
+
+    FILT_ALL    esi                          ; vertical sums for the last 4 columns
+
+    pshufw      mm2,    mm1,  7              ; right padding; NOTE(review): selector 7 picks words 3,1,0,0 rather than broadcasting word 3 - confirm intended
+    movq        [esp],  mm1
+    add         esp,    8
+    movq        [esp],  mm2
+    paddw       mm1,    [mmx_dw_one]
+    psraw       mm1,    5
+    packuswb    mm1,    mm1
+    movd        [edi],  mm1
+
+    mov         esi,    [src]                ; advance the saved source pointer by one row
+    add         esi,    ecx
+    mov         [src],  esi
+
+    mov         edi,    [dst1]               ; advance the saved dst1 pointer by one row
+    add         edi,    [dstp1]
+    mov         [dst1], edi
+
+    mov         eax,    [width]
+    mov         edi,    [dst2]
+    mov         esp,    [buffer]
+    add         esp,    4                    ; skip the first two padding words
+
+loopcx2:                                     ; second pass: 6-tap filter across the buffered sums
+
+    sub         eax,    4                    ; eax = column offset of this group, counting down to 0
+
+    movq        mm2,    [esp + 2 * eax + 2]  ; tap 1
+    movq        mm3,    [esp + 2 * eax + 4]  ; tap 2
+    movq        mm4,    [esp + 2 * eax + 6]  ; tap 3
+    movq        mm5,    [esp + 2 * eax + 8]  ; tap 4
+    movq        mm1,    [esp + 2 * eax]      ; tap 0
+    movq        mm6,    [esp + 2 * eax + 10] ; tap 5
+    paddw       mm2,    mm5                  ; tap1 + tap4  (weight -5)
+    paddw       mm3,    mm4                  ; tap2 + tap3  (weight 20)
+    paddw       mm1,    mm6                  ; tap0 + tap5  (weight 1)
+
+    movq        mm5,    [mmx_dw_20]
+    movq        mm4,    [mmx_dw_5]           ; holds -5
+    movq        mm6,    mm1
+    pxor        mm7,    mm7
+
+    punpckhwd   mm5,    mm2                  ; interleave sums with their weights so that each
+    punpcklwd   mm4,    mm3                  ; pmaddwd below yields 20 * (tap2 + tap3) - 5 * (tap1 + tap4)
+    punpcklwd   mm2,    [mmx_dw_20]          ; as one 32-bit value per output column
+    punpckhwd   mm3,    [mmx_dw_5]
+
+    pcmpgtw     mm7,    mm1                  ; mm7 = all-ones in words where (tap0 + tap5) is negative
+
+    pmaddwd     mm2,    mm4                  ; columns 0 and 1
+    pmaddwd     mm3,    mm5                  ; columns 2 and 3
+
+    punpcklwd   mm1,    mm7                  ; sign-extend (tap0 + tap5) to dwords
+    punpckhwd   mm6,    mm7
+
+    paddd       mm2,    mm1
+    paddd       mm3,    mm6
+
+    paddd       mm2,    [mmx_dd_one]         ; + 512
+    paddd       mm3,    [mmx_dd_one]
+
+    psrad       mm2,    10
+    psrad       mm3,    10
+
+    packssdw    mm2,    mm3                  ; back to four words
+    packuswb    mm2,    mm0                  ; saturate to bytes (mm0 is still zero)
+
+    movd        [edi + eax], mm2             ; 4 bytes of dst2
+
+    test        eax,    eax
+    jnz         loopcx2
+
+    add         edi,    [dstp2]              ; advance the saved dst2 pointer by one row
+    mov         [dst2], edi
+
+    test        ebp,    ebp
+    jnz         loopcy
+
+    mov         esp,    [buffer]             ; release the scratch buffer:
+    shl         ecx,    1                    ; esp = buffer + 2 * src_stride
+    add         esp,    ecx
+
+    pop         ebp
+    pop         ebx
+    pop         esi
+    pop         edi
+
+    ret                                      ; no emms is executed in this routine
+
+;-----------------------------------------------------------------------------
+; Per row: dst[x] = clamp((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5), 8 pixels per pass.
+; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
+;                                     uint8_t *src, int i_src_stride,
+;                                     int i_width, int i_height );
+; Stack arguments. i_width must be a positive multiple of 8 and i_height non-zero: both loops exit only on reaching exactly 0.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_horizontal_filter_mmxext :
+    push edi                                 ; two saved registers + return address
+    push esi                                 ; put the first argument at esp + 12
+
+    mov         edi,    [esp + 12]           ; dst
+    mov         esi,    [esp + 20]           ; src
+
+    pxor        mm0,    mm0                  ; zero, used to widen bytes to words
+    movq        mm7,    [mmx_dw_one]         ; rounding term (16) passed to FILT_6
+
+    mov         ecx,    [esp + 32]           ; height
+
+    sub         esi,    2                    ; the first tap sits two pixels left of the output position
+
+loophy:
+
+    dec         ecx                          ; ecx = rows still to do after this one
+    mov         eax,    [esp + 28]           ; width
+
+loophx:
+
+    sub         eax,    8                    ; eax = column offset of this 8-pixel group, counting down to 0
+
+    LOAD_4      mm1,    mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
+    FILT_2      mm1,    mm2                  ; pixels 0..3: tap0 - 5 * tap1
+    movd        mm5,    [esi + eax + 4]      ; tap 4
+    movd        mm6,    [esi + eax + 5]      ; tap 5
+    FILT_4      mm1,    mm3, mm4             ; + 20 * (tap2 + tap3)
+    movd        mm2,    [esi + eax + 4]      ; start of pixels 4..7: their tap 0
+    movd        mm3,    [esi + eax + 6]      ; their tap 2
+    punpcklbw   mm5,    mm0
+    punpcklbw   mm6,    mm0
+    FILT_6      mm1,    mm5, mm6, mm7        ; pixels 0..3 done; mm6 (bytes at +5) is left intact
+    movd        mm4,    [esi + eax + 7]      ; tap 3 of pixels 4..7
+    movd        mm5,    [esi + eax + 8]      ; tap 4 of pixels 4..7
+    punpcklbw   mm2,    mm0
+    punpcklbw   mm3,    mm0                  ; mm2(1), mm3(20), mm6(-5) ready
+    FILT_2      mm2,    mm6                  ; reuses the bytes at +5 as tap 1 of pixels 4..7
+    movd        mm6,    [esi + eax + 9]      ; tap 5 of pixels 4..7
+    punpcklbw   mm4,    mm0
+    punpcklbw   mm5,    mm0                  ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
+    FILT_4      mm2,    mm3, mm4
+    punpcklbw   mm6,    mm0
+    FILT_6      mm2,    mm5, mm6, mm7        ; pixels 4..7 done
+
+    packuswb    mm1,    mm2                  ; saturate both halves to unsigned bytes
+    movq        [edi + eax],  mm1            ; store 8 output pixels
+
+    test        eax,    eax
+    jnz         loophx
+
+    add         esi,    [esp + 24]           ; src_pitch
+    add         edi,    [esp + 16]           ; dst_pitch
+
+    test        ecx,    ecx
+    jnz         loophy
+
+    pop         esi
+    pop         edi
+
+    ret                                      ; no emms is executed in this routine
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c
index 63fb7a7b..b5b3c3cf 100644
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -36,6 +36,8 @@
 #include "common/clip1.h"
 #include "mc.h"
 
+#if 0
+
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
 #define USED_UINT64(foo) \
     static const uint64_t foo __asm__ (#foo) __attribute__((used))
@@ -1021,6 +1023,7 @@ static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
     MOTION_COMPENSATION_LUMA
 }
 
+#endif
 
 void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
               uint8_t *dst,    int i_dst_stride,
@@ -1141,6 +1144,7 @@ void x264_mc_sse2_init( x264_mc_functions_t *pf )
     pf->get_ref   = get_ref_mmx;
 }
 
+#if 0
 void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
 {
     *int_h = mc_hh_w16;
@@ -1154,3 +1158,4 @@ void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
     *int_v = mc_hv_w16;
     *int_hv = mc_hc_w16;
 }
+#endif
diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm
new file mode 100644
index 00000000..3237ebb6
--- /dev/null
+++ b/common/i386/predict-a.asm
@@ -0,0 +1,141 @@
+;*****************************************************************************
+;* predict-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; export a symbol; the single argument is its name
+    %ifdef PREFIX               ; PREFIX builds: export with a leading underscore and
+        global _%1              ; alias the plain name to the underscored one, so the
+        %define %1 _%1          ; label written later in this file gets the same prefix
+    %else                       ; otherwise export the name unchanged
+        global %1
+    %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+SECTION .data
+
+;=============================================================================
+; Macros
+;=============================================================================
+
+%macro SAVE_0_1 1               ; store 16 bytes at the address expression given as the argument
+    movq        [%1]         , mm0           ; bytes 0..7  come from mm0
+    movq        [%1 + 8]     , mm1           ; bytes 8..15 come from mm1
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal predict_8x8_v_mmx                   ; both routines are defined below
+cglobal predict_16x16_v_mmx
+
+;-----------------------------------------------------------------------------
+; Copies the 8 bytes of the row directly above src (src - i_stride) into rows 0..7 at src.
+; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
+; Stack arguments. Only edx, ecx and mm0 are written, so no registers are saved.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_8x8_v_mmx :
+
+    ;push       edi
+    ;push       esi
+
+    mov         edx             , [esp + 4]         ; src
+    mov         ecx             , [esp + 8]         ; i_stride
+    sub         edx             , ecx               ; edx <-- line -1
+
+    movq        mm0             , [edx]             ; the 8 bytes to replicate
+    movq        [edx + ecx]     , mm0               ; 0
+    movq        [edx + 2 * ecx] , mm0               ; 1
+    movq        [edx + 4 * ecx] , mm0               ; 3
+    movq        [edx + 8 * ecx] , mm0               ; 7
+    add         edx             , ecx               ; edx <-- line 0
+    movq        [edx + 2 * ecx] , mm0               ; 2
+    movq        [edx + 4 * ecx] , mm0               ; 4
+    lea         edx             , [edx + 4 * ecx]   ; edx <-- line 4
+    movq        [edx + ecx]     , mm0               ; 5
+    movq        [edx + 2 * ecx] , mm0               ; 6
+
+    ;pop        esi
+    ;pop        edi
+
+    ret
+
+;-----------------------------------------------------------------------------
+; Copies the 16 bytes of the row directly above src (src - i_stride) into rows 0..15 at src.
+; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
+; Stack arguments. Only eax, edx, ecx, mm0 and mm1 are written, so no registers are saved.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_16x16_v_mmx :
+
+    ;push       edi
+    ;push       esi
+
+    mov         edx, [esp + 4]          ; src
+    mov         ecx, [esp + 8]          ; i_stride
+    sub         edx, ecx                ; edx <-- line -1
+
+    movq        mm0, [edx]              ; left 8 bytes of the row to replicate
+    movq        mm1, [edx + 8]          ; right 8 bytes
+    mov         eax, ecx
+    shl         eax, 1
+    add         eax, ecx                ; eax <-- 3* stride
+
+    SAVE_0_1    (edx + ecx)             ; 0
+    SAVE_0_1    (edx + 2 * ecx)         ; 1
+    SAVE_0_1    (edx + eax)             ; 2
+    SAVE_0_1    (edx + 4 * ecx)         ; 3
+    SAVE_0_1    (edx + 2 * eax)         ; 5
+    SAVE_0_1    (edx + 8 * ecx)         ; 7
+    SAVE_0_1    (edx + 4 * eax)         ; 11
+    add         edx, ecx                ; edx <-- line 0
+    SAVE_0_1    (edx + 4 * ecx)         ; 4
+    SAVE_0_1    (edx + 2 * eax)         ; 6
+    SAVE_0_1    (edx + 8 * ecx)         ; 8
+    SAVE_0_1    (edx + 4 * eax)         ; 12
+    lea         edx, [edx + 8 * ecx]    ; edx <-- line 8
+    SAVE_0_1    (edx + ecx)             ; 9
+    SAVE_0_1    (edx + 2 * ecx)         ; 10
+    lea         edx, [edx + 4 * ecx]    ; edx <-- line 12
+    SAVE_0_1    (edx + ecx)             ; 13
+    SAVE_0_1    (edx + 2 * ecx)         ; 14
+    SAVE_0_1    (edx + eax)             ; 15
+
+
+    ;pop        esi
+    ;pop        edi
+
+    ret
diff --git a/common/i386/predict.c b/common/i386/predict.c
index b0a0b7b7..5422f15c 100644
--- a/common/i386/predict.c
+++ b/common/i386/predict.c
@@ -152,6 +152,10 @@ static void predict_16x16_h( uint8_t *src, int i_stride )
 
     }
 }
+
+extern void predict_16x16_v_mmx( uint8_t *src, int i_stride ); /* implemented in common/i386/predict-a.asm; returns nothing */
+
+#if 0
 static void predict_16x16_v( uint8_t *src, int i_stride )
 {
     int i;
@@ -168,6 +172,7 @@ static void predict_16x16_v( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+#endif
 
 /****************************************************************************
  * 8x8 prediction for intra chroma block DC, H, V, P
@@ -301,6 +306,10 @@ static void predict_8x8_h( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+
+extern void predict_8x8_v_mmx( uint8_t *src, int i_stride );
+
+#if 0
 static void predict_8x8_v( uint8_t *src, int i_stride )
 {
     int i;
@@ -313,6 +322,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+#endif
 
 
 /****************************************************************************
@@ -404,7 +414,7 @@ static void predict_4x4_v( uint8_t *src, int i_stride )
  ****************************************************************************/
 void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
 {
-    pf[I_PRED_16x16_V ]     = predict_16x16_v;
+    pf[I_PRED_16x16_V ]     = predict_16x16_v_mmx;
     pf[I_PRED_16x16_H ]     = predict_16x16_h;
     pf[I_PRED_16x16_DC]     = predict_16x16_dc;
     pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left;
@@ -414,7 +424,7 @@ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
 
 void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
 {
-    pf[I_PRED_CHROMA_V ]     = predict_8x8_v;
+    pf[I_PRED_CHROMA_V ]     = predict_8x8_v_mmx;
     pf[I_PRED_CHROMA_H ]     = predict_8x8_h;
     pf[I_PRED_CHROMA_DC]     = predict_8x8_dc;
     pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
diff --git a/common/mc.c b/common/mc.c
index dbe05a39..c179aad2 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -37,14 +37,11 @@
 #include "clip1.h"
 #include "frame.h"
 
-#ifdef _MSC_VER
-#undef HAVE_MMXEXT  /* not finished now */
-#endif
 #ifdef HAVE_MMXEXT
-#   include "i386/mc.h"
+#include "i386/mc.h"
 #endif
 #ifdef ARCH_PPC
-#   include "ppc/mc.h"
+#include "ppc/mc.h"
 #endif
 
 
@@ -425,6 +422,14 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 void get_funcs_mmx(pf_mc_t*, pf_mc_t*, pf_mc_t*);
 void get_funcs_sse2(pf_mc_t*, pf_mc_t*, pf_mc_t*);
 
+extern void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
+                                           uint8_t *src, int i_src_stride,
+                                           int i_width, int i_height );
+extern void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
+                                       uint8_t *dst2, int i_dst2_stride,
+                                       uint8_t *src, int i_src_stride,
+                                       int i_width, int i_height );
+
 void x264_frame_filter( int cpu, x264_frame_t *frame )
 {
     const int x_inc = 16, y_inc = 16;
@@ -435,6 +440,7 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
     pf_mc_t int_v = mc_hv;
     pf_mc_t int_hv = mc_hc;
 
+#if 0
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMXEXT )
         get_funcs_mmx(&int_h, &int_v, &int_hv);
@@ -444,24 +450,41 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
     if( cpu&X264_CPU_SSE2 )
         get_funcs_sse2(&int_h, &int_v, &int_hv);
 #endif
+#endif
 
-    for( y = -8; y < frame->i_lines[0]+8; y += y_inc ) {
-        
-        uint8_t *p_in = frame->plane[0] + y * stride - 8;
-        uint8_t *p_h  = frame->filtered[1] + y * stride - 8;
-        uint8_t *p_v  = frame->filtered[2] + y * stride - 8;
-        uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
-
-        for( x = -8; x < stride - 64 + 8; x += x_inc )
+#ifdef HAVE_MMXEXT
+    if ( cpu & X264_CPU_MMXEXT )
+    {
+        x264_horizontal_filter_mmxext(frame->filtered[1] - 8 * stride - 8, stride,
+            frame->plane[0] - 8 * stride - 8, stride,
+            stride - 48, frame->i_lines[0] + 16);
+        x264_center_filter_mmxext(frame->filtered[2] - 8 * stride - 8, stride,
+            frame->filtered[3] - 8 * stride - 8, stride,
+            frame->plane[0] - 8 * stride - 8, stride,
+            stride - 48, frame->i_lines[0] + 16);
+    }
+    else
+    {
+#else
+    {
+#endif
+        for( y = -8; y < frame->i_lines[0]+8; y += y_inc )
         {
-            int_h(  p_in, stride, p_h,  stride, x_inc, y_inc );
-            int_v(  p_in, stride, p_v,  stride, x_inc, y_inc );
-            int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
-
-            p_h += x_inc;
-            p_v += x_inc;
-            p_hv += x_inc;
-            p_in += x_inc;
+            uint8_t *p_in = frame->plane[0] + y * stride - 8;
+            uint8_t *p_h  = frame->filtered[1] + y * stride - 8;
+            uint8_t *p_v  = frame->filtered[2] + y * stride - 8;
+            uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
+            for( x = -8; x < stride - 64 + 8; x += x_inc )
+            {
+                int_h(  p_in, stride, p_h,  stride, x_inc, y_inc );
+                int_v(  p_in, stride, p_v,  stride, x_inc, y_inc );
+                int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
+
+                p_h += x_inc;
+                p_v += x_inc;
+                p_hv += x_inc;
+                p_in += x_inc;
+            }
         }
     }
 }