From 02713c2401b471fb3bfa4541e8b867dfd06628cc Mon Sep 17 00:00:00 2001
From: Min Chen <chenm001@163.com>
Date: Fri, 18 Jun 2004 02:00:40 +0000
Subject: [PATCH] update & SSE2 support

git-svn-id: svn://svn.videolan.org/x264/trunk@10 df754926-b1dd-0310-bc7b-ec298dee348c
---
 Makefile              |   5 +-
 Makefile.cygwin       |   7 +--
 build/cygwin/Makefile |  12 +++--
 core/i386/mc-c.c      |  63 ++++++++++++++-----------
 core/i386/mc.asm      | 106 ++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 155 insertions(+), 38 deletions(-)

diff --git a/Makefile b/Makefile
index 25060f5d..7c214124 100644
--- a/Makefile
+++ b/Makefile
@@ -4,8 +4,9 @@
 #  Defines: HAVE_ALTIVEC
 #  CFLAGS: -faltivec
 #
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
 CC=gcc
-CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H $(PFLAGS)
 
 SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
        core/frame.c core/dct.c core/cpu.c core/cabac.c \
@@ -18,7 +19,7 @@ SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
 
 AS= nasm
 # for linux
-ASFLAGS=-f elf
+ASFLAGS=-f elf $(PFLAGS)
 # for cygwin
 #ASFLAGS=-f gnuwin32 -DPREFIX
 
diff --git a/Makefile.cygwin b/Makefile.cygwin
index b99f8dc4..7b698883 100644
--- a/Makefile.cygwin
+++ b/Makefile.cygwin
@@ -1,7 +1,8 @@
 # Makefile: tuned for i386/MMX cygwin system only
 #
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
 CC=gcc
-CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H $(PFLAGS)
 
 SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
        core/frame.c core/dct.c core/cpu.c core/cabac.c \
@@ -14,8 +15,8 @@ SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
 
 AS= nasm
 #for cygwin
-ASFLAGS=-f win32 -DPREFIX
-
+ASFLAGS=-f win32 -DPREFIX $(PFLAGS)
+
 ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
 OBJASM= $(ASMSRC:%.asm=%.o)
 
diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile
index 7ea6c897..c7225126 100644
--- a/build/cygwin/Makefile
+++ b/build/cygwin/Makefile
@@ -4,7 +4,7 @@
 #
 # Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
 #
-# $Id: Makefile,v 1.3 2004/06/14 05:47:51 chenm001 Exp $
+# $Id: Makefile,v 1.4 2004/06/18 02:00:40 chenm001 Exp $
 ##############################################################################
 
 # Current dir
@@ -27,6 +27,11 @@ SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.as
 # Alias
 RM= rm -rf
 
+##############################################################################
+# PFLAGS: feature defines shared by CFLAGS and ASFLAGS (HAVE_SSE2 undefined here)
+##############################################################################
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -UHAVE_SSE2
+
 ##############################################################################
 # CFLAGS
 ##############################################################################
@@ -35,7 +40,8 @@ RM= rm -rf
 # The `mingw-runtime` package is required when building with -mno-cygwin
 CFLAGS += -I$(DIR_SRC)
 CFLAGS += -mno-cygwin
-CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
+CFLAGS += -D__X264__ -D_CYGWIN
+CFLAGS += $(PFLAGS)
 
 # Optional Compiler options
 CFLAGS += -g -Wall -DDEBUG
@@ -55,7 +61,7 @@ LDFLAGS += -L$(DIR_LIB) -lx264
 # ASM
 ##############################################################################
 AS= nasm
-ASFLAGS= -f win32 -DPREFIX
+ASFLAGS= -f win32 -DPREFIX $(PFLAGS)
 ##############################################################################
 # Rules
 ##############################################################################
diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c
index e409b520..bab42647 100644
--- a/core/i386/mc-c.c
+++ b/core/i386/mc-c.c
@@ -2,7 +2,7 @@
  * mc.c: h264 encoder library (Motion Compensation)
  *****************************************************************************
  * Copyright (C) 2003 Laurent Aimar
- * $Id: mc-c.c,v 1.4 2004/06/17 09:01:19 chenm001 Exp $
+ * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *
@@ -26,6 +26,7 @@
 #include <string.h>
 #include <stdint.h>
 
+#include "x264.h"   /* DECLARE_ALIGNED */
 #include "../mc.h"
 #include "../clip1.h"
 #include "mc.h"
@@ -198,12 +199,6 @@ static inline void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
         src2 += i_src2_stride;
     }
 }
-#else
-extern void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
-                          uint8_t *src1, int i_src1_stride,
-                          uint8_t *src2, int i_src2_stride,
-                          int i_height );
-#endif
 
 static inline void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
                                  uint8_t *src1, int i_src1_stride,
@@ -251,6 +246,20 @@ static inline void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
         src2 += i_src2_stride;
     }
 }
+#else
+extern void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
+                          uint8_t *src1, int i_src1_stride,
+                          uint8_t *src2, int i_src2_stride,
+                          int i_height );
+extern void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+                          uint8_t *src1, int i_src1_stride,
+                          uint8_t *src2, int i_src2_stride,
+                          int i_height );
+extern void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+                           uint8_t *src1, int i_src1_stride,
+                           uint8_t *src2, int i_src2_stride,
+                           int i_height );
+#endif
 
 typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
 
@@ -803,34 +812,34 @@ static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int
 /* mc I+H */
 static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);   /* scratch block, stride 16: handed to mc_hh_w16, then averaged with src */
     mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
     pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
 }
 static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);   /* scratch block, stride 16: handed to mc_hh_w16, then averaged with src+1 */
     mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
     pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height );
 }
 /* mc I+V */
 static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);   /* scratch block, stride 16: handed to mc_hv_w16, then averaged with src */
     mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
     pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
 }
 static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);   /* scratch block, stride 16: handed to mc_hv_w16, then averaged with the row below src */
     mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
     pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height );
 }
 /* H+V */
 static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -838,8 +847,8 @@ static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src,   i_src_stride, tmp2, 16, i_height );
@@ -847,8 +856,8 @@ static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hv_w16( src,              i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
@@ -856,8 +865,8 @@ static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hv_w16( src+1,            i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
@@ -865,8 +874,8 @@ static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -874,8 +883,8 @@ static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
     mc_hv_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -883,8 +892,8 @@ static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hc_w16( src,   i_src_stride, tmp1, 16, i_height );
     mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height );
@@ -892,8 +901,8 @@ static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
 }
 static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
 
     mc_hc_w16( src,              i_src_stride, tmp1, 16, i_height );
     mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
diff --git a/core/i386/mc.asm b/core/i386/mc.asm
index 0f18c04d..a932e159 100644
--- a/core/i386/mc.asm
+++ b/core/i386/mc.asm
@@ -2,7 +2,7 @@
 ;* mc.asm: h264 encoder library
 ;*****************************************************************************
 ;* Copyright (C) 2003 x264 project
-;* $Id: mc.asm,v 1.2 2004/06/17 09:01:19 chenm001 Exp $
+;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
 ;*
 ;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
 ;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
@@ -111,6 +111,95 @@ ALIGN 4
     ret
 
                           
+cglobal pixel_avg_w8
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+;                    uint8_t *src1, int i_src1_stride,
+;                    uint8_t *src2, int i_src2_stride,
+;                    int i_height );
+; Writes pavgb(src1, src2) to dst, 8 bytes per row, for i_height rows.
+;-----------------------------------------------------------------------------
+pixel_avg_w8:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi
+    mov         edi, [esp+20]       ; arg 1: dst
+    mov         esi, [esp+24]       ; arg 2: i_dst_stride
+    mov         ebx, [esp+28]       ; arg 3: src1
+    mov         eax, [esp+32]       ; arg 4: i_src1_stride
+    mov         ecx, [esp+36]       ; arg 5: src2
+    mov         edx, [esp+40]       ; arg 6: i_src2_stride
+    mov         ebp, [esp+44]       ; arg 7: i_height (rows left)
+ALIGN 4
+.row_loop:
+    movq        mm0, [ebx]          ; 8 bytes of src1
+    pavgb       mm0, [ecx]          ; (src1 + src2 + 1) >> 1, per byte
+    movq        [edi], mm0
+    lea         ebx, [ebx+eax]      ; advance all three pointers by one row;
+    lea         ecx, [ecx+edx]      ; lea leaves EFLAGS alone
+    lea         edi, [edi+esi]
+    dec         ebp
+    jnz         .row_loop
+
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+cglobal pixel_avg_w16
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+;                     uint8_t *src1, int i_src1_stride,
+;                     uint8_t *src2, int i_src2_stride,
+;                     int i_height );
+; dst = pavgb(src1, src2), 16 bytes per row; no pointer needs 16-byte alignment.
+;-----------------------------------------------------------------------------
+pixel_avg_w16:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi
+    mov         edi, [esp+20]       ; dst
+    mov         ebx, [esp+28]       ; src1
+    mov         ecx, [esp+36]       ; src2
+    mov         esi, [esp+24]       ; i_dst_stride
+    mov         eax, [esp+32]       ; i_src1_stride
+    mov         edx, [esp+40]       ; i_src2_stride
+    mov         ebp, [esp+44]       ; i_height (must be >= 1: tested after each row)
+ALIGN 4
+.height_loop:
+%ifndef HAVE_SSE2
+    movq        mm0, [ebx  ]        ; MMX path: one row as two 8-byte halves
+    movq        mm1, [ebx+8]
+    pavgb       mm0, [ecx  ]
+    pavgb       mm1, [ecx+8]
+    movq        [edi  ], mm0
+    movq        [edi+8], mm1
+%else                               ; SSE2 path is chosen at assembly time
+    movdqu      xmm0, [ebx]
+    movdqu      xmm1, [ecx]         ; pavgb xmm, m128 would fault on unaligned src2
+    pavgb       xmm0, xmm1          ; (src1 + src2 + 1) >> 1, per byte
+    movdqu      [edi], xmm0
+%endif
+    dec         ebp
+    lea         ebx, [ebx+eax]      ; lea keeps the flags set by dec for jne
+    lea         ecx, [ecx+edx]
+    lea         edi, [edi+esi]
+    jne         .height_loop
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
 cglobal mc_copy_w4
 
 ALIGN 16
@@ -201,6 +290,7 @@ mc_copy_w16:
     mov     ecx, [esp+32]       ; i_height
 ALIGN 4
 .height_loop
+%ifndef HAVE_SSE2
     movq    mm0, [esi]
     movq    mm1, [esi+8]
     movq    [edi], mm0
@@ -221,10 +311,20 @@ ALIGN 4
     movq    [edi+edx+8], mm7
     lea     esi, [esi+ebx*2]
     lea     edi, [edi+edx*2]
-    
     sub     ecx, byte 4
     jnz     .height_loop
-
+%else
+    movdqu  xmm0, [esi]
+    movdqu  xmm1, [esi+ebx]
+    movdqu  [edi], xmm0
+    movdqu  [edi+edx], xmm1
+    dec     ecx
+    dec     ecx
+    lea     esi, [esi+ebx*2]
+    lea     edi, [edi+edx*2]
+    jnz     .height_loop
+%endif
+    
     pop     edi
     pop     esi
     pop     ebx
-- 
2.40.0