update & SSE2 support

author Min Chen <chenm001@163.com>

Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)

committer Min Chen <chenm001@163.com>

Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)
author Min Chen <chenm001@163.com>
Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)
committer Min Chen <chenm001@163.com>
Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)
diff --git a/Makefile b/Makefile

index 25060f5d4f404e3d6e4c60f9f451d4d1b8ced4a7..7c2141244f783f0be14014be02c9003a761731c6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -4,8 +4,9 @@
  #  Defines: HAVE_ALTIVEC
  #  CFLAGS: -faltivec
  #
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
  CC=gcc
-CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H $(PFLAGS)
  
  SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
         core/frame.c core/dct.c core/cpu.c core/cabac.c \
@@ -18,7 +19,7 @@ SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
  
  AS= nasm
  # for linux
-ASFLAGS=-f elf
+ASFLAGS=-f elf $(PFLAGS)
  # for cygwin
  #ASFLAGS=-f gnuwin32 -DPREFIX
  
diff --git a/Makefile.cygwin b/Makefile.cygwin

index b99f8dc4323d2504b6d781cba7a1ecc4a2f66f4e..7b698883aab905ba015ba4b3461e4cab1f0e6527 100644 (file)
--- a/Makefile.cygwin
+++ b/Makefile.cygwin
@@ -1,7 +1,8 @@
  # Makefile: tuned for i386/MMX cygwin system only
  #
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
  CC=gcc
-CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H $(PFLAGS)
  
  SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
         core/frame.c core/dct.c core/cpu.c core/cabac.c \
@@ -14,8 +15,8 @@ SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
  
  AS= nasm
  #for cygwin
-ASFLAGS=-f win32 -DPREFIX
-
+ASFLAGS=-f win32 -DPREFIX $(PFLAGS)
+          
  ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
  OBJASM= $(ASMSRC:%.asm=%.o)
  
diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile

index 7ea6c897f4f7507fa7060698cb057732687f8164..c7225126ccf166f5ce654707bb2964e79ce62a4d 100644 (file)
--- a/build/cygwin/Makefile
+++ b/build/cygwin/Makefile
@@ -4,7 +4,7 @@
  #
  # Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
  #
-# $Id: Makefile,v 1.3 2004/06/14 05:47:51 chenm001 Exp $
+# $Id: Makefile,v 1.4 2004/06/18 02:00:40 chenm001 Exp $
  ##############################################################################
  
  # Current dir
@@ -27,6 +27,11 @@ SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.as
  # Alias
  RM= rm -rf
  
+##############################################################################
+# PFLAGS
+##############################################################################
+PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -UHAVE_SSE2
+
  ##############################################################################
  # CFLAGS
  ##############################################################################
@@ -35,7 +40,8 @@ RM= rm -rf
  # The `mingw-runtime` package is required when building with -mno-cygwin
  CFLAGS += -I$(DIR_SRC)
  CFLAGS += -mno-cygwin
-CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
+CFLAGS += -D__X264__ -D_CYGWIN
+CFLAGS += $(PFLAGS)
  
  # Optional Compiler options
  CFLAGS += -g -Wall -DDEBUG
@@ -55,7 +61,7 @@ LDFLAGS += -L$(DIR_LIB) -lx264
  # ASM
  ##############################################################################
  AS= nasm
-ASFLAGS= -f win32 -DPREFIX
+ASFLAGS= -f win32 -DPREFIX $(PFLAGS)
  ##############################################################################
  # Rules
  ##############################################################################
diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c

index e409b520d2373a46c7fcd9271d70d4b4d70aba98..bab42647a612e9f1c7452f15121cd27ae3c15393 100644 (file)
--- a/core/i386/mc-c.c
+++ b/core/i386/mc-c.c
@@ -2,7 +2,7 @@
   * mc.c: h264 encoder library (Motion Compensation)
   *****************************************************************************
   * Copyright (C) 2003 Laurent Aimar
- * $Id: mc-c.c,v 1.4 2004/06/17 09:01:19 chenm001 Exp $
+ * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
   *
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *
@@ -26,6 +26,7 @@
  #include <string.h>
  #include <stdint.h>
  
+#include "x264.h"   /* DECLARE_ALIGNED */
  #include "../mc.h"
  #include "../clip1.h"
  #include "mc.h"
@@ -198,12 +199,6 @@ static inline void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
          src2 += i_src2_stride;
      }
  }
-#else
-extern void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
-                          uint8_t *src1, int i_src1_stride,
-                          uint8_t *src2, int i_src2_stride,
-                          int i_height );
-#endif
  
  static inline void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
                                   uint8_t *src1, int i_src1_stride,
@@ -251,6 +246,20 @@ static inline void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
          src2 += i_src2_stride;
      }
  }
+#else
+extern void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
+                          uint8_t *src1, int i_src1_stride,
+                          uint8_t *src2, int i_src2_stride,
+                          int i_height );
+extern void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+                          uint8_t *src1, int i_src1_stride,
+                          uint8_t *src2, int i_src2_stride,
+                          int i_height );
+extern void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+                           uint8_t *src1, int i_src1_stride,
+                           uint8_t *src2, int i_src2_stride,
+                           int i_height );
+#endif
  
  typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
  
@@ -803,34 +812,34 @@ static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int
  /* mc I+H */
  static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);  /* 16-byte aligned scratch: it is the src2 that pixel_avg_w16's SSE2 build reads 16 bytes at a time */
      mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
      pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
  }
  static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);  /* 16-byte aligned scratch: it is the src2 that pixel_avg_w16's SSE2 build reads 16 bytes at a time */
      mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
      pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height );
  }
  /* mc I+V */
  static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);  /* 16-byte aligned scratch: it is the src2 that pixel_avg_w16's SSE2 build reads 16 bytes at a time */
      mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
      pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
  }
  static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);  /* 16-byte aligned scratch: it is the src2 that pixel_avg_w16's SSE2 build reads 16 bytes at a time */
      mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
      pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height );
  }
  /* H+V */
  static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -838,8 +847,8 @@ static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src,   i_src_stride, tmp2, 16, i_height );
@@ -847,8 +856,8 @@ static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hv_w16( src,              i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
@@ -856,8 +865,8 @@ static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hv_w16( src+1,            i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
@@ -865,8 +874,8 @@ static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -874,8 +883,8 @@ static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
      mc_hv_w16( src, i_src_stride, tmp2, 16, i_height );
@@ -883,8 +892,8 @@ static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hc_w16( src,   i_src_stride, tmp1, 16, i_height );
      mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height );
@@ -892,8 +901,8 @@ static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
  }
  static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
  {
-    uint8_t tmp1[16*16];
-    uint8_t tmp2[16*16];
+    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
+    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
  
      mc_hc_w16( src,              i_src_stride, tmp1, 16, i_height );
      mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
diff --git a/core/i386/mc.asm b/core/i386/mc.asm

index 0f18c04d24439a9f8b2a285f976524b6b71df567..a932e159cbaa0fbb3bd98eb856db64548bdb3996 100644 (file)
--- a/core/i386/mc.asm
+++ b/core/i386/mc.asm
@@ -2,7 +2,7 @@
  ;* mc.asm: h264 encoder library
  ;*****************************************************************************
  ;* Copyright (C) 2003 x264 project
-;* $Id: mc.asm,v 1.2 2004/06/17 09:01:19 chenm001 Exp $
+;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
  ;*
  ;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
  ;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
@@ -111,6 +111,95 @@ ALIGN 4
      ret
  
                            
+cglobal pixel_avg_w8
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+;                    uint8_t *src1, int i_src1_stride,
+;                    uint8_t *src2, int i_src2_stride,
+;                    int i_height );
+; Per row: dst[0..7] = pavgb( src1[0..7], src2[0..7] ); arguments come from the stack.
+;-----------------------------------------------------------------------------
+pixel_avg_w8:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi                 ; 4 pushes: first argument is now at esp+20
+    mov         edi, [esp+20]       ; edi = dst
+    mov         esi, [esp+24]       ; esi = i_dst_stride
+    mov         ebx, [esp+28]       ; ebx = src1
+    mov         eax, [esp+32]       ; eax = i_src1_stride
+    mov         ecx, [esp+36]       ; ecx = src2
+    mov         edx, [esp+40]       ; edx = i_src2_stride
+    mov         ebp, [esp+44]       ; ebp = i_height (rows left)
+ALIGN 4
+.next_row:
+    movq        mm0, [ebx]          ; 8 pixels of src1
+    pavgb       mm0, [ecx]          ; rounded byte average with src2
+    movq        [edi], mm0          ; store 8 averaged pixels
+    lea         ebx, [ebx+eax]      ; src1 += i_src1_stride
+    lea         ecx, [ecx+edx]      ; src2 += i_src2_stride
+    lea         edi, [edi+esi]      ; dst  += i_dst_stride
+    dec         ebp                 ; ZF set when no rows remain
+    jnz         .next_row
+
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+cglobal pixel_avg_w16
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+;                     uint8_t *src1, int i_src1_stride,
+;                     uint8_t *src2, int i_src2_stride,
+;                     int i_height );
+; Per row: dst[0..15] = pavgb( src1[0..15], src2[0..15] ); any alignment is ok.
+;-----------------------------------------------------------------------------
+pixel_avg_w16:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi                 ; 4 pushes: first argument is now at esp+20
+    mov         edi, [esp+20]       ; dst
+    mov         ebx, [esp+28]       ; src1
+    mov         ecx, [esp+36]       ; src2
+    mov         esi, [esp+24]       ; i_dst_stride
+    mov         eax, [esp+32]       ; i_src1_stride
+    mov         edx, [esp+40]       ; i_src2_stride
+    mov         ebp, [esp+44]       ; i_height
+ALIGN 4
+.height_loop:
+%ifndef HAVE_SSE2
+    movq        mm0, [ebx  ]        ; non-SSE2 build: two 8-pixel halves per row
+    movq        mm1, [ebx+8]
+    pavgb       mm0, [ecx  ]
+    pavgb       mm1, [ecx+8]
+    movq        [edi  ], mm0
+    movq        [edi+8], mm1
+%else
+    movdqu      xmm0, [ebx]         ; src1 row, unaligned load
+    movdqu      xmm1, [ecx]         ; src2 row: a 128-bit pavgb memory operand
+    pavgb       xmm0, xmm1          ;   faults (#GP) unless 16-byte aligned
+    movdqu      [edi], xmm0         ; dst row, unaligned store
+%endif
+    dec         ebp                 ; sets ZF; the lea's below leave flags alone
+    lea         ebx, [ebx+eax]      ; src1 += i_src1_stride
+    lea         ecx, [ecx+edx]      ; src2 += i_src2_stride
+    lea         edi, [edi+esi]      ; dst  += i_dst_stride
+    jne         .height_loop        ; do-while: i_height == 0 is not handled
+
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
  cglobal mc_copy_w4
  
  ALIGN 16
@@ -201,6 +290,7 @@ mc_copy_w16:
      mov     ecx, [esp+32]       ; i_height
  ALIGN 4
  .height_loop
+%ifndef HAVE_SSE2
      movq    mm0, [esi]
      movq    mm1, [esi+8]
      movq    [edi], mm0
@@ -221,10 +311,20 @@ ALIGN 4
      movq    [edi+edx+8], mm7
      lea     esi, [esi+ebx*2]
      lea     edi, [edi+edx*2]
-    
      sub     ecx, byte 4
      jnz     .height_loop
-
+%else
+    movdqu  xmm0, [esi]
+    movdqu  xmm1, [esi+ebx]
+    movdqu  [edi], xmm0
+    movdqu  [edi+edx], xmm1
+    dec     ecx
+    dec     ecx
+    lea     esi, [esi+ebx*2]
+    lea     edi, [edi+edx*2]
+    jnz     .height_loop
+%endif
+    
      pop     edi
      pop     esi
      pop     ebx
author	Min Chen <chenm001@163.com>
	Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)
committer	Min Chen <chenm001@163.com>
	Fri, 18 Jun 2004 02:00:40 +0000 (02:00 +0000)
Makefile		patch \| blob \| history
Makefile.cygwin		patch \| blob \| history
build/cygwin/Makefile		patch \| blob \| history
core/i386/mc-c.c		patch \| blob \| history
core/i386/mc.asm		patch \| blob \| history