From 02713c2401b471fb3bfa4541e8b867dfd06628cc Mon Sep 17 00:00:00 2001 From: Min Chen Date: Fri, 18 Jun 2004 02:00:40 +0000 Subject: [PATCH] update & SSE2 support git-svn-id: svn://svn.videolan.org/x264/trunk@10 df754926-b1dd-0310-bc7b-ec298dee348c --- Makefile | 5 +- Makefile.cygwin | 7 +-- build/cygwin/Makefile | 12 +++-- core/i386/mc-c.c | 63 ++++++++++++++----------- core/i386/mc.asm | 106 ++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 155 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 25060f5d..7c214124 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,9 @@ # Defines: HAVE_ALTIVEC # CFLAGS: -faltivec # +PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2 CC=gcc -CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86 +CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H $(PFLAGS) SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \ core/frame.c core/dct.c core/cpu.c core/cabac.c \ @@ -18,7 +19,7 @@ SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \ AS= nasm # for linux -ASFLAGS=-f elf +ASFLAGS=-f elf $(PFLAGS) # for cygwin #ASFLAGS=-f gnuwin32 -DPREFIX diff --git a/Makefile.cygwin b/Makefile.cygwin index b99f8dc4..7b698883 100644 --- a/Makefile.cygwin +++ b/Makefile.cygwin @@ -1,7 +1,8 @@ # Makefile: tuned for i386/MMX cygwin system only # +PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2 CC=gcc -CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86 +CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H $(PFLAGS) SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \ core/frame.c core/dct.c core/cpu.c core/cabac.c \ @@ -14,8 +15,8 @@ SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \ AS= nasm #for cygwin -ASFLAGS=-f win32 -DPREFIX - +ASFLAGS=-f win32 -DPREFIX $(PFLAGS) + ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm OBJASM= $(ASMSRC:%.asm=%.o) diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile index 7ea6c897..c7225126 100644 --- a/build/cygwin/Makefile +++ b/build/cygwin/Makefile @@ -4,7 +4,7 @@ # # Author: x264 by Laurent Aimar # -# $Id: Makefile,v 1.3 2004/06/14 05:47:51 chenm001 Exp $ +# $Id: Makefile,v 1.4 2004/06/18 02:00:40 chenm001 Exp $ ############################################################################## # Current dir @@ -27,6 +27,11 @@ SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.as # Alias RM= rm -rf +############################################################################## +# PFLAGS +############################################################################## +PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -UHAVE_SSE2 + ############################################################################## # CFLAGS ############################################################################## @@ -35,7 +40,8 @@ RM= rm -rf # The `mingw-runtime` package is required when building with -mno-cygwin CFLAGS += -I$(DIR_SRC) CFLAGS += -mno-cygwin -CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN +CFLAGS += -D__X264__ -D_CYGWIN +CFLAGS += $(PFLAGS) # Optional Compiler options CFLAGS += -g -Wall -DDEBUG @@ -55,7 +61,7 @@ LDFLAGS += -L$(DIR_LIB) -lx264 # ASM ############################################################################## AS= nasm -ASFLAGS= -f win32 -DPREFIX +ASFLAGS= -f win32 -DPREFIX $(PFLAGS) ############################################################################## # Rules ############################################################################## diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c index e409b520..bab42647 100644 --- a/core/i386/mc-c.c +++ b/core/i386/mc-c.c @@ -2,7 +2,7 @@ * mc.c: h264 encoder library (Motion Compensation) ***************************************************************************** * Copyright (C) 2003 Laurent Aimar - * $Id: mc-c.c,v 1.4 2004/06/17 09:01:19 chenm001 Exp $ + * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $ * * Authors: Laurent Aimar * @@ -26,6 +26,7 @@ #include #include +#include "x264.h" /* DECLARE_ALIGNED */ #include "../mc.h" #include "../clip1.h" #include "mc.h" @@ -198,12 +199,6 @@ static inline void pixel_avg_w4( uint8_t *dst, int i_dst_stride, src2 += i_src2_stride; } } -#else -extern void pixel_avg_w4( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ); -#endif static inline void pixel_avg_w8( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, @@ -251,6 +246,20 @@ static inline void pixel_avg_w16( uint8_t *dst, int i_dst_stride, src2 += i_src2_stride; } } +#else +extern void pixel_avg_w4( uint8_t *dst, int i_dst_stride, + uint8_t *src1, int i_src1_stride, + uint8_t *src2, int i_src2_stride, + int i_height ); +extern void pixel_avg_w8( uint8_t *dst, int i_dst_stride, + uint8_t *src1, int i_src1_stride, + uint8_t *src2, int i_src2_stride, + int i_height ); +extern void pixel_avg_w16( uint8_t *dst, int i_dst_stride, + uint8_t *src1, int i_src1_stride, + uint8_t *src2, int i_src2_stride, + int i_height ); +#endif typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); @@ -803,34 +812,34 @@ static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int /* mc I+H */ static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp[16*16]; + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); } static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp[16*16]; + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height ); } /* mc I+V */ static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp[16*16]; + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); } static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp[16*16]; + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height ); } /* H+V */ static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); @@ -838,8 +847,8 @@ static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); @@ -847,8 +856,8 @@ static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); @@ -856,8 +865,8 @@ static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); @@ -865,8 +874,8 @@ static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); @@ -874,8 +883,8 @@ static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hv_w16( src, i_src_stride, tmp2, 16, i_height ); @@ -883,8 +892,8 @@ static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height ); @@ -892,8 +901,8 @@ static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst } static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { - uint8_t tmp1[16*16]; - uint8_t tmp2[16*16]; + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); diff --git a/core/i386/mc.asm b/core/i386/mc.asm index 0f18c04d..a932e159 100644 --- a/core/i386/mc.asm +++ b/core/i386/mc.asm @@ -2,7 +2,7 @@ ;* mc.asm: h264 encoder library ;***************************************************************************** ;* Copyright (C) 2003 x264 project -;* $Id: mc.asm,v 1.2 2004/06/17 09:01:19 chenm001 Exp $ +;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $ ;* ;* Authors: Min Chen (converted to nasm) ;* Laurent Aimar (init algorithm) @@ -111,6 +111,95 @@ ALIGN 4 ret +cglobal pixel_avg_w8 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void pixel_avg_w8( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +pixel_avg_w8: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movq mm0, [ebx] + pavgb mm0, [ecx] + movq [edi], mm0 + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + +cglobal pixel_avg_w16 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void pixel_avg_w16( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +pixel_avg_w16: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop +%ifndef HAVE_SSE2 + movq mm0, [ebx ] + movq mm1, [ebx+8] + pavgb mm0, [ecx ] + pavgb mm1, [ecx+8] + movq [edi ], mm0 + movq [edi+8], mm1 +%else + movdqu xmm0, [ebx] + pavgb xmm0, [ecx] + movdqu [edi], xmm0 +%endif + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + cglobal mc_copy_w4 ALIGN 16 @@ -201,6 +290,7 @@ mc_copy_w16: mov ecx, [esp+32] ; i_height ALIGN 4 .height_loop +%ifndef HAVE_SSE2 movq mm0, [esi] movq mm1, [esi+8] movq [edi], mm0 @@ -221,10 +311,20 @@ ALIGN 4 movq [edi+edx+8], mm7 lea esi, [esi+ebx*2] lea edi, [edi+edx*2] - sub ecx, byte 4 jnz .height_loop - +%else + movdqu xmm0, [esi] + movdqu xmm1, [esi+ebx] + movdqu [edi], xmm0 + movdqu [edi+edx], xmm1 + dec ecx + dec ecx + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + jnz .height_loop +%endif + pop edi pop esi pop ebx -- 2.40.0