From: Loren Merritt Date: Sun, 27 Mar 2005 20:49:59 +0000 (+0000) Subject: convert mc's inline asm to nasm (slight speedup and msvc compatibility). X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0df24cf995faf3169fe15d808e4fff00c18ad7dc;p=libx264 convert mc's inline asm to nasm (slight speedup and msvc compatibility). patch by Mathieu Monnier. git-svn-id: svn://svn.videolan.org/x264/trunk@180 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/Makefile b/Makefile index a3305c0a..548380f4 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ ifeq ($(ARCH),X86) CFLAGS+=-DHAVE_MMXEXT -DHAVE_SSE2 SRCS+= common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c ASMSRC= common/i386/dct-a.asm common/i386/cpu-a.asm \ - common/i386/pixel-a.asm common/i386/mc-a.asm + common/i386/pixel-a.asm common/i386/mc-a.asm \ + common/i386/mc-a2.asm common/i386/predict-a.asm OBJASM= $(ASMSRC:%.asm=%.o) endif diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile index 7554f4d5..89357ff1 100644 --- a/build/cygwin/Makefile +++ b/build/cygwin/Makefile @@ -22,7 +22,8 @@ SRC_C= common/mc.c common/predict.c common/pixel.c common/macroblock.c \ encoder/encoder.c encoder/eval.c \ common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c -SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm common/i386/mc-a.asm +SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm \ + common/i386/mc-a.asm common/i386/mc-a2.asm common/i386/predict-a.asm # Alias RM= rm -rf diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm new file mode 100644 index 00000000..aaab2c1b --- /dev/null +++ b/common/i386/mc-a2.asm @@ -0,0 +1,402 @@ +;***************************************************************************** +;* mc-a2.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or 
modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1               ; export %1; when PREFIX is defined the exported name is _%1 and %1 becomes an alias for it
+    %ifdef PREFIX
+        global _%1
+        %define %1 _%1
+    %else
+        global %1
+    %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+ALIGN 16
+mmx_dw_one:                    ; four words of 16 (not 1): rounding term added before the >>5 shifts
+    times 4 dw 16
+mmx_dd_one:                    ; two dwords of 512 (not 1): rounding term added before the >>10 shifts
+    times 2 dd 512
+mmx_dw_20:                     ; four words of 20: weight of the two centre taps
+    times 4 dw 20
+mmx_dw_5:                      ; four words of -5 (negative): weight of the second and fifth taps
+    times 4 dw -5
+
+SECTION .data                  ; NOTE(review): x264_center_filter_mmxext keeps its per-call state in the globals below, so it does not look reentrant
+
+width:                         ; copy of i_width
+    dd 0
+height:                        ; copy of i_height
+    dd 0
+dstp1:                         ; copy of i_dst1_stride
+    dd 0
+dstp2:                         ; copy of i_dst2_stride
+    dd 0
+buffer:                        ; address of the scratch row reserved on the stack
+    dd 0
+dst1:                          ; current dst1 row pointer
+    dd 0
+dst2:                          ; current dst2 row pointer
+    dd 0
+src:                           ; current source row pointer (already moved up by two rows)
+    dd 0
+
+
+;=============================================================================
+; Macros
+;=============================================================================
+
+%macro LOAD_4 9                ; %1..%4 <- 4 bytes from each of %5..%8, interleaved with %9 (callers pass a zeroed register, giving 4 words)
+    movd        %1, %5
+    movd        %2, %6
+    movd        %3, %7
+    movd        %4, %8
+    punpcklbw   %1, %9
+    punpcklbw   %2, %9
+    punpcklbw   %3, %9
+    punpcklbw   %4, %9
+%endmacro
+
+%macro FILT_2 2                ; %1 -= 5 * %2 (per word); leaves %2 multiplied by 4
+    psubw       %1, %2
+    psllw       %2, 2
+    psubw       %1, %2
+%endmacro
+
+%macro FILT_4 3                ; %1 += 20 * (%2 + %3) (per word); leaves %2 = 16 * (%2 + %3)
+    paddw       %2, %3
+    psllw       %2, 2
+    paddw       %1, %2
+    psllw       %2, 2
+    paddw       %1, %2
+%endmacro
+
+%macro FILT_6 4                ; %1 = (%1 - 5 * %2 + %3 + %4) >> 5 (arithmetic shift, per word); leaves %2 multiplied by 4
+    psubw       %1, %2
+    psllw       %2, 2
+    psubw       %1, %2
+    paddw       %1, %3
+    paddw       %1, %4
+    psraw       %1, 5
+%endmacro
+
+%macro FILT_ALL 1              ; mm1 <- a - 5b + 20c + 20d - 5e + f over the rows at %1, %1+ecx, %1+2*ecx, %1+ebx, %1+4*ecx, %1+edx (4 columns, unrounded); expects mm0 = 0; overwrites mm2-mm6
+    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
+    FILT_2      mm1, mm2
+    movd        mm5, [%1 + 4 * ecx]
+    movd        mm6, [%1 + edx]
+    FILT_4      mm1, mm3, mm4
+    punpcklbw   mm5, mm0
+    punpcklbw   mm6, mm0
+    psubw       mm1, mm5
+    psllw       mm5, 2
+    psubw       mm1, mm5
+    paddw       mm1, mm6
+%endmacro
+
+
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_vertical_filter_mmxext    ; NOTE(review): exported here, but no label with this name is visible in this part of the patch
+cglobal x264_horizontal_filter_mmxext
+cglobal x264_center_filter_mmxext
+
+;-----------------------------------------------------------------------------
+; Per row: 6-tap (1,-5,20,20,-5,1) vertical filter of src -> dst1 as (sum + 16) >> 5; the unrounded sums are then filtered horizontally with the same taps -> dst2 as (sum + 512) >> 10; both saturated to bytes.
+;   void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
+;                                    uint8_t *dst2, int i_dst2_stride,
+;                                    uint8_t *src, int i_src_stride,
+;                                    int i_width, int i_height );
+; NOTE(review): the loop counters only reach zero when i_width is a multiple of 4 and at least 12 and i_height is at least 1; no emms is executed before ret.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_center_filter_mmxext :
+
+    push        edi
+    push        esi
+    push        ebx
+    push        ebp
+
+    mov         esi, [esp + 36]         ; src
+
+    mov         edx, [esp + 20]         ; dst1
+    mov         [dst1], edx
+
+    mov         edi, [esp + 28]         ; dst2
+    mov         [dst2], edi
+
+    mov         eax, [esp + 44]         ; width
+    mov         [width], eax
+
+    mov         eax, [esp + 48]         ; height
+    mov         [height], eax
+
+    mov         eax, [esp + 24]         ; dst1_stride
+    mov         [dstp1], eax
+
+    mov         eax, [esp + 32]         ; dst2_stride
+    mov         [dstp2], eax
+
+    mov         ecx, [esp + 40]         ; src_stride
+
+    sub         esp, ecx                ; reserve 2 * src_stride bytes of stack for one row of 16-bit sums
+    sub         esp, ecx                ; esp is now at the beginning of the buffer
+    mov         [buffer], esp           ; NOTE(review): esp is reused below as the buffer pointer, so it does not mark the stack top inside the loops
+
+    ;sub         esi, 2
+    sub         esi, ecx
+    sub         esi, ecx                ; esi = src - 2 * stride (the additional -2 above is commented out)
+    mov         [src], esi
+
+    ;sub         edi, 2
+
+    mov         ebx, ecx
+    shl         ebx, 1
+    add         ebx, ecx                ; 3 * src_stride
+
+    mov         edx, ecx
+    shl         edx, 1
+    add         edx, ebx                ; 5 * src_stride
+
+    pxor        mm0, mm0                ; 0 ---> mm0
+    movq        mm7, [mmx_dd_one]       ; for rounding -- NOTE(review): mm7 is cleared in loopcx2 before anything reads it, so this load looks unused
+
+    mov         ebp, [height]
+
+loopcy:                                 ; one iteration per output row
+
+    dec         ebp
+    mov         eax, [width]
+    mov         edi, [dst1]
+    mov         esp, [buffer]
+    mov         esi, [src]
+
+    FILT_ALL    esi                     ; first 4 columns of the row
+
+    pshufw      mm2, mm1, 0             ; replicate the first sum into 4 words of left padding
+    movq        [esp], mm2
+    add         esp, 8
+    movq        [esp], mm1
+    add         esp, 8
+    paddw       mm1, [mmx_dw_one]
+    psraw       mm1, 5
+
+    packuswb    mm1, mm1
+    movd        [edi], mm1
+
+    sub         eax, 8                  ; the first and last 4 columns are handled outside loopcx1
+    add         edi, 4
+    add         esi, 4
+
+loopcx1:                                ; middle columns, 4 per iteration
+
+    sub         eax, 4
+
+    FILT_ALL    esi
+
+    movq        [esp], mm1              ; keep the unrounded sums for the horizontal pass
+    paddw       mm1, [mmx_dw_one]
+    psraw       mm1, 5
+    packuswb    mm1, mm1
+    movd        [edi], mm1
+
+    add         esp, 8
+    add         esi, 4
+    add         edi, 4
+    test        eax, eax
+    jnz         loopcx1
+
+    FILT_ALL    esi                     ; last 4 columns of the row
+
+    pshufw      mm2, mm1, 7             ; right padding; NOTE(review): selector 7 picks words 3,1,0,0 rather than replicating word 3 -- confirm this is intended
+    movq        [esp], mm1
+    add         esp, 8
+    movq        [esp], mm2
+    paddw       mm1, [mmx_dw_one]
+    psraw       mm1, 5
+    packuswb    mm1, mm1
+    movd        [edi], mm1
+
+    mov         esi, [src]              ; step src and dst1 to the next row
+    add         esi, ecx
+    mov         [src], esi
+
+    mov         edi, [dst1]
+    add         edi, [dstp1]
+    mov         [dst1], edi
+
+    mov         eax, [width]
+    mov         edi, [dst2]
+    mov         esp, [buffer]
+    add         esp, 4                  ; the sum for column 0 sits 8 bytes into the buffer, so column x reads the sums of columns x-2 .. x+3
+
+loopcx2:                                ; horizontal pass over the buffered sums, 4 columns per iteration, right to left
+
+    sub         eax, 4
+
+    movq        mm2, [esp + 2 * eax + 2]
+    movq        mm3, [esp + 2 * eax + 4]
+    movq        mm4, [esp + 2 * eax + 6]
+    movq        mm5, [esp + 2 * eax + 8]
+    movq        mm1, [esp + 2 * eax]
+    movq        mm6, [esp + 2 * eax + 10]
+    paddw       mm2, mm5                ; pair of taps weighted -5
+    paddw       mm3, mm4                ; pair of taps weighted 20
+    paddw       mm1, mm6                ; pair of taps weighted 1
+
+    movq        mm5, [mmx_dw_20]
+    movq        mm4, [mmx_dw_5]
+    movq        mm6, mm1
+    pxor        mm7, mm7
+
+    punpckhwd   mm5, mm2                ; interleave weights with sums so pmaddwd below yields the -5 and 20 terms as dwords
+    punpcklwd   mm4, mm3
+    punpcklwd   mm2, [mmx_dw_20]
+    punpckhwd   mm3, [mmx_dw_5]
+
+    pcmpgtw     mm7, mm1                ; sign mask used to widen mm1/mm6 to dwords
+
+    pmaddwd     mm2, mm4
+    pmaddwd     mm3, mm5
+
+    punpcklwd   mm1, mm7
+    punpckhwd   mm6, mm7
+
+    paddd       mm2, mm1
+    paddd       mm3, mm6
+
+    paddd       mm2, [mmx_dd_one]
+    paddd       mm3, [mmx_dd_one]
+
+    psrad       mm2, 10
+    psrad       mm3, 10
+
+    packssdw    mm2, mm3
+    packuswb    mm2, mm0
+
+    movd        [edi + eax], mm2
+
+    test        eax, eax
+    jnz         loopcx2
+
+    add         edi, [dstp2]
+    mov         [dst2], edi
+
+    test        ebp, ebp
+    jnz         loopcy
+
+    mov         esp, [buffer]           ; release the scratch row: esp = buffer + 2 * src_stride
+    shl         ecx, 1
+    add         esp, ecx
+
+    pop         ebp
+    pop         ebx
+    pop         esi
+    pop         edi
+
+    ret
+
+;-----------------------------------------------------------------------------
+;
+;   void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
+;                                       uint8_t *src, int i_src_stride,
+;                                       int i_width, int i_height );
+; Per row: dst[x] = sat8( (src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5 ), 8 pixels per inner iteration with x running from i_width-8 down to 0. NOTE(review): the counters only reach zero when i_width is a non-zero multiple of 8 and i_height is at least 1; no emms is executed before ret.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_horizontal_filter_mmxext :
+    push        edi
+    push        esi
+
+    mov         edi, [esp + 12]         ; dst
+    mov         esi, [esp + 20]         ; src
+
+    pxor        mm0, mm0                ; zero register used to widen bytes to words
+    movq        mm7, [mmx_dw_one]       ; rounding term (16) consumed by FILT_6
+
+    mov         ecx, [esp + 32]         ; height
+
+    sub         esi, 2                  ; the leftmost tap is two pixels left of the output position
+
+loophy:                                 ; one iteration per row
+
+    dec         ecx
+    mov         eax, [esp + 28]         ; width
+
+loophx:                                 ; 8 output pixels per iteration, right to left
+
+    sub         eax, 8
+
+    LOAD_4      mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
+    FILT_2      mm1, mm2
+    movd        mm5, [esi + eax + 4]
+    movd        mm6, [esi + eax + 5]
+    FILT_4      mm1, mm3, mm4
+    movd        mm2, [esi + eax + 4]
+    movd        mm3, [esi + eax + 6]
+    punpcklbw   mm5, mm0
+    punpcklbw   mm6, mm0
+    FILT_6      mm1, mm5, mm6, mm7      ; mm1 = first 4 filtered pixels (words)
+    movd        mm4, [esi + eax + 7]
+    movd        mm5, [esi + eax + 8]
+    punpcklbw   mm2, mm0
+    punpcklbw   mm3, mm0                ; mm2(1), mm3(20), mm6(-5) ready
+    FILT_2      mm2, mm6
+    movd        mm6, [esi + eax + 9]
+    punpcklbw   mm4, mm0
+    punpcklbw   mm5, mm0                ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
+    FILT_4      mm2, mm3, mm4
+    punpcklbw   mm6, mm0
+    FILT_6      mm2, mm5, mm6, mm7      ; mm2 = next 4 filtered pixels (words)
+
+    packuswb    mm1, mm2                ; saturate both halves to 8 bytes
+    movq        [edi + eax], mm1
+
+    test        eax, eax
+    jnz         loophx
+
+    add         esi, [esp + 24]         ; src_pitch
+    add         edi, [esp + 16]         ; dst_pitch
+
+    test        ecx, ecx
+    jnz         loophy
+
+    pop         esi
+    pop         edi
+
+    ret
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c
index 63fb7a7b..b5b3c3cf 100644
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -36,6 +36,8 @@
 #include "common/clip1.h"
 #include "mc.h"
 
+#if 0
+
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
 #define USED_UINT64(foo) \
     static const uint64_t foo __asm__ (#foo) __attribute__((used))
@@ -1021,6 +1023,7 @@ static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
     MOTION_COMPENSATION_LUMA
 }
 
+#endif
 
 void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
               uint8_t *dst, int i_dst_stride,
@@ -1141,6 +1144,7 @@ void x264_mc_sse2_init( x264_mc_functions_t *pf )
     pf->get_ref = get_ref_mmx;
 }
 
+#if 0
 void
get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) { *int_h = mc_hh_w16; @@ -1154,3 +1158,4 @@ void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) *int_v = mc_hv_w16; *int_hv = mc_hc_w16; } +#endif diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm new file mode 100644 index 00000000..3237ebb6 --- /dev/null +++ b/common/i386/predict-a.asm @@ -0,0 +1,141 @@ +;***************************************************************************** +;* predict-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1               ; export %1; when PREFIX is defined the exported name is _%1 and %1 becomes an alias for it
+    %ifdef PREFIX
+        global _%1
+        %define %1 _%1
+    %else
+        global %1
+    %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+SECTION .data
+
+;=============================================================================
+; Macros
+;=============================================================================
+
+%macro SAVE_0_1 1              ; store mm0 at [%1] and mm1 at [%1 + 8], i.e. one 16-byte row
+    movq        [%1] , mm0
+    movq        [%1 + 8] , mm1
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal predict_8x8_v_mmx
+cglobal predict_16x16_v_mmx
+
+;-----------------------------------------------------------------------------
+; Vertical prediction: copies the 8 bytes of the row above src (src - i_stride) into rows 0..7.
+;   void predict_8x8_v_mmx( uint8_t *src, int i_stride )
+; Touches only edx, ecx and mm0; no emms is executed before ret.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_8x8_v_mmx :
+
+    ;push       edi
+    ;push       esi
+
+    mov         edx , [esp + 4]
+    mov         ecx , [esp + 8]
+    sub         edx , ecx               ; edx <-- line -1
+
+    movq        mm0 , [edx]             ; the 8 bytes to replicate
+    movq        [edx + ecx] , mm0       ; 0
+    movq        [edx + 2 * ecx] , mm0   ; 1
+    movq        [edx + 4 * ecx] , mm0   ; 3
+    movq        [edx + 8 * ecx] , mm0   ; 7
+    add         edx , ecx               ; edx <-- line 0
+    movq        [edx + 2 * ecx] , mm0   ; 2
+    movq        [edx + 4 * ecx] , mm0   ; 4
+    lea         edx , [edx + 4 * ecx]   ; edx <-- line 4
+    movq        [edx + ecx] , mm0       ; 5
+    movq        [edx + 2 * ecx] , mm0   ; 6
+
+    ;pop        esi
+    ;pop        edi
+
+    ret
+
+;-----------------------------------------------------------------------------
+;
+;   void predict_16x16_v_mmx( uint8_t *src,
int i_stride )
+; Vertical prediction: copies the 16 bytes of the row above src (src - i_stride) into rows 0..15. Touches only eax, ecx, edx, mm0 and mm1; no emms is executed before ret.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_16x16_v_mmx :
+
+    ;push       edi
+    ;push       esi
+
+    mov         edx, [esp + 4]
+    mov         ecx, [esp + 8]
+    sub         edx, ecx                ; edx <-- line -1
+
+    movq        mm0, [edx]              ; left 8 bytes of the row to replicate
+    movq        mm1, [edx + 8]          ; right 8 bytes of the row to replicate
+    mov         eax, ecx
+    shl         eax, 1
+    add         eax, ecx                ; eax <-- 3* stride
+
+    SAVE_0_1    (edx + ecx)             ; 0
+    SAVE_0_1    (edx + 2 * ecx)         ; 1
+    SAVE_0_1    (edx + eax)             ; 2
+    SAVE_0_1    (edx + 4 * ecx)         ; 3
+    SAVE_0_1    (edx + 2 * eax)         ; 5
+    SAVE_0_1    (edx + 8 * ecx)         ; 7
+    SAVE_0_1    (edx + 4 * eax)         ; 11
+    add         edx, ecx                ; edx <-- line 0
+    SAVE_0_1    (edx + 4 * ecx)         ; 4
+    SAVE_0_1    (edx + 2 * eax)         ; 6
+    SAVE_0_1    (edx + 8 * ecx)         ; 8
+    SAVE_0_1    (edx + 4 * eax)         ; 12
+    lea         edx, [edx + 8 * ecx]    ; edx <-- line 8
+    SAVE_0_1    (edx + ecx)             ; 9
+    SAVE_0_1    (edx + 2 * ecx)         ; 10
+    lea         edx, [edx + 4 * ecx]    ; edx <-- line 12
+    SAVE_0_1    (edx + ecx)             ; 13
+    SAVE_0_1    (edx + 2 * ecx)         ; 14
+    SAVE_0_1    (edx + eax)             ; 15
+
+
+    ;pop        esi
+    ;pop        edi
+
+    ret
diff --git a/common/i386/predict.c b/common/i386/predict.c
index b0a0b7b7..5422f15c 100644
--- a/common/i386/predict.c
+++ b/common/i386/predict.c
@@ -152,6 +152,10 @@ static void predict_16x16_h( uint8_t *src, int i_stride )
     }
 }
 
+
+extern void predict_16x16_v_mmx( uint8_t *src, int i_stride ); /* implemented in predict-a.asm; explicit void instead of implicit int */
+
+#if 0
 static void predict_16x16_v( uint8_t *src, int i_stride )
 {
     int i;
@@ -168,6 +172,7 @@ static void predict_16x16_v( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+#endif
 
 /****************************************************************************
  * 8x8 prediction for intra chroma block DC, H, V, P
@@ -301,6 +306,10 @@ static void predict_8x8_h( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+
+extern void predict_8x8_v_mmx( uint8_t *src, int i_stride ); /* implemented in predict-a.asm */
+
+#if 0
 static void predict_8x8_v( uint8_t *src, int i_stride )
 {
     int i;
@@ -313,6 +322,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
+#endif
 
 
/**************************************************************************** @@ -404,7 +414,7 @@ static void predict_4x4_v( uint8_t *src, int i_stride ) ****************************************************************************/ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) { - pf[I_PRED_16x16_V ] = predict_16x16_v; + pf[I_PRED_16x16_V ] = predict_16x16_v_mmx; pf[I_PRED_16x16_H ] = predict_16x16_h; pf[I_PRED_16x16_DC] = predict_16x16_dc; pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left; @@ -414,7 +424,7 @@ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] ) { - pf[I_PRED_CHROMA_V ] = predict_8x8_v; + pf[I_PRED_CHROMA_V ] = predict_8x8_v_mmx; pf[I_PRED_CHROMA_H ] = predict_8x8_h; pf[I_PRED_CHROMA_DC] = predict_8x8_dc; pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left; diff --git a/common/mc.c b/common/mc.c index dbe05a39..c179aad2 100644 --- a/common/mc.c +++ b/common/mc.c @@ -37,14 +37,11 @@ #include "clip1.h" #include "frame.h" -#ifdef _MSC_VER -#undef HAVE_MMXEXT /* not finished now */ -#endif #ifdef HAVE_MMXEXT -# include "i386/mc.h" +#include "i386/mc.h" #endif #ifdef ARCH_PPC -# include "ppc/mc.h" +#include "ppc/mc.h" #endif @@ -425,6 +422,14 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) void get_funcs_mmx(pf_mc_t*, pf_mc_t*, pf_mc_t*); void get_funcs_sse2(pf_mc_t*, pf_mc_t*, pf_mc_t*); +extern void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride, + uint8_t *src, int i_src_stride, + int i_width, int i_height ); +extern void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride, + uint8_t *dst2, int i_dst2_stride, + uint8_t *src, int i_src_stride, + int i_width, int i_height ); + void x264_frame_filter( int cpu, x264_frame_t *frame ) { const int x_inc = 16, y_inc = 16; @@ -435,6 +440,7 @@ void x264_frame_filter( int cpu, x264_frame_t *frame ) pf_mc_t int_v = mc_hv; pf_mc_t int_hv = mc_hc; +#if 0 #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMXEXT ) 
get_funcs_mmx(&int_h, &int_v, &int_hv); @@ -444,24 +450,41 @@ void x264_frame_filter( int cpu, x264_frame_t *frame ) if( cpu&X264_CPU_SSE2 ) get_funcs_sse2(&int_h, &int_v, &int_hv); #endif +#endif - for( y = -8; y < frame->i_lines[0]+8; y += y_inc ) { - - uint8_t *p_in = frame->plane[0] + y * stride - 8; - uint8_t *p_h = frame->filtered[1] + y * stride - 8; - uint8_t *p_v = frame->filtered[2] + y * stride - 8; - uint8_t *p_hv = frame->filtered[3] + y * stride - 8; - - for( x = -8; x < stride - 64 + 8; x += x_inc ) +#ifdef HAVE_MMXEXT + if ( cpu & X264_CPU_MMXEXT ) + { + x264_horizontal_filter_mmxext(frame->filtered[1] - 8 * stride - 8, stride, + frame->plane[0] - 8 * stride - 8, stride, + stride - 48, frame->i_lines[0] + 16); + x264_center_filter_mmxext(frame->filtered[2] - 8 * stride - 8, stride, + frame->filtered[3] - 8 * stride - 8, stride, + frame->plane[0] - 8 * stride - 8, stride, + stride - 48, frame->i_lines[0] + 16); + } + else + { +#else + { +#endif + for( y = -8; y < frame->i_lines[0]+8; y += y_inc ) { - int_h( p_in, stride, p_h, stride, x_inc, y_inc ); - int_v( p_in, stride, p_v, stride, x_inc, y_inc ); - int_hv( p_in, stride, p_hv, stride, x_inc, y_inc ); - - p_h += x_inc; - p_v += x_inc; - p_hv += x_inc; - p_in += x_inc; + uint8_t *p_in = frame->plane[0] + y * stride - 8; + uint8_t *p_h = frame->filtered[1] + y * stride - 8; + uint8_t *p_v = frame->filtered[2] + y * stride - 8; + uint8_t *p_hv = frame->filtered[3] + y * stride - 8; + for( x = -8; x < stride - 64 + 8; x += x_inc ) + { + int_h( p_in, stride, p_h, stride, x_inc, y_inc ); + int_v( p_in, stride, p_v, stride, x_inc, y_inc ); + int_hv( p_in, stride, p_hv, stride, x_inc, y_inc ); + + p_h += x_inc; + p_v += x_inc; + p_hv += x_inc; + p_in += x_inc; + } } } }