CFLAGS+=-DHAVE_MMXEXT -DHAVE_SSE2
SRCS+= common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC= common/i386/dct-a.asm common/i386/cpu-a.asm \
- common/i386/pixel-a.asm common/i386/mc-a.asm
+ common/i386/pixel-a.asm common/i386/mc-a.asm \
+ common/i386/mc-a2.asm common/i386/predict-a.asm
OBJASM= $(ASMSRC:%.asm=%.o)
endif
encoder/encoder.c encoder/eval.c \
common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
-SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm common/i386/mc-a.asm
+SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm \
+ common/i386/mc-a.asm common/i386/mc-a2.asm common/i386/predict-a.asm
# Alias
RM= rm -rf
--- /dev/null
+;*****************************************************************************
+;* mc-a2.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
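+; cglobal: declare a symbol global, prepending the leading underscore
+; that targets built with PREFIX expect on C-visible names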
+%macro cglobal 1
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+ALIGN 16
+mmx_dw_one:
+    times 4 dw 16       ; rounding constant for the >> 5 stage
+mmx_dd_one:
+    times 2 dd 512      ; rounding constant for the >> 10 stage
+mmx_dw_20:
+    times 4 dw 20       ; +20 taps of the 6-tap filter
+mmx_dw_5:
+    times 4 dw -5       ; -5 taps of the 6-tap filter
+
+SECTION .data
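+; scalar scratch for x264_center_filter_mmxext; static storage makes
+; the routine non-reentrant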
+
+width:
+ dd 0
+height:
+ dd 0
+dstp1:
+ dd 0
+dstp2:
+ dd 0
+buffer:
+ dd 0
+dst1:
+ dd 0
+dst2:
+ dd 0
+src:
+ dd 0
+
+
+;=============================================================================
+; Macros
+;=============================================================================
+
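+; LOAD_4 loads four dwords and widens each to 16-bit words against the
+; zero register passed in %9.  The FILT_* macros then build the H.264
+; 6-tap half-pel filter (1, -5, 20, 20, -5, 1) from shifts and adds:
+;   FILT_2 a, b        a -= 5*b           (as a - b - 4*b)
+;   FILT_4 a, c, d     a += 20*(c + d)    (as 4*(c+d) + 16*(c+d))
+;   FILT_6 a, e, f, r  a = (a - 5*e + f + r) >> 5
+; FILT_ALL chains them over six consecutive rows, with ecx = stride,
+; ebx = 3*stride and edx = 5*stride, leaving the unrounded 16-bit sums
+; in mm1 so they can feed the second pass of the center filter.
+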
+%macro LOAD_4 9
+ movd %1, %5
+ movd %2, %6
+ movd %3, %7
+ movd %4, %8
+ punpcklbw %1, %9
+ punpcklbw %2, %9
+ punpcklbw %3, %9
+ punpcklbw %4, %9
+%endmacro
+
+%macro FILT_2 2
+ psubw %1, %2
+ psllw %2, 2
+ psubw %1, %2
+%endmacro
+
+%macro FILT_4 3
+ paddw %2, %3
+ psllw %2, 2
+ paddw %1, %2
+ psllw %2, 2
+ paddw %1, %2
+%endmacro
+
+%macro FILT_6 4
+ psubw %1, %2
+ psllw %2, 2
+ psubw %1, %2
+ paddw %1, %3
+ paddw %1, %4
+ psraw %1, 5
+%endmacro
+
+%macro FILT_ALL 1
+ LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
+ FILT_2 mm1, mm2
+ movd mm5, [%1 + 4 * ecx]
+ movd mm6, [%1 + edx]
+ FILT_4 mm1, mm3, mm4
+ punpcklbw mm5, mm0
+ punpcklbw mm6, mm0
+ psubw mm1, mm5
+ psllw mm5, 2
+ psubw mm1, mm5
+ paddw mm1, mm6
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_vertical_filter_mmxext
+cglobal x264_horizontal_filter_mmxext
+cglobal x264_center_filter_mmxext
+
+;-----------------------------------------------------------------------------
+;
+; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
+; uint8_t *dst2, int i_dst2_stride,
+; uint8_t *src, int i_src_stride,
+; int i_width, int i_height );
+;
+;-----------------------------------------------------------------------------
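+; Writes the vertical half-pel plane to dst1 and the diagonal (center)
+; half-pel plane to dst2 in a single pass.  Each row's unrounded
+; vertical sums v[] are kept as 16-bit words in a stack buffer, then
+; (sketch, with clip() clamping to [0,255]):
+;     dst1[x] = clip( (v[x] + 16) >> 5 )
+;     dst2[x] = clip( (v[x-2] - 5*v[x-1] + 20*v[x] + 20*v[x+1]
+;                       - 5*v[x+2] + v[x+3] + 512) >> 10 )
+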
+
+ALIGN 16
+x264_center_filter_mmxext:
+
+ push edi
+ push esi
+ push ebx
+ push ebp
+
+ mov esi, [esp + 36] ; src
+
+ mov edx, [esp + 20] ; dst1
+ mov [dst1], edx
+
+ mov edi, [esp + 28] ; dst2
+ mov [dst2], edi
+
+ mov eax, [esp + 44] ; width
+ mov [width], eax
+
+ mov eax, [esp + 48] ; height
+ mov [height], eax
+
+ mov eax, [esp + 24] ; dst1_stride
+ mov [dstp1], eax
+
+ mov eax, [esp + 32] ; dst2_stride
+ mov [dstp2], eax
+
+ mov ecx, [esp + 40] ; src_stride
+
+ sub esp, ecx
+ sub esp, ecx ; esp is now at the beginning of the buffer
+ mov [buffer], esp
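+    ; note: from here on esp is used as the write pointer into this
+    ; buffer and is only restored before the epilogue, so nothing
+    ; below may push, pop or call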
+
+    sub esi, ecx
+    sub esi, ecx               ; esi <-- src - 2 * stride
+ mov [src], esi
+
+ mov ebx, ecx
+ shl ebx, 1
+ add ebx, ecx ; 3 * src_stride
+
+ mov edx, ecx
+ shl edx, 1
+ add edx, ebx ; 5 * src_stride
+
+    pxor mm0, mm0              ; 0 ---> mm0
+
+ mov ebp, [height]
+
+loopcy:
+
+ dec ebp
+ mov eax, [width]
+ mov edi, [dst1]
+ mov esp, [buffer]
+ mov esi, [src]
+
+ FILT_ALL esi
+
+    pshufw mm2, mm1, 0         ; broadcast the leftmost filtered word to pad the left border
+ movq [esp], mm2
+ add esp, 8
+ movq [esp], mm1
+ add esp, 8
+ paddw mm1, [mmx_dw_one]
+ psraw mm1, 5
+
+ packuswb mm1, mm1
+ movd [edi], mm1
+
+ sub eax, 8
+ add edi, 4
+ add esi, 4
+
+loopcx1:
+
+ sub eax, 4
+
+ FILT_ALL esi
+
+ movq [esp], mm1
+ paddw mm1, [mmx_dw_one]
+ psraw mm1, 5
+ packuswb mm1, mm1
+ movd [edi], mm1
+
+ add esp, 8
+ add esi, 4
+ add edi, 4
+ test eax, eax
+ jnz loopcx1
+
+ FILT_ALL esi
+
+ pshufw mm2, mm1, 7
+ movq [esp], mm1
+ add esp, 8
+ movq [esp], mm2
+ paddw mm1, [mmx_dw_one]
+ psraw mm1, 5
+ packuswb mm1, mm1
+ movd [edi], mm1
+
+ mov esi, [src]
+ add esi, ecx
+ mov [src], esi
+
+ mov edi, [dst1]
+ add edi, [dstp1]
+ mov [dst1], edi
+
+ mov eax, [width]
+ mov edi, [dst2]
+ mov esp, [buffer]
+ add esp, 4
+
+loopcx2:
+
+ sub eax, 4
+
+ movq mm2, [esp + 2 * eax + 2]
+ movq mm3, [esp + 2 * eax + 4]
+ movq mm4, [esp + 2 * eax + 6]
+ movq mm5, [esp + 2 * eax + 8]
+ movq mm1, [esp + 2 * eax]
+ movq mm6, [esp + 2 * eax + 10]
+ paddw mm2, mm5
+ paddw mm3, mm4
+ paddw mm1, mm6
+
+ movq mm5, [mmx_dw_20]
+ movq mm4, [mmx_dw_5]
+ movq mm6, mm1
+ pxor mm7, mm7
+
+ punpckhwd mm5, mm2
+ punpcklwd mm4, mm3
+ punpcklwd mm2, [mmx_dw_20]
+ punpckhwd mm3, [mmx_dw_5]
+
+ pcmpgtw mm7, mm1
+
+ pmaddwd mm2, mm4
+ pmaddwd mm3, mm5
+
+ punpcklwd mm1, mm7
+ punpckhwd mm6, mm7
+
+ paddd mm2, mm1
+ paddd mm3, mm6
+
+ paddd mm2, [mmx_dd_one]
+ paddd mm3, [mmx_dd_one]
+
+ psrad mm2, 10
+ psrad mm3, 10
+
+ packssdw mm2, mm3
+ packuswb mm2, mm0
+
+ movd [edi + eax], mm2
+
+ test eax, eax
+ jnz loopcx2
+
+ add edi, [dstp2]
+ mov [dst2], edi
+
+ test ebp, ebp
+ jnz loopcy
+
+ mov esp, [buffer]
+ shl ecx, 1
+ add esp, ecx
+
+ pop ebp
+ pop ebx
+ pop esi
+ pop edi
+
+ ret
+
+;-----------------------------------------------------------------------------
+;
+; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride,
+; int i_width, int i_height );
+;
+;-----------------------------------------------------------------------------
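+; Plain horizontal 6-tap; per output pixel (sketch, with clip()
+; clamping to [0,255]):
+;     dst[x] = clip( (src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
+;                      - 5*src[x+2] + src[x+3] + 16) >> 5 )
+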
+
+ALIGN 16
+x264_horizontal_filter_mmxext:
+ push edi
+ push esi
+
+ mov edi, [esp + 12] ; dst
+ mov esi, [esp + 20] ; src
+
+ pxor mm0, mm0
+ movq mm7, [mmx_dw_one]
+
+ mov ecx, [esp + 32] ; height
+
+ sub esi, 2
+
+loophy:
+
+ dec ecx
+ mov eax, [esp + 28] ; width
+
+loophx:
+
+ sub eax, 8
+
+ LOAD_4 mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
+ FILT_2 mm1, mm2
+ movd mm5, [esi + eax + 4]
+ movd mm6, [esi + eax + 5]
+ FILT_4 mm1, mm3, mm4
+ movd mm2, [esi + eax + 4]
+ movd mm3, [esi + eax + 6]
+ punpcklbw mm5, mm0
+ punpcklbw mm6, mm0
+ FILT_6 mm1, mm5, mm6, mm7
+ movd mm4, [esi + eax + 7]
+ movd mm5, [esi + eax + 8]
+ punpcklbw mm2, mm0
+ punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready
+ FILT_2 mm2, mm6
+ movd mm6, [esi + eax + 9]
+ punpcklbw mm4, mm0
+ punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
+ FILT_4 mm2, mm3, mm4
+ punpcklbw mm6, mm0
+ FILT_6 mm2, mm5, mm6, mm7
+
+ packuswb mm1, mm2
+ movq [edi + eax], mm1
+
+ test eax, eax
+ jnz loophx
+
+ add esi, [esp + 24] ; src_pitch
+ add edi, [esp + 16] ; dst_pitch
+
+ test ecx, ecx
+ jnz loophy
+
+ pop esi
+ pop edi
+
+ ret
#include "common/clip1.h"
#include "mc.h"
+#if 0
+
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
static const uint64_t foo __asm__ (#foo) __attribute__((used))
MOTION_COMPENSATION_LUMA
}
+#endif
void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
uint8_t *dst, int i_dst_stride,
pf->get_ref = get_ref_mmx;
}
+#if 0
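+/* superseded by x264_horizontal_filter_mmxext / x264_center_filter_mmxext
+ * in common/i386/mc-a2.asm */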
void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
{
*int_h = mc_hh_w16;
*int_v = mc_hv_w16;
*int_hv = mc_hc_w16;
}
+#endif
--- /dev/null
+;*****************************************************************************
+;* predict-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+%endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata data align=16
+
+SECTION .data
+
+;=============================================================================
+; Macros
+;=============================================================================
+
+%macro SAVE_0_1 1
+ movq [%1] , mm0
+ movq [%1 + 8] , mm1
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal predict_8x8_v_mmx
+cglobal predict_16x16_v_mmx
+
+;-----------------------------------------------------------------------------
+;
+; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
+;
+;-----------------------------------------------------------------------------
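+; Vertical prediction: the 8 pixels in the row just above the block are
+; replicated into each of the block's 8 rows.
+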
+
+ALIGN 16
+predict_8x8_v_mmx:
+
+    mov edx , [esp + 4]
+    mov ecx , [esp + 8]
+    sub edx , ecx              ; edx <-- line -1
+
+ movq mm0 , [edx]
+ movq [edx + ecx] , mm0 ; 0
+ movq [edx + 2 * ecx] , mm0 ; 1
+ movq [edx + 4 * ecx] , mm0 ; 3
+ movq [edx + 8 * ecx] , mm0 ; 7
+    add edx , ecx              ; edx <-- line 0
+ movq [edx + 2 * ecx] , mm0 ; 2
+ movq [edx + 4 * ecx] , mm0 ; 4
+    lea edx , [edx + 4 * ecx]  ; edx <-- line 4
+ movq [edx + ecx] , mm0 ; 5
+ movq [edx + 2 * ecx] , mm0 ; 6
+
+ ret
+
+;-----------------------------------------------------------------------------
+;
+; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
+;
+;-----------------------------------------------------------------------------
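+; Same idea for 16x16: the 16 pixels above the block are copied into
+; all 16 rows, two quadwords per row via SAVE_0_1.
+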
+
+ALIGN 16
+predict_16x16_v_mmx:
+
+    mov edx, [esp + 4]
+    mov ecx, [esp + 8]
+    sub edx, ecx               ; edx <-- line -1
+
+ movq mm0, [edx]
+ movq mm1, [edx + 8]
+ mov eax, ecx
+ shl eax, 1
+ add eax, ecx ; eax <-- 3* stride
+
+ SAVE_0_1 (edx + ecx) ; 0
+ SAVE_0_1 (edx + 2 * ecx) ; 1
+ SAVE_0_1 (edx + eax) ; 2
+ SAVE_0_1 (edx + 4 * ecx) ; 3
+ SAVE_0_1 (edx + 2 * eax) ; 5
+ SAVE_0_1 (edx + 8 * ecx) ; 7
+ SAVE_0_1 (edx + 4 * eax) ; 11
+    add edx, ecx               ; edx <-- line 0
+ SAVE_0_1 (edx + 4 * ecx) ; 4
+ SAVE_0_1 (edx + 2 * eax) ; 6
+ SAVE_0_1 (edx + 8 * ecx) ; 8
+ SAVE_0_1 (edx + 4 * eax) ; 12
+    lea edx, [edx + 8 * ecx]   ; edx <-- line 8
+ SAVE_0_1 (edx + ecx) ; 9
+ SAVE_0_1 (edx + 2 * ecx) ; 10
+    lea edx, [edx + 4 * ecx]   ; edx <-- line 12
+ SAVE_0_1 (edx + ecx) ; 13
+ SAVE_0_1 (edx + 2 * ecx) ; 14
+ SAVE_0_1 (edx + eax) ; 15
+
+ ret
}
}
+
+extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
+
+#if 0
static void predict_16x16_v( uint8_t *src, int i_stride )
{
int i;
src += i_stride;
}
}
+#endif
/****************************************************************************
* 8x8 prediction for intra chroma block DC, H, V, P
src += i_stride;
}
}
+
+extern void predict_8x8_v_mmx( uint8_t *src, int i_stride );
+
+#if 0
static void predict_8x8_v( uint8_t *src, int i_stride )
{
int i;
src += i_stride;
}
}
+#endif
/****************************************************************************
****************************************************************************/
void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
{
- pf[I_PRED_16x16_V ] = predict_16x16_v;
+ pf[I_PRED_16x16_V ] = predict_16x16_v_mmx;
pf[I_PRED_16x16_H ] = predict_16x16_h;
pf[I_PRED_16x16_DC] = predict_16x16_dc;
pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left;
void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
{
- pf[I_PRED_CHROMA_V ] = predict_8x8_v;
+ pf[I_PRED_CHROMA_V ] = predict_8x8_v_mmx;
pf[I_PRED_CHROMA_H ] = predict_8x8_h;
pf[I_PRED_CHROMA_DC] = predict_8x8_dc;
pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
#include "clip1.h"
#include "frame.h"
-#ifdef _MSC_VER
-#undef HAVE_MMXEXT /* not finished now */
-#endif
#ifdef HAVE_MMXEXT
-# include "i386/mc.h"
+#include "i386/mc.h"
#endif
#ifdef ARCH_PPC
-# include "ppc/mc.h"
+#include "ppc/mc.h"
#endif
void get_funcs_mmx(pf_mc_t*, pf_mc_t*, pf_mc_t*);
void get_funcs_sse2(pf_mc_t*, pf_mc_t*, pf_mc_t*);
+extern void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
+ uint8_t *src, int i_src_stride,
+ int i_width, int i_height );
+extern void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
+ uint8_t *dst2, int i_dst2_stride,
+ uint8_t *src, int i_src_stride,
+ int i_width, int i_height );
+
void x264_frame_filter( int cpu, x264_frame_t *frame )
{
const int x_inc = 16, y_inc = 16;
pf_mc_t int_v = mc_hv;
pf_mc_t int_hv = mc_hc;
+#if 0
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT )
get_funcs_mmx(&int_h, &int_v, &int_hv);
if( cpu&X264_CPU_SSE2 )
get_funcs_sse2(&int_h, &int_v, &int_hv);
#endif
+#endif
- for( y = -8; y < frame->i_lines[0]+8; y += y_inc ) {
-
- uint8_t *p_in = frame->plane[0] + y * stride - 8;
- uint8_t *p_h = frame->filtered[1] + y * stride - 8;
- uint8_t *p_v = frame->filtered[2] + y * stride - 8;
- uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
-
- for( x = -8; x < stride - 64 + 8; x += x_inc )
+#ifdef HAVE_MMXEXT
+ if ( cpu & X264_CPU_MMXEXT )
+ {
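+        /* the asm filters cover the same pixels as the C fallback
+         * below: rows -8 .. i_lines[0]+7 plus the 8-pixel left/right
+         * padding, hence width = stride - 48, height = i_lines[0] + 16 */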
+ x264_horizontal_filter_mmxext(frame->filtered[1] - 8 * stride - 8, stride,
+ frame->plane[0] - 8 * stride - 8, stride,
+ stride - 48, frame->i_lines[0] + 16);
+ x264_center_filter_mmxext(frame->filtered[2] - 8 * stride - 8, stride,
+ frame->filtered[3] - 8 * stride - 8, stride,
+ frame->plane[0] - 8 * stride - 8, stride,
+ stride - 48, frame->i_lines[0] + 16);
+ }
+ else
+ {
+#else
+ {
+#endif
+ for( y = -8; y < frame->i_lines[0]+8; y += y_inc )
{
- int_h( p_in, stride, p_h, stride, x_inc, y_inc );
- int_v( p_in, stride, p_v, stride, x_inc, y_inc );
- int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
-
- p_h += x_inc;
- p_v += x_inc;
- p_hv += x_inc;
- p_in += x_inc;
+ uint8_t *p_in = frame->plane[0] + y * stride - 8;
+ uint8_t *p_h = frame->filtered[1] + y * stride - 8;
+ uint8_t *p_v = frame->filtered[2] + y * stride - 8;
+ uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
+ for( x = -8; x < stride - 64 + 8; x += x_inc )
+ {
+ int_h( p_in, stride, p_h, stride, x_inc, y_inc );
+ int_v( p_in, stride, p_v, stride, x_inc, y_inc );
+ int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
+
+ p_h += x_inc;
+ p_v += x_inc;
+ p_hv += x_inc;
+ p_in += x_inc;
+ }
}
}
}