# MMX/SSE optims
ifeq ($(ARCH),X86)
-SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm \
# MMX/SSE optims
ifeq ($(ARCH),X86_64)
-SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;* Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Min Chen <chenm001.163.com> (converted to nasm)
+;* Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
ret
+
+
+;-----------------------------------------------------------------------------
+; SUB_NxN_DCT: defines the global routine %1, which runs the sub-block routine %2 on four sub-blocks.
+; e.g. void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+%macro SUB_NxN_DCT 6 ; %1=symbol to define, %2=routine to call, %3=step added to parm1q, %4=pixel step, %5/%6=corrections subtracted from the pixel-pointer steps
+ALIGN 16
+cglobal %1
+%1:
+ call %2 ; 1st sub-block, using the incoming parm1q/parm2q/parm3q unchanged
+ add parm1q, %3 ; advance the coefficient pointer; NOTE(review): registers are never reloaded, so this relies on how %2 leaves parm1q..parm3q - presumably %5/%6 cancel what %2 added; confirm per instantiation
+ add parm2q, %4-%5*FENC_STRIDE ; pix1 (parm2q): +%4 bytes, minus %5 rows of FENC_STRIDE
+ add parm3q, %4-%5*FDEC_STRIDE ; pix2 (parm3q): same move, using FDEC_STRIDE
+ call %2 ; 2nd sub-block
+ add parm1q, %3
+ add parm2q, %4*FENC_STRIDE-%6 ; pix1: +%4 rows, minus %6 bytes
+ add parm3q, %4*FDEC_STRIDE-%6 ; pix2: likewise
+ call %2 ; 3rd sub-block
+ add parm1q, %3
+ add parm2q, %4-%5*FENC_STRIDE ; same horizontal step as before the 2nd call
+ add parm3q, %4-%5*FDEC_STRIDE
+ jmp %2 ; 4th sub-block as a tail call: the ret inside %2 returns to %1's caller
+%endmacro
+
+;-----------------------------------------------------------------------------
+; ADD_NxN_IDCT: defines %1, which runs %2 on four sub-blocks; e.g. void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+%macro ADD_NxN_IDCT 6 ; %1=symbol to define, %2=routine to call, %3=step added to parm2q, %4=pixel step, %5/%6=corrections subtracted from the pixel-pointer steps
+ALIGN 16
+cglobal %1
+%1:
+ call %2 ; 1st sub-block, using the incoming parm1q/parm2q unchanged
+ add parm1q, %4-%5*FDEC_STRIDE ; pix (parm1q): +%4 bytes, minus %5 rows; NOTE(review): relies on how %2 leaves parm1q/parm2q - confirm per instantiation
+ add parm2q, %3 ; advance the coefficient pointer
+ call %2 ; 2nd sub-block
+ add parm1q, %4*FDEC_STRIDE-%6 ; pix: +%4 rows, minus %6 bytes
+ add parm2q, %3
+ call %2 ; 3rd sub-block
+ add parm1q, %4-%5*FDEC_STRIDE ; same horizontal step as before the 2nd call
+ add parm2q, %3
+ jmp %2 ; 4th sub-block as a tail call: the ret inside %2 returns to %1's caller
+%endmacro
+
+SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4, 0, 4 ; 8x8 built from 4x4 calls: 32 bytes = one 4x4 int16_t block; %5=0 presumably because the 4x4 routine leaves the parm registers alone (not visible here)
+ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4, 0, 4
+
+SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 32, 4, 4, 12 ; callee is the macro-generated 8x8 routine above, which returns with its pointers advanced; %5=4 and %6=12 subtract that back out (assuming the 4x4 routine preserves them)
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 32, 4, 4, 12 ; same compensation for the macro-generated 8x8 idct
+
+SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8 ; 16x16 from 8x8 transforms: 128 bytes = one 8x8 int16_t block
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
+
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;* Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Min Chen <chenm001.163.com> (converted to nasm)
;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
+;* Loren Merritt <lorenm@u.washington.edu> (misc)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
MMX_SUMSUB_BA %1, %2
%endmacro
-cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_pixel_add_8x8_mmx
-cglobal x264_transpose_8x8_mmx
-cglobal x264_ydct8_mmx
-cglobal x264_yidct8_mmx
-
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
movq mm2, [eax+disp+0*16] ; mm2 = d0
movq mm0, [eax+disp+4*16] ; mm0 = d4
- MMX_SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
+ MMX_SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
- MMX_SUMSUB_BA mm6, mm0 ; mm6 = f0, mm0 = f6
- MMX_SUMSUB_BA mm4, mm2 ; mm4 = f2, mm2 = f4
+ MMX_SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
+ ; mm4 = f2, mm2 = f4
- MMX_SUMSUB_BA mm7, mm6 ; mm7 = g0, mm6 = g7
- MMX_SUMSUB_BA mm5, mm4 ; mm5 = g1, mm4 = g6
- MMX_SUMSUB_BA mm3, mm2 ; mm3 = g2, mm2 = g5
- MMX_SUMSUB_BA mm1, mm0 ; mm1 = g3, mm0 = g4
+ MMX_SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
+ ; mm5 = g1, mm4 = g6
+ MMX_SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
+ ; mm1 = g3, mm0 = g4
movq [eax+disp+0*16], mm7
movq [eax+disp+1*16], mm5
ret
+;-----------------------------------------------------------------------------
+; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+ALIGN 16
+cglobal x264_sub8x8_dct8_mmx
+x264_sub8x8_dct8_mmx:
+ push dword [esp+12] ; copy pix2 (third cdecl argument)
+ push dword [esp+12] ; esp dropped by 4, so the same offset now reads pix1
+ push dword [esp+12] ; ...and now dct: the copied frame is (dct, pix1, pix2)
+ call x264_pixel_sub_8x8_mmx ; called with the copied (dct, pix1, pix2)
+ call x264_ydct8_mmx ; same frame is reused; dct is its first slot
+ call x264_transpose_8x8_mmx
+ add esp, 12 ; drop the three copied arguments
+ jmp x264_ydct8_mmx ; second ydct8 pass as a tail call: the caller's own dct is back at [esp+4]
+
+;-----------------------------------------------------------------------------
+; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+ALIGN 16
+cglobal x264_add8x8_idct8_mmx
+x264_add8x8_idct8_mmx:
+ mov eax, [esp+8] ; eax = dct (second cdecl argument)
+ add word [eax], 32 ; dct[0][0] += 32; presumably the rounding bias for a later >>6 - confirm against x264_pixel_add_8x8_mmx
+ push eax ; one-argument frame (dct) shared by the next three calls
+ call x264_yidct8_mmx
+ call x264_transpose_8x8_mmx
+ call x264_yidct8_mmx
+ add esp, 4 ; drop the copied dct pointer
+ jmp x264_pixel_add_8x8_mmx ; tail call: the caller's original (dst, dct) are its arguments
+
+;-----------------------------------------------------------------------------
+; SUB_NxN_DCT: defines the global cdecl routine %1, which runs %2 on four sub-blocks.
+; e.g. void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+%macro SUB_NxN_DCT 4 ; %1=symbol to define, %2=routine to call, %3=byte step between sub-block coefficients, %4=pixel step between sub-blocks
+ALIGN 16
+cglobal %1
+%1:
+ mov edx, [esp+12] ; edx = pix2
+ mov ecx, [esp+ 8] ; ecx = pix1
+ mov eax, [esp+ 4] ; eax = dct
+ add edx, %4 ; start at the sub-block %4 bytes to the right...
+ add ecx, %4
+ add eax, %3 ; ...whose coefficients are the second group
+ push edx ; private frame: [esp+0]=dct, [esp+4]=pix1, [esp+8]=pix2
+ push ecx
+ push eax
+ call %2 ; sub-block 1; NOTE(review): the frame is updated in place below, which assumes %2 does not overwrite its stack arguments
+ add dword [esp+0], %3
+ add dword [esp+4], %4*FENC_STRIDE-%4 ; pix1: down %4 rows, back %4 bytes
+ add dword [esp+8], %4*FDEC_STRIDE-%4 ; pix2: likewise with FDEC_STRIDE
+ call %2 ; sub-block 2
+ add dword [esp+0], %3
+ add dword [esp+4], %4 ; %4 bytes to the right again
+ add dword [esp+8], %4
+ call %2 ; sub-block 3
+ add esp, 12 ; discard the private frame
+ jmp %2 ; sub-block 0 last, as a tail call on the caller's untouched arguments
+%endmacro
+
+;-----------------------------------------------------------------------------
+; ADD_NxN_IDCT: defines %1, which runs %2 on four sub-blocks; e.g. void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+%macro ADD_NxN_IDCT 4 ; %1=symbol to define, %2=routine to call, %3=byte step between sub-block coefficients, %4=pixel step between sub-blocks
+ALIGN 16
+cglobal %1
+%1:
+ mov ecx, [esp+8] ; ecx = dct
+ mov eax, [esp+4] ; eax = pix
+ add ecx, %3 ; start at the second coefficient group...
+ add eax, %4 ; ...and the sub-block %4 bytes to the right
+ push ecx ; private frame: [esp+0]=pix, [esp+4]=dct
+ push eax
+ call %2 ; sub-block 1; NOTE(review): the frame is updated in place below, which assumes %2 does not overwrite its stack arguments
+ add dword [esp+0], %4*FDEC_STRIDE-%4 ; pix: down %4 rows, back %4 bytes
+ add dword [esp+4], %3
+ call %2 ; sub-block 2
+ add dword [esp+0], %4 ; %4 bytes to the right again
+ add dword [esp+4], %3
+ call %2 ; sub-block 3
+ add esp, 8 ; discard the private frame
+ jmp %2 ; sub-block 0 last, as a tail call on the caller's untouched arguments
+%endmacro
+
+SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4 ; 8x8 from 4x4 calls: 32 bytes = one 4x4 int16_t block, sub-blocks 4 pixels apart
+ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4
+
+SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 128, 8 ; 16x16 from 8x8 calls: 128 bytes = four 4x4 int16_t blocks, sub-blocks 8 pixels apart
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 128, 8
+
+SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8 ; 16x16 from 8x8 transforms: 128 bytes = one 8x8 int16_t block
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
+
+++ /dev/null
-/*****************************************************************************
- * dct.c: h264 encoder library
- *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
- *****************************************************************************/
-
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#else
-#include <inttypes.h>
-#endif
-#include <stdlib.h>
-#include <stdarg.h>
-
-#include "dct.h"
-#include "common/common.h"
-
-
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
-{
- x264_sub4x4_dct_mmx( dct[0], &pix1[0], &pix2[0] );
- x264_sub4x4_dct_mmx( dct[1], &pix1[4], &pix2[4] );
- x264_sub4x4_dct_mmx( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
- x264_sub4x4_dct_mmx( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
-}
-
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
-{
- x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], &pix2[0] );
- x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], &pix2[8] );
- x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
- x264_sub8x8_dct_mmx( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-
-
-/****************************************************************************
- * addXxX_idct:
- ****************************************************************************/
-
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] )
-{
- x264_add4x4_idct_mmx( p_dst, dct[0] );
- x264_add4x4_idct_mmx( &p_dst[4], dct[1] );
- x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+0], dct[2] );
- x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+4], dct[3] );
-}
-
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] )
-{
- x264_add8x8_idct_mmx( &p_dst[0], &dct[0] );
- x264_add8x8_idct_mmx( &p_dst[8], &dct[4] );
- x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE], &dct[8] );
- x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
-}
-
-/***********************
- * dct8/idct8 functions
- ***********************/
-
-#ifdef ARCH_X86_64
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
-{
- x264_sub8x8_dct8_sse2( dct[0], pix1, pix2 );
- x264_sub8x8_dct8_sse2( dct[1], pix1+8, pix2+8 );
- x264_sub8x8_dct8_sse2( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
- x264_sub8x8_dct8_sse2( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
-}
-
-void x264_add16x16_idct8_sse2( uint8_t *p_dst, int16_t dct[4][8][8] )
-{
- x264_add8x8_idct8_sse2( p_dst, dct[0] );
- x264_add8x8_idct8_sse2( p_dst+8, dct[1] );
- x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE, dct[2] );
- x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE+8, dct[3] );
-}
-
-#else // ARCH_X86
-
-void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
-void x264_pixel_add_8x8_mmx( uint8_t *pix, uint16_t *diff );
-void x264_transpose_8x8_mmx( int16_t src[8][8] );
-void x264_ydct8_mmx( int16_t dct[8][8] );
-void x264_yidct8_mmx( int16_t dct[8][8] );
-
-inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-{
- x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, pix2 );
- x264_ydct8_mmx( dct );
- x264_transpose_8x8_mmx( dct );
- x264_ydct8_mmx( dct );
-}
-
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
-{
- x264_sub8x8_dct8_mmx( dct[0], pix1, pix2 );
- x264_sub8x8_dct8_mmx( dct[1], pix1+8, pix2+8 );
- x264_sub8x8_dct8_mmx( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
- x264_sub8x8_dct8_mmx( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
-}
-
-inline void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
-{
- dct[0][0] += 32;
- x264_yidct8_mmx( dct );
- x264_transpose_8x8_mmx( dct );
- x264_yidct8_mmx( dct );
- x264_pixel_add_8x8_mmx( dst, (uint16_t *)dct ); // including >>6 at the end
-}
-
-void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] )
-{
- x264_add8x8_idct8_mmx( dst, dct[0] );
- x264_add8x8_idct8_mmx( dst+8, dct[1] );
- x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE, dct[2] );
- x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE+8, dct[3] );
-}
-#endif