slightly faster mmx dct

author Loren Merritt <pengvado@videolan.org>

Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)

committer Loren Merritt <pengvado@videolan.org>

Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)
author Loren Merritt <pengvado@videolan.org>
Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)
committer Loren Merritt <pengvado@videolan.org>
Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)
diff --git a/Makefile b/Makefile

index def4ec61e62418e1e723573e66d22b348ee0d2e9..323a754702fdd4119e8c81ff5571c636e62712bd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ endif
  
  # MMX/SSE optims
  ifeq ($(ARCH),X86)
-SRCS   += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS   += common/i386/mc-c.c common/i386/predict-c.c
  ASMSRC  = common/i386/dct-a.asm common/i386/cpu-a.asm \
            common/i386/pixel-a.asm common/i386/mc-a.asm \
            common/i386/mc-a2.asm common/i386/predict-a.asm \
@@ -31,7 +31,7 @@ endif
  
  # MMX/SSE optims
  ifeq ($(ARCH),X86_64)
-SRCS   += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS   += common/i386/mc-c.c common/i386/predict-c.c
  ASMSRC  = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
            common/amd64/pixel-a.asm common/amd64/mc-a.asm \
            common/amd64/mc-a2.asm common/amd64/predict-a.asm \
diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm

index db1b2155fa0bba73175494aa20a597ac7d827940..15783d19666b5d0c78be7de964e4b366fff1f724 100644 (file)
--- a/common/amd64/dct-a.asm
+++ b/common/amd64/dct-a.asm
@@ -4,8 +4,9 @@
  ;* Copyright (C) 2003 x264 project
  ;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
  ;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*          Min Chen <chenm001.163.com> (converted to nasm)
+;*          Loren Merritt <lorenm@u.washington.edu> (dct8)
  ;*
  ;* This program is free software; you can redistribute it and/or modify
  ;* it under the terms of the GNU General Public License as published by
@@ -464,3 +465,56 @@ x264_add8x8_idct8_sse2:
      MMX_STORE_DIFF_8P   xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
  
      ret
+
+
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
+;                                     uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+%macro SUB_NxN_DCT 6
+ALIGN 16
+cglobal %1                          ; %1 = generated function: runs %2 on four sub-blocks in turn
+%1:                                 ; NOTE(review): assumes %2 leaves parm1q-parm3q intact, or moved only by what %5/%6 undo -- confirm
+    call %2                         ; sub-block 0, arguments exactly as received
+    add  parm1q, %3                 ; 1st argument (dct) forward by %3 bytes
+    add  parm2q, %4-%5*FENC_STRIDE  ; 2nd argument (pix1): %4 bytes right, %5 rows up
+    add  parm3q, %4-%5*FDEC_STRIDE  ; 3rd argument (pix2): same move with its own row pitch
+    call %2                         ; sub-block 1
+    add  parm1q, %3
+    add  parm2q, %4*FENC_STRIDE-%6  ; %4 rows down, %6 bytes left
+    add  parm3q, %4*FDEC_STRIDE-%6
+    call %2                         ; sub-block 2
+    add  parm1q, %3
+    add  parm2q, %4-%5*FENC_STRIDE  ; right/up again, as before sub-block 1
+    add  parm3q, %4-%5*FDEC_STRIDE
+    jmp  %2                         ; sub-block 3 as a tail call: its ret returns to our caller
+%endmacro
+
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+%macro ADD_NxN_IDCT 6
+ALIGN 16
+cglobal %1                          ; %1 = generated function: runs %2 on four sub-blocks in turn
+%1:                                 ; NOTE(review): assumes %2 leaves parm1q/parm2q intact, or moved only by what %5/%6 undo -- confirm
+    call %2                         ; sub-block 0, arguments exactly as received
+    add  parm1q, %4-%5*FDEC_STRIDE  ; 1st argument (pix): %4 bytes right, %5 rows up
+    add  parm2q, %3                 ; 2nd argument (dct) forward by %3 bytes
+    call %2                         ; sub-block 1
+    add  parm1q, %4*FDEC_STRIDE-%6  ; %4 rows down, %6 bytes left
+    add  parm2q, %3
+    call %2                         ; sub-block 2
+    add  parm1q, %4-%5*FDEC_STRIDE  ; right/up again, as before sub-block 1
+    add  parm2q, %3
+    jmp  %2                         ; sub-block 3 as a tail call: its ret returns to our caller
+%endmacro
+
+SUB_NxN_DCT  x264_sub8x8_dct_mmx,      x264_sub4x4_dct_mmx,     32, 4, 0,  4  ; 2x2 grid of 4x4 blocks, 32 bytes of coefficients each
+ADD_NxN_IDCT x264_add8x8_idct_mmx,     x264_add4x4_idct_mmx,    32, 4, 0,  4
+
+SUB_NxN_DCT  x264_sub16x16_dct_mmx,    x264_sub8x8_dct_mmx,     32, 4, 4, 12  ; built on the 8x8 routines above, which return with the pointers already moved;
+ADD_NxN_IDCT x264_add16x16_idct_mmx,   x264_add8x8_idct_mmx,    32, 4, 4, 12  ; presumably 32/4/4/12 top that up to a 128-byte, 8-pixel step -- confirm
+
+SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0,  8  ; 2x2 grid of 8x8 blocks, 128 bytes of coefficients each
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0,  8
+
diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm

index 5d3abdbb9afd29e2f17435642d0e793f60196c10..0bd0eddf6498cb3f1b83e012f02d207bbff36e1e 100644 (file)
--- a/common/i386/dct-a.asm
+++ b/common/i386/dct-a.asm
@@ -4,9 +4,10 @@
  ;* Copyright (C) 2003 x264 project
  ;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
  ;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*          Min Chen <chenm001.163.com> (converted to nasm)
  ;*          Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
+;*          Loren Merritt <lorenm@u.washington.edu> (misc)
  ;*
  ;* This program is free software; you can redistribute it and/or modify
  ;* it under the terms of the GNU General Public License as published by
@@ -320,12 +321,6 @@ x264_add4x4_idct_mmx:
      MMX_SUMSUB_BA   %1, %2
  %endmacro
  
-cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_pixel_add_8x8_mmx
-cglobal x264_transpose_8x8_mmx
-cglobal x264_ydct8_mmx
-cglobal x264_yidct8_mmx
-
  ALIGN 16
  ;-----------------------------------------------------------------------------
  ;   void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
@@ -505,15 +500,15 @@ x264_yidct8_mmx:
  
      movq        mm2, [eax+disp+0*16]    ; mm2 = d0
      movq        mm0, [eax+disp+4*16]    ; mm0 = d4
-    MMX_SUMSUB_BA   mm0, mm2                ; mm0 = a0, mm2 = a2
+    MMX_SUMSUB_BA   mm0, mm2            ; mm0 = a0, mm2 = a2
  
-    MMX_SUMSUB_BA   mm6, mm0                ; mm6 = f0, mm0 = f6
-    MMX_SUMSUB_BA   mm4, mm2                ; mm4 = f2, mm2 = f4
+    MMX_SUMSUB_BADC mm6, mm0, mm4, mm2  ; mm6 = f0, mm0 = f6
+                                        ; mm4 = f2, mm2 = f4
  
-    MMX_SUMSUB_BA   mm7, mm6                ; mm7 = g0, mm6 = g7
-    MMX_SUMSUB_BA   mm5, mm4                ; mm5 = g1, mm4 = g6
-    MMX_SUMSUB_BA   mm3, mm2                ; mm3 = g2, mm2 = g5
-    MMX_SUMSUB_BA   mm1, mm0                ; mm1 = g3, mm0 = g4
+    MMX_SUMSUB_BADC mm7, mm6, mm5, mm4  ; mm7 = g0, mm6 = g7
+                                        ; mm5 = g1, mm4 = g6
+    MMX_SUMSUB_BADC mm3, mm2, mm1, mm0  ; mm3 = g2, mm2 = g5
+                                        ; mm1 = g3, mm0 = g4
  
      movq        [eax+disp+0*16], mm7
      movq        [eax+disp+1*16], mm5
@@ -607,3 +602,96 @@ x264_transpose_8x8_mmx:
  
      ret
  
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+ALIGN 16
+cglobal x264_sub8x8_dct8_mmx
+x264_sub8x8_dct8_mmx:               ; NOTE(review): assumes the callees do not overwrite their stack arguments -- confirm
+    push dword [esp+12]             ; copy pix2; each push lowers esp by 4, so the
+    push dword [esp+12]             ; same offset picks up pix1 next
+    push dword [esp+12]             ; and dct last, giving a fresh ( dct, pix1, pix2 ) frame
+    call x264_pixel_sub_8x8_mmx     ; called with the copied ( dct, pix1, pix2 )
+    call x264_ydct8_mmx             ; first pass; dct is still the first stacked argument
+    call x264_transpose_8x8_mmx
+    add  esp, 12                    ; drop the three copied arguments
+    jmp  x264_ydct8_mmx             ; second pass as a tail call: [esp+4] is the caller's dct again
+
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+ALIGN 16
+cglobal x264_add8x8_idct8_mmx
+x264_add8x8_idct8_mmx:              ; NOTE(review): assumes the callees do not overwrite their stack arguments -- confirm
+    mov  eax, [esp+8]               ; eax = dct (2nd argument)
+    add  word [eax], 32             ; dct[0][0] += 32, done once before both passes
+    push eax                        ; dct becomes the only argument of the helper calls
+    call x264_yidct8_mmx            ; first pass
+    call x264_transpose_8x8_mmx
+    call x264_yidct8_mmx            ; second pass
+    add  esp, 4                     ; drop the copied dct
+    jmp  x264_pixel_add_8x8_mmx     ; tail call on the caller's own ( dst, dct ) arguments
+
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
+;                                     uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+%macro SUB_NxN_DCT 4
+ALIGN 16
+cglobal %1                          ; %1 = generated function: runs %2 on four sub-blocks, in the order 1, 2, 3, 0
+%1:                                 ; NOTE(review): assumes %2 does not overwrite its stack arguments -- confirm
+    mov  edx, [esp+12]              ; edx = pix2
+    mov  ecx, [esp+ 8]              ; ecx = pix1
+    mov  eax, [esp+ 4]              ; eax = dct
+    add  edx, %4                    ; start at sub-block 1: %4 bytes to the right
+    add  ecx, %4
+    add  eax, %3                    ; and %3 bytes further into dct
+    push edx                        ; build a private ( dct, pix1, pix2 ) frame for %2
+    push ecx
+    push eax
+    call %2                         ; sub-block 1
+    add  dword [esp+0], %3          ; update the private frame in place for sub-block 2
+    add  dword [esp+4], %4*FENC_STRIDE-%4 ; pix1: %4 rows down, back to the left edge
+    add  dword [esp+8], %4*FDEC_STRIDE-%4 ; pix2: same move with its own row pitch
+    call %2                         ; sub-block 2
+    add  dword [esp+0], %3
+    add  dword [esp+4], %4          ; %4 bytes right again
+    add  dword [esp+8], %4
+    call %2                         ; sub-block 3
+    add  esp, 12                    ; discard the private frame
+    jmp  %2                         ; sub-block 0: tail call on the caller's untouched arguments
+%endmacro
+
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+%macro ADD_NxN_IDCT 4
+ALIGN 16
+cglobal %1                          ; %1 = generated function: runs %2 on four sub-blocks, in the order 1, 2, 3, 0
+%1:                                 ; NOTE(review): assumes %2 does not overwrite its stack arguments -- confirm
+    mov  ecx, [esp+8]               ; ecx = dct
+    mov  eax, [esp+4]               ; eax = pix
+    add  ecx, %3                    ; start at sub-block 1: %3 bytes further into dct
+    add  eax, %4                    ; and %4 bytes to the right
+    push ecx                        ; build a private ( pix, dct ) frame for %2
+    push eax
+    call %2                         ; sub-block 1
+    add  dword [esp+0], %4*FDEC_STRIDE-%4 ; pix: %4 rows down, back to the left edge
+    add  dword [esp+4], %3
+    call %2                         ; sub-block 2
+    add  dword [esp+0], %4          ; %4 bytes right again
+    add  dword [esp+4], %3
+    call %2                         ; sub-block 3
+    add  esp, 8                     ; discard the private frame
+    jmp  %2                         ; sub-block 0: tail call on the caller's untouched arguments
+%endmacro
+
+SUB_NxN_DCT  x264_sub8x8_dct_mmx,     x264_sub4x4_dct_mmx,    32, 4   ; 4x4 sub-blocks: 32-byte dct step, 4-byte pixel offsets
+ADD_NxN_IDCT x264_add8x8_idct_mmx,    x264_add4x4_idct_mmx,   32, 4
+
+SUB_NxN_DCT  x264_sub16x16_dct_mmx,   x264_sub8x8_dct_mmx,   128, 8   ; 8x8 sub-blocks: 128-byte dct step, 8-byte pixel offsets
+ADD_NxN_IDCT x264_add16x16_idct_mmx,  x264_add8x8_idct_mmx,  128, 8
+
+SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx,  128, 8   ; same geometry, using the 8x8 transform defined above
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
+
diff --git a/common/i386/dct-c.c b/common/i386/dct-c.c

deleted file mode 100644 (file)

index cae4d81..0000000
--- a/common/i386/dct-c.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*****************************************************************************
- * dct.c: h264 encoder library
- *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
- *****************************************************************************/
-
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#else
-#include <inttypes.h>
-#endif
-#include <stdlib.h>
-#include <stdarg.h>
-
-#include "dct.h"
-#include "common/common.h"
-
-
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub4x4_dct_mmx( dct[0], &pix1[0], &pix2[0] );
-    x264_sub4x4_dct_mmx( dct[1], &pix1[4], &pix2[4] );
-    x264_sub4x4_dct_mmx( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
-    x264_sub4x4_dct_mmx( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
-}
-
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], &pix2[0] );
-    x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], &pix2[8] );
-    x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
-    x264_sub8x8_dct_mmx( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-
-
-/****************************************************************************
- * addXxX_idct:
- ****************************************************************************/
-
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] )
-{
-    x264_add4x4_idct_mmx( p_dst,                   dct[0] );
-    x264_add4x4_idct_mmx( &p_dst[4],               dct[1] );
-    x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+0], dct[2] );
-    x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+4], dct[3] );
-}
-
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] )
-{
-    x264_add8x8_idct_mmx( &p_dst[0],               &dct[0] );
-    x264_add8x8_idct_mmx( &p_dst[8],               &dct[4] );
-    x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE],   &dct[8] );
-    x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
-}
-
-/***********************
- * dct8/idct8 functions
- ***********************/
-
-#ifdef ARCH_X86_64
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub8x8_dct8_sse2( dct[0], pix1,                 pix2 );
-    x264_sub8x8_dct8_sse2( dct[1], pix1+8,               pix2+8 );
-    x264_sub8x8_dct8_sse2( dct[2], pix1+8*FENC_STRIDE,   pix2+8*FDEC_STRIDE );
-    x264_sub8x8_dct8_sse2( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
-}
-
-void x264_add16x16_idct8_sse2( uint8_t *p_dst, int16_t dct[4][8][8] )
-{
-    x264_add8x8_idct8_sse2( p_dst,                 dct[0] );
-    x264_add8x8_idct8_sse2( p_dst+8,               dct[1] );
-    x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE,   dct[2] );
-    x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE+8, dct[3] );
-}
-
-#else // ARCH_X86
-
-void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
-void x264_pixel_add_8x8_mmx( uint8_t *pix, uint16_t *diff );
-void x264_transpose_8x8_mmx( int16_t src[8][8] );
-void x264_ydct8_mmx( int16_t dct[8][8] );
-void x264_yidct8_mmx( int16_t dct[8][8] );
-
-inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, pix2 );
-    x264_ydct8_mmx( dct );
-    x264_transpose_8x8_mmx( dct );
-    x264_ydct8_mmx( dct );
-}
-
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub8x8_dct8_mmx( dct[0], pix1,                 pix2 );
-    x264_sub8x8_dct8_mmx( dct[1], pix1+8,               pix2+8 );
-    x264_sub8x8_dct8_mmx( dct[2], pix1+8*FENC_STRIDE,   pix2+8*FDEC_STRIDE );
-    x264_sub8x8_dct8_mmx( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
-}
-
-inline void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
-{
-    dct[0][0] += 32;
-    x264_yidct8_mmx( dct );
-    x264_transpose_8x8_mmx( dct );
-    x264_yidct8_mmx( dct );
-    x264_pixel_add_8x8_mmx( dst, (uint16_t *)dct ); // including >>6 at the end
-}
-
-void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] )
-{
-    x264_add8x8_idct8_mmx( dst,                 dct[0] );
-    x264_add8x8_idct8_mmx( dst+8,               dct[1] );
-    x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE,   dct[2] );
-    x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE+8, dct[3] );
-}
-#endif
author	Loren Merritt <pengvado@videolan.org>
	Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Sun, 16 Jul 2006 18:28:39 +0000 (18:28 +0000)
Makefile		patch \| blob \| history
common/amd64/dct-a.asm		patch \| blob \| history
common/i386/dct-a.asm		patch \| blob \| history
common/i386/dct-c.c	[deleted file]	patch \| blob \| history