From 413d8fa90917044e0ffaffb7009ccbc8059c61b0 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Tue, 19 Apr 2005 18:35:45 +0000 Subject: [PATCH] amd64 asm patch, part1. git-svn-id: svn://svn.videolan.org/x264/trunk@212 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/cpu-a.asm | 117 ++++ common/amd64/dct-a.asm | 313 ++++++++++ common/amd64/dct-c.c | 299 ++++++++++ common/amd64/dct.h | 38 ++ common/amd64/mc-a.asm | 489 +++++++++++++++ common/amd64/mc-a2.asm | 402 +++++++++++++ common/amd64/mc-c.c | 1161 ++++++++++++++++++++++++++++++++++++ common/amd64/mc.h | 34 ++ common/amd64/pixel-a.asm | 811 +++++++++++++++++++++++++ common/amd64/pixel.h | 51 ++ common/amd64/predict-a.asm | 141 +++++ common/amd64/predict.c | 444 ++++++++++++++ common/amd64/predict.h | 31 + 13 files changed, 4331 insertions(+) create mode 100644 common/amd64/cpu-a.asm create mode 100644 common/amd64/dct-a.asm create mode 100644 common/amd64/dct-c.c create mode 100644 common/amd64/dct.h create mode 100644 common/amd64/mc-a.asm create mode 100644 common/amd64/mc-a2.asm create mode 100644 common/amd64/mc-c.c create mode 100644 common/amd64/mc.h create mode 100644 common/amd64/pixel-a.asm create mode 100644 common/amd64/pixel.h create mode 100644 common/amd64/predict-a.asm create mode 100644 common/amd64/predict.c create mode 100644 common/amd64/predict.h diff --git a/common/amd64/cpu-a.asm b/common/amd64/cpu-a.asm new file mode 100644 index 00000000..729ece64 --- /dev/null +++ b/common/amd64/cpu-a.asm @@ -0,0 +1,117 @@ +;***************************************************************************** +;* cpu.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Laurent Aimar +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_cpu_cpuid_test +cglobal x264_cpu_cpuid +cglobal x264_emms + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported +;----------------------------------------------------------------------------- +x264_cpu_cpuid_test: + pushfd + push ebx + push ebp + push esi + push edi + + pushfd + pop eax + mov ebx, eax + xor eax, 0x200000 + push eax + popfd + pushfd + pop eax + xor eax, ebx + + pop edi + pop esi + pop ebp + pop ebx + popfd + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +;----------------------------------------------------------------------------- +x264_cpu_cpuid: + + push ebp + mov ebp, esp + push ebx + push esi + push edi + + mov eax, [ebp + 8] + cpuid + + mov esi, [ebp + 12] + mov [esi], eax + + mov esi, [ebp + 16] + mov [esi], ebx + + mov esi, [ebp + 20] + mov [esi], ecx + + mov esi, [ebp + 24] + mov [esi], edx + + pop edi + pop esi + pop ebx + pop ebp + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_emms( void ) +;----------------------------------------------------------------------------- +x264_emms: + emms + ret + diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm new file mode 100644 index 00000000..92dbc5ae --- /dev/null +++ b/common/amd64/dct-a.asm @@ -0,0 +1,313 @@ +;***************************************************************************** +;* dct.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Min Chen (converted to nasm) +;* Laurent Aimar (initial version) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2004.04.28 portab all 4x4 function to nasm (CM) * +;* * +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +%macro MMX_ZERO 1 + pxor %1, %1 +%endmacro + +%macro MMX_LOAD_DIFF_4P 5 + movd %1, %4 + punpcklbw %1, %3 + movd %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro MMX_SUMSUB_BA 2 + paddw %1, %2 + paddw %2, %2 + psubw %2, %1 +%endmacro + +%macro MMX_SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro MMX_SUMSUB2_AB 3 + movq %3, %1 + paddw %1, %1 + paddw %1, %2 + psubw %3, %2 + psubw %3, %2 +%endmacro + +%macro MMX_SUMSUBD2_AB 4 + movq %4, %1 + movq %3, %2 + psraw %2, $1 + psraw %4, $1 + paddw %1, %2 + psubw %4, %3 +%endmacro + +%macro SBUTTERFLYwd 3 + movq %3, %1 + punpcklwd %1, %2 + punpckhwd %3, %2 +%endmacro + +%macro SBUTTERFLYdq 3 + movq %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +;----------------------------------------------------------------------------- +; input ABCD output ADTC +;----------------------------------------------------------------------------- +%macro MMX_TRANSPOSE 5 + SBUTTERFLYwd %1, %2, %5 + SBUTTERFLYwd %3, %4, %2 + SBUTTERFLYdq %1, %3, %4 + SBUTTERFLYdq %5, %2, %3 +%endmacro + +%macro MMX_STORE_DIFF_4P 5 + paddw %1, %3 + psraw %1, $6 + movd %2, %5 + punpcklbw %2, %4 + paddsw %1, %2 + packuswb %1, %1 + movd %5, %1 +%endmacro + +;%macro +;%endmacro + +;============================================================================= +; Local Data (Read Only) +;============================================================================= + +%ifdef FORMAT_COFF +SECTION .rodata data +%else +SECTION .rodata data align=16 +%endif + +;----------------------------------------------------------------------------- +; Various memory constants (trigonometric values or rounding values) +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_mmx_1: + dw 1, 1, 1, 1 + +x264_mmx_32: + dw 32, 32, 32, 32 + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_dct4x4dc_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl dct4x4dc( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +x264_dct4x4dc_mmxext: + mov eax, [esp+ 4] + movq mm0, [eax+ 0] + movq mm1, [eax+ 8] + movq mm2, [eax+16] + movq mm3, [eax+24] + + MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + MMX_TRANSPOSE mm0, mm2, mm3, mm4, 
mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 + + movq mm6, [x264_mmx_1] + paddw mm0, mm6 + paddw mm4, mm6 + psraw mm0, 1 + movq [eax+ 0], mm0 + psraw mm4, 1 + movq [eax+ 8], mm4 + paddw mm1, mm6 + paddw mm3, mm6 + psraw mm1, 1 + movq [eax+16], mm1 + psraw mm3, 1 + movq [eax+24], mm3 + ret + +cglobal x264_idct4x4dc_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +x264_idct4x4dc_mmxext: + mov eax, [esp+ 4] + movq mm0, [eax+ 0] + movq mm1, [eax+ 8] + movq mm2, [eax+16] + movq mm3, [eax+24] + + MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 + + movq [eax+ 0], mm0 + movq [eax+ 8], mm4 + movq [eax+16], mm1 + movq [eax+24], mm3 + ret + +cglobal x264_sub4x4_dct_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +;----------------------------------------------------------------------------- +x264_sub4x4_dct_mmxext: + push ebx + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; i_pix1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; i_pix2 + + MMX_ZERO mm7 + + ; Load 4 lines + MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax ], [ecx] + MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx ], [ecx+edx] + MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2] + add eax, ebx + add ecx, edx + MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2] + + MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 + + MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 + MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 + + ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 + MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1 + + MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 + + MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 + MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 + + ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 + MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4 + + mov eax, [esp+ 8] ; dct + movq [eax+ 0], mm1 + movq [eax+ 8], mm0 + movq [eax+16], mm4 + movq [eax+24], mm3 + + pop ebx + ret + +cglobal x264_add4x4_idct_mmxext + +ALIGN 16 +;----------------------------------------------------------------------------- +; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +;----------------------------------------------------------------------------- +x264_add4x4_idct_mmxext: + + ; Load dct coeffs + mov eax, [esp+12] ; dct + movq mm0, [eax+ 0] + movq mm4, [eax+ 8] + movq mm3, [eax+16] + movq mm1, [eax+24] + + mov eax, [esp+ 4] ; p_dst + mov ecx, [esp+ 8] ; i_dst + lea edx, [ecx+ecx*2] + + ; out:mm0, mm1, mm2, mm3 + MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2 + + MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 + MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 
mm4=d02+d13 mm0=d02-d13 + + ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 + MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3 + + MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 + MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 + + MMX_ZERO mm7 + movq mm6, [x264_mmx_32] + + MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax] + MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx] + MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2] + MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx] + + ret + diff --git a/common/amd64/dct-c.c b/common/amd64/dct-c.c new file mode 100644 index 00000000..45279035 --- /dev/null +++ b/common/amd64/dct-c.c @@ -0,0 +1,299 @@ +/***************************************************************************** + * dct.c: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include + +#include "x264.h" + +#include "common/dct.h" +#include "dct.h" + + +#if 0 +#define MMX_ZERO( MMZ ) \ + asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: ) + +/* MMP : diff, MMT: temp */ +#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \ + asm volatile( "movd (%0), " #MMP "\n" \ + "punpcklbw " #MMZ ", " #MMP "\n" \ + "movd (%1), " #MMT "\n" \ + "punpcklbw " #MMZ ", " #MMT "\n" \ + "psubw " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) ) + +/* in: out: mma=mma+mmb, mmb=mmb-mma */ +#define MMX_SUMSUB_BA( MMA, MMB ) \ + asm volatile( "paddw " #MMB ", " #MMA "\n"\ + "paddw " #MMB ", " #MMB "\n"\ + "psubw " #MMA ", " #MMB "\n" :: ) + +#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \ + asm volatile( "paddw " #MMB ", " #MMA "\n"\ + "paddw " #MMD ", " #MMC "\n"\ + "paddw " #MMB ", " #MMB "\n"\ + "paddw " #MMD ", " #MMD "\n"\ + "psubw " #MMA ", " #MMB "\n"\ + "psubw " #MMC ", " #MMD "\n" :: ) + +/* inputs MMA, MMB output MMA MMT */ +#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \ + asm volatile( "movq " #MMA ", " #MMT "\n" \ + "paddw " #MMA ", " #MMA "\n" \ + "paddw " #MMB ", " #MMA "\n" \ + "psubw " #MMB ", " #MMT "\n" \ + "psubw " #MMB ", " #MMT "\n" :: ) + +/* inputs MMA, MMB output MMA MMS */ +#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \ + asm volatile( "movq " #MMA ", " #MMS "\n" \ + "movq " #MMB ", " #MMT "\n" \ + "psraw $1 , " #MMB "\n" \ + "psraw $1 , " #MMS "\n" \ + "paddw " #MMB ", " #MMA "\n" \ + "psubw " #MMT ", " #MMS "\n" :: ) + +#define SBUTTERFLYwd(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpcklwd " #b ", " #a " \n\t" \ + "punpckhwd " #b ", " #t " \n\t" :: ) + +#define SBUTTERFLYdq(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpckldq " #b ", " #a " \n\t" \ + "punpckhdq " #b ", " #t " \n\t" :: ) + +/* input ABCD output ADTC */ +#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \ + SBUTTERFLYwd( MMA, MMB, MMT ); \ + SBUTTERFLYwd( MMC, MMD, MMB ); \ + SBUTTERFLYdq( MMA, MMC, MMD ); \ + SBUTTERFLYdq( MMT, MMB, MMC ) + +#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \ + asm volatile( "paddw " #MM32 "," #MMP "\n" \ + "psraw $6, " #MMP "\n" \ + "movd (%0), " #MMT "\n" \ + "punpcklbw " #MMZ ", " #MMT "\n" \ + "paddsw " #MMT ", " #MMP "\n" \ + "packuswb " #MMZ ", " #MMP "\n" \ + "movd " #MMP ", (%0)\n" :: "r"(dst) ) + +#define UNUSED_LONGLONG( foo ) \ + static const unsigned long long foo __asm__ (#foo) __attribute__((unused)) __attribute__((aligned(16))) + +UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL; +UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL; + + +/* + * XXX For all dct dc : input could be equal to output so ... 
+ */ +void x264_dct4x4dc_mmxext( int16_t d[4][4] ) +{ + /* load DCT */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(d) ); + + MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */ + MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */ + + /* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */ + MMX_TRANSPOSE ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ); + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */ + MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */ + + /* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */ + MMX_TRANSPOSE ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 ); + + + asm volatile( "movq x264_mmx_1, %%mm6" :: ); + + /* Store back */ + asm volatile( + "paddw %%mm6, %%mm0\n" + "paddw %%mm6, %%mm4\n" + + "psraw $1, %%mm0\n" + "movq %%mm0, (%0)\n" + "psraw $1, %%mm4\n" + "movq %%mm4, 8(%0)\n" + + "paddw %%mm6, %%mm1\n" + "paddw %%mm6, %%mm3\n" + + "psraw $1, %%mm1\n" + "movq %%mm1, 16(%0)\n" + "psraw $1, %%mm3\n" + "movq %%mm3, 24(%0)\n" :: "r"(d) ); +} + +void x264_idct4x4dc_mmxext( int16_t d[4][4] ) +{ + /* load DCT */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(d) ); + + MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */ + MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */ + + /* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */ + MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ); + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */ + MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */ + + /* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */ + MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 ); + + /* Store back */ + asm volatile( + "movq %%mm0, (%0)\n" + "movq %%mm4, 8(%0)\n" + "movq %%mm1, 16(%0)\n" + "movq %%mm3, 24(%0)\n" :: "r"(d) ); +} + +/**************************************************************************** + * subXxX_dct: + ****************************************************************************/ +inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + /* Reset mm7 */ + MMX_ZERO( %%mm7 ); + + /* Load 4 lines */ + MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] ); + MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] ); + + MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 ); /* mm3=s03 mm0=d03 mm2=s12 mm1=d12 */ + + MMX_SUMSUB_BA( %%mm2, %%mm3 ); /* mm2=s03+s12 mm3=s03-s12 */ + MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 ); /* mm0=2.d03+d12 mm4=d03-2.d12 */ + + /* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */ + MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 ); + + MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 ); /* mm3=s03 mm2=d03 mm1=s12 mm4=d12 */ + + MMX_SUMSUB_BA( %%mm1, %%mm3 ); /* mm1=s03+s12 mm3=s03-s12 */ + MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 ); /* mm2=2.d03+d12 mm0=d03-2.d12 */ + + /* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */ + MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 ); + + /* Store back */ + asm volatile( + 
"movq %%mm1, (%0)\n" + "movq %%mm0, 8(%0)\n" + "movq %%mm4, 16(%0)\n" + "movq %%mm3, 24(%0)\n" :: "r"(dct) ); +} +#endif + +void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + x264_sub4x4_dct_mmxext( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 ); + x264_sub4x4_dct_mmxext( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 ); +} + +void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + x264_sub8x8_dct_mmxext( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ); + x264_sub8x8_dct_mmxext( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ); +} + + + +/**************************************************************************** + * addXxX_idct: + ****************************************************************************/ +#if 0 +inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +{ + /* Load dct coeffs */ + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" :: "r"(dct) ); + + MMX_SUMSUB_BA ( %%mm2, %%mm0 ); /* mm2=s02 mm0=d02 */ + MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 ); /* mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */ + + MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 ); /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ + + /* in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 */ + MMX_TRANSPOSE ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 ); + + MMX_SUMSUB_BA ( %%mm3, %%mm1 ); /* mm3=s02 mm1=d02 */ + MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 ); /* mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */ + + MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 ); /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ + + /* in: mm2, mm4, mm1, mm3 out: mm2, mm3, mm0, mm1 */ + MMX_TRANSPOSE ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 ); + + MMX_ZERO( %%mm7 ); + asm volatile( "movq x264_mmx_32, %%mm6\n" :: ); + + MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] ); + MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] ); + MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] ); + MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] ); +} +#endif + +void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ) +{ + x264_add4x4_idct_mmxext( p_dst, i_dst, dct[0] ); + x264_add4x4_idct_mmxext( &p_dst[4], i_dst, dct[1] ); + x264_add4x4_idct_mmxext( &p_dst[4*i_dst+0], i_dst, dct[2] ); + x264_add4x4_idct_mmxext( &p_dst[4*i_dst+4], i_dst, dct[3] ); +} + +void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ) +{ + x264_add8x8_idct_mmxext( &p_dst[0], i_dst, &dct[0] ); + x264_add8x8_idct_mmxext( &p_dst[8], i_dst, &dct[4] ); + x264_add8x8_idct_mmxext( &p_dst[8*i_dst], i_dst, &dct[8] ); + x264_add8x8_idct_mmxext( &p_dst[8*i_dst+8], i_dst, &dct[12] ); +} diff --git a/common/amd64/dct.h b/common/amd64/dct.h new file mode 100644 index 00000000..23601e5e --- /dev/null +++ b/common/amd64/dct.h @@ -0,0 +1,38 @@ +/***************************************************************************** + * dct.h: h264 encoder library + ***************************************************************************** + * 
Copyright (C) 2003 Laurent Aimar + * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef _I386_DCT_H +#define _I386_DCT_H 1 + +void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); + +void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ); +void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ); +void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ); + +void x264_dct4x4dc_mmxext( int16_t d[4][4] ); +void x264_idct4x4dc_mmxext( int16_t d[4][4] ); + +#endif diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm new file mode 100644 index 00000000..ebc68d85 --- /dev/null +++ b/common/amd64/mc-a.asm @@ -0,0 +1,489 @@ +;***************************************************************************** +;* mc.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $ +;* +;* Authors: Min Chen (converted to nasm) +;* Laurent Aimar (init algorithm) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +;***************************************************************************** +;* * +;* Revision history: * +;* * +;* 2004.05.17 portab mc_copy_w4/8/16 (CM) * +;* * +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Local Data (Read Only) +;============================================================================= + +%ifdef FORMAT_COFF +SECTION .rodata data +%else +SECTION .rodata data align=16 +%endif + +;----------------------------------------------------------------------------- +; Various memory constants (trigonometric values or rounding values) +;----------------------------------------------------------------------------- + +ALIGN 16 + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_pixel_avg_w4_mmxext +cglobal x264_pixel_avg_w8_mmxext +cglobal x264_pixel_avg_w16_mmxext +cglobal x264_pixel_avg_w16_sse2 + +cglobal x264_mc_copy_w4_mmxext +cglobal x264_mc_copy_w8_mmxext +cglobal x264_mc_copy_w16_mmxext +cglobal x264_mc_copy_w16_sse2 + +cglobal x264_mc_chroma_sse + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w4_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movd mm0, [ebx] + pavgb mm0, [ecx] + movd mm1, [ebx+eax] + pavgb mm1, [ecx+edx] + movd [edi], mm0 + movd [edi+esi], mm1 + dec ebp + dec ebp + lea ebx, [ebx+eax*2] + lea ecx, [ecx+edx*2] + lea edi, [edi+esi*2] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w8_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movq mm0, [ebx] + pavgb mm0, [ecx] + movq [edi], mm0 + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void 
x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w16_mmxext: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movq mm0, [ebx ] + movq mm1, [ebx+8] + pavgb mm0, [ecx ] + pavgb mm1, [ecx+8] + movq [edi ], mm0 + movq [edi+8], mm1 + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w16_sse2: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop + movdqu xmm0, [ebx] + pavgb xmm0, [ecx] + movdqu [edi], xmm0 + + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w4_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height +ALIGN 4 +.height_loop + mov eax, [esi] + mov [edi], eax + mov eax, [esi+ebx] + mov [edi+edx], eax + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + dec ecx + dec ecx + jne .height_loop + + pop edi + pop esi + pop ebx + ret + +cglobal mc_copy_w8 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w8_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height +ALIGN 4 +.height_loop + movq mm0, [esi] + movq [edi], mm0 + movq mm1, [esi+ebx] + movq [edi+edx], mm1 + movq mm2, [esi+ebx*2] + movq [edi+edx*2], mm2 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + movq mm3, [esi+ebx] + movq [edi+edx], mm3 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + + sub ecx, byte 4 + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + +cglobal mc_copy_w16 + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) 
+;----------------------------------------------------------------------------- +x264_mc_copy_w16_mmxext: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height + +ALIGN 4 +.height_loop + movq mm0, [esi] + movq mm1, [esi+8] + movq [edi], mm0 + movq [edi+8], mm1 + movq mm2, [esi+ebx] + movq mm3, [esi+ebx+8] + movq [edi+edx], mm2 + movq [edi+edx+8], mm3 + movq mm4, [esi+ebx*2] + movq mm5, [esi+ebx*2+8] + movq [edi+edx*2], mm4 + movq [edi+edx*2+8], mm5 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + movq mm6, [esi+ebx] + movq mm7, [esi+ebx+8] + movq [edi+edx], mm6 + movq [edi+edx+8], mm7 + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + sub ecx, byte 4 + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w16_sse2: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height + +ALIGN 4 +.height_loop + movdqu xmm0, [esi] + movdqu xmm1, [esi+ebx] + movdqu [edi], xmm0 + movdqu [edi+edx], xmm1 + dec ecx + dec ecx + lea esi, [esi+ebx*2] + lea edi, [edi+edx*2] + jnz .height_loop + + pop edi + pop esi + pop ebx + ret + + +SECTION .rodata + +ALIGN 16 +eights times 4 dw 8 +thirty2s times 4 dw 32 + +SECTION .text + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_chroma_sse( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, +; int dx, int dy, +; int i_height, int i_width ) +;----------------------------------------------------------------------------- + +x264_mc_chroma_sse: + + pxor mm3, mm3 + + pshufw mm5, [esp+20], 0 ; mm5 - dx + pshufw mm6, [esp+24], 0 ; mm6 - dy + + movq mm4, [eights] + movq mm0, mm4 + + psubw mm4, mm5 ; mm4 - 8-dx + psubw mm0, mm6 ; mm0 - 8-dy + + movq mm7, mm5 + pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB + pmullw mm7, mm6 ; mm7 = dx*dy = cD + pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC + pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA + + push edi + + mov eax, [esp+4+4] ; src + mov edi, [esp+4+12] ; dst + mov ecx, [esp+4+8] ; i_src_stride + mov edx, [esp+4+28] ; i_height + +ALIGN 4 +.height_loop + + movd mm1, [eax+ecx] + movd mm0, [eax] + punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 + punpcklbw mm0, mm3 + pmullw mm1, mm6 ; 2nd line * cC + pmullw mm0, mm4 ; 1st line * cA + + paddw mm0, mm1 ; mm0 <- result + + movd mm2, [eax+1] + movd mm1, [eax+ecx+1] + punpcklbw mm2, mm3 + punpcklbw mm1, mm3 + + paddw mm0, [thirty2s] + + pmullw mm2, mm5 ; line * cB + pmullw mm1, mm7 ; line * cD + paddw mm0, mm2 + paddw mm0, mm1 + + psrlw mm0, 6 + packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4 + movd [edi], mm0 + + add eax, ecx + add edi, [esp+4+16] + + dec edx + jnz .height_loop + + mov eax, [esp+4+32] + sub eax, 8 + jnz .finish ; width != 8 so assume 4 + + mov [esp+4+32], eax + mov edi, [esp+4+12] ; dst + mov eax, [esp+4+4] ; src + mov edx, [esp+4+28] ; i_height + add edi, 4 + add eax, 4 + jmp .height_loop + +.finish + pop edi + ret diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm new file mode 100644 index 00000000..aaab2c1b --- /dev/null +++ 
b/common/amd64/mc-a2.asm @@ -0,0 +1,402 @@ +;***************************************************************************** +;* mc-a2.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Read only data +;============================================================================= + +SECTION .rodata data align=16 + +ALIGN 16 +mmx_dw_one: + times 4 dw 16 +mmx_dd_one: + times 2 dd 512 +mmx_dw_20: + times 4 dw 20 +mmx_dw_5: + times 4 dw -5 + +SECTION .data + +width: + dd 0 +height: + dd 0 +dstp1: + dd 0 +dstp2: + dd 0 +buffer: + dd 0 +dst1: + dd 0 +dst2: + dd 0 +src: + dd 0 + + +;============================================================================= +; Macros +;============================================================================= + +%macro LOAD_4 9 + movd %1, %5 + movd %2, %6 + movd %3, %7 + movd %4, %8 + punpcklbw %1, %9 + punpcklbw %2, %9 + punpcklbw %3, %9 + punpcklbw %4, %9 +%endmacro + +%macro FILT_2 2 + psubw %1, %2 + psllw %2, 2 + psubw %1, %2 +%endmacro + +%macro FILT_4 3 + paddw %2, %3 + psllw %2, 2 + paddw %1, %2 + psllw %2, 2 + paddw %1, %2 +%endmacro + +%macro FILT_6 4 + psubw %1, %2 + psllw %2, 2 + psubw %1, %2 + paddw %1, %3 + paddw %1, %4 + psraw %1, 5 +%endmacro + +%macro FILT_ALL 1 + LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0 + FILT_2 mm1, mm2 + movd mm5, [%1 + 4 * ecx] + movd mm6, [%1 + edx] + FILT_4 mm1, mm3, mm4 + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + psubw mm1, mm5 + psllw mm5, 2 + psubw mm1, mm5 + paddw mm1, mm6 +%endmacro + + + + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_vertical_filter_mmxext +cglobal x264_horizontal_filter_mmxext +cglobal x264_center_filter_mmxext + +;----------------------------------------------------------------------------- +; +; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride, +; uint8_t *dst2, int i_dst2_stride, +; uint8_t *src, int i_src_stride, +; int i_width, int i_height ); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_center_filter_mmxext : + + push edi + push esi + push ebx + push ebp + 
+ mov esi, [esp + 36] ; src + + mov edx, [esp + 20] ; dst1 + mov [dst1], edx + + mov edi, [esp + 28] ; dst2 + mov [dst2], edi + + mov eax, [esp + 44] ; width + mov [width], eax + + mov eax, [esp + 48] ; height + mov [height], eax + + mov eax, [esp + 24] ; dst1_stride + mov [dstp1], eax + + mov eax, [esp + 32] ; dst2_stride + mov [dstp2], eax + + mov ecx, [esp + 40] ; src_stride + + sub esp, ecx + sub esp, ecx ; esp is now at the beginning of the buffer + mov [buffer], esp + + ;sub esi, 2 + sub esi, ecx + sub esi, ecx ; esi - 2 - 2 * stride + mov [src], esi + + ;sub edi, 2 + + mov ebx, ecx + shl ebx, 1 + add ebx, ecx ; 3 * src_stride + + mov edx, ecx + shl edx, 1 + add edx, ebx ; 5 * src_stride + + pxor mm0, mm0 ; 0 ---> mm0 + movq mm7, [mmx_dd_one] ; for rounding + + mov ebp, [height] + +loopcy: + + dec ebp + mov eax, [width] + mov edi, [dst1] + mov esp, [buffer] + mov esi, [src] + + FILT_ALL esi + + pshufw mm2, mm1, 0 + movq [esp], mm2 + add esp, 8 + movq [esp], mm1 + add esp, 8 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + + packuswb mm1, mm1 + movd [edi], mm1 + + sub eax, 8 + add edi, 4 + add esi, 4 + +loopcx1: + + sub eax, 4 + + FILT_ALL esi + + movq [esp], mm1 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + packuswb mm1, mm1 + movd [edi], mm1 + + add esp, 8 + add esi, 4 + add edi, 4 + test eax, eax + jnz loopcx1 + + FILT_ALL esi + + pshufw mm2, mm1, 7 + movq [esp], mm1 + add esp, 8 + movq [esp], mm2 + paddw mm1, [mmx_dw_one] + psraw mm1, 5 + packuswb mm1, mm1 + movd [edi], mm1 + + mov esi, [src] + add esi, ecx + mov [src], esi + + mov edi, [dst1] + add edi, [dstp1] + mov [dst1], edi + + mov eax, [width] + mov edi, [dst2] + mov esp, [buffer] + add esp, 4 + +loopcx2: + + sub eax, 4 + + movq mm2, [esp + 2 * eax + 2] + movq mm3, [esp + 2 * eax + 4] + movq mm4, [esp + 2 * eax + 6] + movq mm5, [esp + 2 * eax + 8] + movq mm1, [esp + 2 * eax] + movq mm6, [esp + 2 * eax + 10] + paddw mm2, mm5 + paddw mm3, mm4 + paddw mm1, mm6 + + movq mm5, [mmx_dw_20] + movq mm4, [mmx_dw_5] + movq mm6, mm1 + pxor mm7, mm7 + + punpckhwd mm5, mm2 + punpcklwd mm4, mm3 + punpcklwd mm2, [mmx_dw_20] + punpckhwd mm3, [mmx_dw_5] + + pcmpgtw mm7, mm1 + + pmaddwd mm2, mm4 + pmaddwd mm3, mm5 + + punpcklwd mm1, mm7 + punpckhwd mm6, mm7 + + paddd mm2, mm1 + paddd mm3, mm6 + + paddd mm2, [mmx_dd_one] + paddd mm3, [mmx_dd_one] + + psrad mm2, 10 + psrad mm3, 10 + + packssdw mm2, mm3 + packuswb mm2, mm0 + + movd [edi + eax], mm2 + + test eax, eax + jnz loopcx2 + + add edi, [dstp2] + mov [dst2], edi + + test ebp, ebp + jnz loopcy + + mov esp, [buffer] + shl ecx, 1 + add esp, ecx + + pop ebp + pop ebx + pop esi + pop edi + + ret + +;----------------------------------------------------------------------------- +; +; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, +; int i_width, int i_height ); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +x264_horizontal_filter_mmxext : + push edi + push esi + + mov edi, [esp + 12] ; dst + mov esi, [esp + 20] ; src + + pxor mm0, mm0 + movq mm7, [mmx_dw_one] + + mov ecx, [esp + 32] ; height + + sub esi, 2 + +loophy: + + dec ecx + mov eax, [esp + 28] ; width + +loophx: + + sub eax, 8 + + LOAD_4 mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0 + FILT_2 mm1, mm2 + movd mm5, [esi + eax + 4] + movd mm6, [esi + eax + 5] + FILT_4 mm1, mm3, mm4 + movd mm2, [esi + eax + 4] + movd mm3, [esi + eax + 6] + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + FILT_6 mm1, mm5, mm6, mm7 + 
movd mm4, [esi + eax + 7] + movd mm5, [esi + eax + 8] + punpcklbw mm2, mm0 + punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready + FILT_2 mm2, mm6 + movd mm6, [esi + eax + 9] + punpcklbw mm4, mm0 + punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready + FILT_4 mm2, mm3, mm4 + punpcklbw mm6, mm0 + FILT_6 mm2, mm5, mm6, mm7 + + packuswb mm1, mm2 + movq [edi + eax], mm1 + + test eax, eax + jnz loophx + + add esi, [esp + 24] ; src_pitch + add edi, [esp + 16] ; dst_pitch + + test ecx, ecx + jnz loophy + + pop esi + pop edi + + ret diff --git a/common/amd64/mc-c.c b/common/amd64/mc-c.c new file mode 100644 index 00000000..b5b3c3cf --- /dev/null +++ b/common/amd64/mc-c.c @@ -0,0 +1,1161 @@ +/***************************************************************************** + * mc.c: h264 encoder library (Motion Compensation) + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include +#include +#include + +#include "x264.h" /* DECLARE_ALIGNED */ +#include "common/mc.h" +#include "common/clip1.h" +#include "mc.h" + +#if 0 + +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) +#define USED_UINT64(foo) \ + static const uint64_t foo __asm__ (#foo) __attribute__((used)) +#else +#define USED_UINT64(foo) \ + static const uint64_t foo __asm__ (#foo) __attribute__((unused)) +#endif + +USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL; + + +#define MMX_ZERO( MMZ ) \ + asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: ) + +#define MMX_INIT( MMV, NAME ) \ + asm volatile( "movq " #NAME ", " #MMV "\n" :: ) + +#define MMX_SAVE_4P( MMP, MMZ, dst ) \ + asm volatile( "packuswb " #MMZ "," #MMP "\n" \ + "movd " #MMP ", (%0)" :: "r"(dst) ) + +#define MMX_LOAD_4P( MMP, MMZ, pix ) \ + asm volatile( "movd (%0), " #MMP "\n" \ + "punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) ) + +#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\ + MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \ + MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \ + MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] ) + +#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\ + MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ) + +#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \ + asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \ + "movq " #MMP1 ", (%0)\n" :: "r"(dst) ) + + +#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \ + asm volatile( "movq (%0) , " #MMP1 "\n" \ + "movq " #MMP1 ", " #MMP2 "\n" \ + "punpcklbw " #MMZ ", " #MMP1 "\n" \ + "punpckhbw " #MMZ 
", " #MMP2 "\n" : : "r"(pix) ) + +#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\ + MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \ + MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] ) + +#define SBUTTERFLYwd(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpcklwd " #b ", " #a " \n\t" \ + "punpckhwd " #b ", " #t " \n\t" :: ) + +#define SBUTTERFLYdq(a,b,t )\ + asm volatile( "movq " #a ", " #t " \n\t" \ + "punpckldq " #b ", " #a " \n\t" \ + "punpckhdq " #b ", " #t " \n\t" :: ) + +/* input ABCD output ADTC ( or 0?31-2->0123 ) */ +#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \ + SBUTTERFLYwd( MMA, MMB, MMT ); \ + SBUTTERFLYwd( MMC, MMD, MMB ); \ + SBUTTERFLYdq( MMA, MMC, MMD ); \ + SBUTTERFLYdq( MMT, MMB, MMC ) + +/* first pass MM0 = MM0 -5*MM1 */ +#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" :: ) + \ +/* second pass MM0 = MM0 + 20*(MM2+MM3) */ +#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \ + asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \ + \ + "psllw $2, " #MMP2 "\n" \ + "paddw " #MMP2 "," #MMP0 "\n" \ + "psllw $2, " #MMP2 "\n" \ + "paddw " #MMP2 "," #MMP0 "\n" :: ) + +/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5 */ +#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" \ + \ + "paddw " #MMP2 "," #MMP0 "\n" \ + "paddw " #MMV "," #MMP0 "\n" \ + "psraw $5, " #MMP0 "\n" :: ) + +#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \ + asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \ + "psubw " #MMP3 "," #MMP2 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP3 "\n" \ + "psubw " #MMP1 "," #MMP0 "\n" \ + "psubw " #MMP3 "," #MMP2 "\n" :: ) + +/* second pass MM0 = MM0 + 20*(MM1+MM2) */ +#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \ + asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \ + "paddw " #MMP5 "," #MMP4 "\n" \ + \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP4 "\n" \ + "paddw " #MMP1 "," #MMP0 "\n" \ + "paddw " #MMP4 "," #MMP3 "\n" \ + "psllw $2, " #MMP1 "\n" \ + "psllw $2, " #MMP4 "\n" \ + "paddw " #MMP1 "," #MMP0 "\n" \ + "paddw " #MMP4 "," #MMP3 "\n" :: ) + +#define MMX_LOAD_1r( m1, dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \ + +#define MMX_SAVE_1r( m1, dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \ + +#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ) + +#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ) + +#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \ + asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \ + asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \ + asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) ) + +#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \ + asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \ + asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \ + asm volatile( "movq (%0), " 
#m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) ) + + +static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) +{ + return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next]; +} +static inline int x264_tapfilter1( uint8_t *pix ) +{ + return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; +} + +typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); + +/* NASM functions */ +extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); + +/* Macro to define NxM functions */ +/* mc I+H */ +#define MC_IH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ +} + +/* mc I+V */ +#define MC_IV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ +} + +/* mc H+V */ +#define MC_HV( name, cpu, width, height, off1, off2 ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+H */ +#define MC_CH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+V */ +#define MC_CV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + + +/***************************************************************************** + * MC with 
width == 4 (height <= 8) + *****************************************************************************/ + +extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + const int h4 = i_height / 4; + uint8_t srct[4*8*3]; + uint64_t tmp[4]; + int y; + + src -= 2; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < h4; y++ ) + { + int i; + + /* Preload data and transpose them */ + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 ); + + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 ); + + /* we read 2 more bytes that needed */ + MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 ); + + /* tap filter */ + for( i = 0; i < 4; i++ ) + { + MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 ); + MMX_FILTERTAP_P1( %%mm0, %%mm1 ); + MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 ); + + MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 ); + MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 ); + + MMX_SAVE_1r( %%mm0, &tmp[i] ); + } + + MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 ); + MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */ + MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] ); + MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] ); + MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] ); + MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] ); + + src += 4 * i_src; + dst += 4 * i_dst; + } +} +static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + int y; + + src -= 2 * i_src; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < i_height; y++ ) + { + MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src ); + MMX_FILTERTAP_P1( %%mm0, %%mm1 ); + MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 ); + + MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src ); + MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 ); + MMX_SAVE_4P( %%mm0, %%mm7, dst ); + + src += i_src; + dst += i_dst; + } +} + +static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + int i, x, y; + + for( y = 0; y < i_height; y++ ) + { + int16_t tap[5+4]; + + for( i = 0; i < 5+4; i++ ) + { + tap[i] = x264_tapfilter( &src[-2+i], i_src_stride ); + } + + for( x = 0; x < 4; x++ ) + { + dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 ); + } + + src += i_src_stride; + dst += i_dst_stride; + } +} + +MC_IH( mc_xy10, mmxext, 4, 8, 0 ) +MC_IH( mc_xy30, mmxext, 4, 8, 1 ) + +MC_IV( mc_xy01, mmxext, 4, 8, 0 ) +MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 ) +MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 ) +MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 4, 8, 0 ) +MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 4, 8, 0 ) +MC_CV( mc_xy32, mmxext, 4, 8, 1 ) + +#if 0 +static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hh_w4( src, i_src_stride, 
tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height ); +} +static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hh_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height ); +} + +static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height ); +} +static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[4*8]; + mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height ); +} + +static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} + +static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} + +static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[4*8]; + uint8_t tmp2[4*8]; + + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height ); + pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); +} +#endif + 
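+/* For reference, the scalar arithmetic that the MMX paths above implement.
+ * mc_hh_w4() and mc_hv_w4() run the H.264 6-tap filter (1,-5,20,20,-5,1)
+ * in one direction and round with +16 and a shift by 5, while mc_hc_w4()
+ * first filters vertically into 16-bit taps and then horizontally,
+ * rounding the wider intermediate with +512 and a shift by 10.  A minimal
+ * sketch of one centre half-pel sample; tapfilter_v() and clip1() are
+ * illustrative stand-ins for the x264_tapfilter() / x264_mc_clip1()
+ * helpers used by the C fallbacks above. */
+#if 0
+static inline int tapfilter_v( const uint8_t *pix, int i_stride )
+{
+    return pix[-2*i_stride] - 5*pix[-1*i_stride] + 20*pix[0]
+         + 20*pix[1*i_stride] - 5*pix[2*i_stride] + pix[3*i_stride];
+}
+static inline uint8_t clip1( int x )
+{
+    return x < 0 ? 0 : x > 255 ? 255 : x;
+}
+/* half-pel in both x and y: 6 vertical taps around column x, then the
+ * same 6-tap kernel across them, with 10-bit rounding as in mc_hc_w4() */
+static inline uint8_t mc_hc_point( const uint8_t *src, int i_stride, int x )
+{
+    int16_t tap[6];
+    int i;
+    for( i = 0; i < 6; i++ )
+        tap[i] = tapfilter_v( &src[x-2+i], i_stride );
+    return clip1( ( tap[0] - 5*tap[1] + 20*tap[2] + 20*tap[3]
+                  - 5*tap[4] + tap[5] + 512 ) >> 10 );
+}
+#endif
+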
+/***************************************************************************** + * MC with width == 8 (height <= 16) + *****************************************************************************/ +extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + mc_hh_w4( &src[0], i_src, &dst[0], i_dst, i_height ); + mc_hh_w4( &src[4], i_src, &dst[4], i_dst, i_height ); +} +static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + int y; + + src -= 2 * i_src; + + MMX_ZERO( %%mm7 ); + MMX_INIT( %%mm6, x264_w0x10 ); + + for( y = 0; y < i_height; y++ ) + { + MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7, &src[0*i_src], i_src ); + MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 ); + + + MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[2*i_src], i_src ); + MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 ); + + MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[4*i_src], i_src ); + MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 ); + MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 ); + + MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst ); + + src += i_src; + dst += i_dst; + } +} + +static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + int x, y; + + asm volatile( "pxor %%mm7, %%mm7\n" : : ); + + for( y = 0; y < i_height; y++ ) + { + int16_t tap[5+8]; + + /* first 8 */ + asm volatile( + "leal (%0, %1), %%eax\n" + + "movq (%0), %%mm0\n" /* load pix-2 */ + "movq %%mm0, %%mm2\n" + "punpcklbw %%mm7, %%mm0\n" + "punpckhbw %%mm7, %%mm2\n" + + "movq (%%eax),%%mm1\n" /* load pix-1 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "psubw %%mm3, %%mm2\n" + + "movq (%%eax,%1),%%mm1\n" /* load pix */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + + "movq (%%eax,%1,2),%%mm1\n" /* load pix+1 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "paddw %%mm3, %%mm2\n" + + "movq (%0,%1,4),%%mm1\n" /* load pix+2 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psllw $2, %%mm3\n" + "psubw %%mm3, %%mm2\n" + + "movq (%%eax,%1,4),%%mm1\n" /* load pix+3 */ + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm3\n" + "paddw %%mm1, %%mm0\n" + "paddw %%mm3, %%mm2\n" + + "movq %%mm0, (%2)\n" + "movq %%mm2, 8(%2)\n" + + + "addl $8, %%eax\n" + "addl $8, %0\n" + + + "movd (%0), %%mm0\n" /* load pix-2 */ + "punpcklbw %%mm7, %%mm0\n" + + "movd (%%eax),%%mm1\n" /* load pix-1 */ + "punpcklbw %%mm7, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + + "movd (%%eax,%1),%%mm1\n" /* load pix */ + "punpcklbw %%mm7, %%mm1\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movd (%%eax,%1,2),%%mm1\n" /* 
load pix+1 */ + "punpcklbw %%mm7, %%mm1\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movd (%0,%1,4),%%mm1\n" /* load pix+2 */ + "punpcklbw %%mm7, %%mm1\n" + "psubw %%mm1, %%mm0\n" + "psllw $2, %%mm1\n" + "psubw %%mm1, %%mm0\n" + + "movd (%%eax,%1,4),%%mm1\n" /* load pix+3 */ + "punpcklbw %%mm7, %%mm1\n" + "paddw %%mm1, %%mm0\n" + + "movq %%mm0, 16(%2)\n" + : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" ); + + /* last one */ + tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride ); + + for( x = 0; x < 8; x++ ) + { + dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 ); + } + + src += i_src_stride; + dst += i_dst_stride; + } +} + +MC_IH( mc_xy10, mmxext, 8, 16, 0 ) +MC_IH( mc_xy30, mmxext, 8, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 8, 16, 0 ) +MC_IV( mc_xy03, mmxext, 8, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 8, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 8, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 8, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 8, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 8, 16, 0 ) +MC_CH( mc_xy23, mmxext, 8, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 8, 16, 0 ) +MC_CV( mc_xy32, mmxext, 8, 16, 1 ) + +#if 0 +/* mc I+H */ +static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hh_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height ); +} +static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hh_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height ); +} +/* mc I+V */ +static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hv_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height ); +} +static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp[8*16]; + mc_hv_w8( src, i_src_stride, tmp, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height ); +} +/* H+V */ +static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hv_w8( src+1, i_src_stride, tmp1, 8, 
i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hv_w8( src, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + uint8_t tmp1[8*16]; + uint8_t tmp2[8*16]; + + mc_hc_w8( src, i_src_stride, tmp1, 8, i_height ); + mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); + pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); +} +#endif + +/***************************************************************************** + * MC with width == 16 (height <= 16) + *****************************************************************************/ + +extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); + +static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) +{ + mc_hh_w4( &src[ 0], i_src, &dst[ 0], i_dst, i_height ); + mc_hh_w4( &src[ 4], i_src, &dst[ 4], i_dst, i_height ); + mc_hh_w4( &src[ 8], i_src, &dst[ 8], i_dst, i_height ); + mc_hh_w4( &src[12], i_src, &dst[12], i_dst, i_height ); +} +static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + mc_hv_w8( src, i_src_stride, dst, i_dst_stride, i_height ); + mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height ); +} + +static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + mc_hc_w8( src, i_src_stride, dst, i_dst_stride, i_height ); + mc_hc_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height ); +} + +/* MMX avg/copy */ +MC_IH( mc_xy10, mmxext, 16, 16, 0 ) +MC_IH( mc_xy30, mmxext, 16, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 16, 16, 0 ) +MC_IV( mc_xy03, mmxext, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 16, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 16, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 16, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 16, 16, 0 ) +MC_CH( mc_xy23, mmxext, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 16, 16, 0 ) +MC_CV( mc_xy32, mmxext, 16, 16, 1 ) + +/* SSE2 avg/copy */ +MC_IH( mc_xy10, sse2, 16, 16, 0 ) +MC_IH( mc_xy30, sse2, 16, 16, 1 ) + +MC_IV( mc_xy01, sse2, 16, 16, 0 ) +MC_IV( mc_xy03, sse2, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, sse2, 16, 16, 0, 0 ) +MC_HV( mc_xy31, sse2, 16, 16, 1, 0 ) +MC_HV( mc_xy13, sse2, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, sse2, 16, 16, 1, i_src_stride ) + +MC_CH( 
mc_xy21, sse2, 16, 16, 0 ) +MC_CH( mc_xy23, sse2, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, sse2, 16, 16, 0 ) +MC_CV( mc_xy32, sse2, 16, 16, 1 ) + + +#if 0 +/* mc I+H */ +static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); +} +static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hh_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height ); +} +/* mc I+V */ +static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height ); +} +static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp[16*16], 16); + mc_hv_w16( src, i_src_stride, tmp, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height ); +} +/* H+V */ +static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hv_w16( src, 
i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +{ + DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16); + DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16); + + mc_hc_w16( src, i_src_stride, tmp1, 16, i_height ); + mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); + pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); +} +#endif + +#define MOTION_COMPENSATION_LUMA \ + src += (mvy >> 2) * i_src_stride + (mvx >> 2); \ + if( i_width == 4 ) \ + { \ + pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 8 ) \ + { \ + pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 16 ) \ + { \ + pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else \ + { \ + fprintf( stderr, "Error: motion_compensation_luma called with invalid width" ); \ + } + +static void motion_compensation_luma_mmxext( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ + { + { + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, + }, + { + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, + }, + { + { x264_mc_copy_w16_mmxext, mc_xy10_w16_mmxext, mc_hh_w16, mc_xy30_w16_mmxext }, + { mc_xy01_w16_mmxext, mc_xy11_w16_mmxext, mc_xy21_w16_mmxext, mc_xy31_w16_mmxext }, + { mc_hv_w16, mc_xy12_w16_mmxext, mc_hc_w16, mc_xy32_w16_mmxext }, + { mc_xy03_w16_mmxext, mc_xy13_w16_mmxext, mc_xy23_w16_mmxext, mc_xy33_w16_mmxext }, + } + }; + + MOTION_COMPENSATION_LUMA +} + +static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ + { + { + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, + }, + { + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, + }, + { + { 
x264_mc_copy_w16_sse2, mc_xy10_w16_sse2, mc_hh_w16, mc_xy30_w16_sse2 }, + { mc_xy01_w16_sse2, mc_xy11_w16_sse2, mc_xy21_w16_sse2, mc_xy31_w16_sse2 }, + { mc_hv_w16, mc_xy12_w16_sse2, mc_hc_w16, mc_xy32_w16_sse2 }, + { mc_xy03_w16_sse2, mc_xy13_w16_sse2, mc_xy23_w16_sse2, mc_xy33_w16_sse2 }, + } + }; + MOTION_COMPENSATION_LUMA +} + +#endif + +void mc_luma_mmx( uint8_t *src[4], int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + uint8_t *src1, *src2; + + /* todo : fixme... */ + int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0; + + int hpel1x = mvx>>1; + int hpel1y = (mvy+1-correction)>>1; + int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 ); + + + src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1); + + if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */ + { + int hpel2x = (mvx+1)>>1; + int hpel2y = (mvy+correction)>>1; + int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 ); + + src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1); + + switch(i_width) { + case 4: + x264_pixel_avg_w4_mmxext( dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 8: + x264_pixel_avg_w8_mmxext( dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 16: + default: + x264_pixel_avg_w16_mmxext(dst, i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + } + } + else + { + switch(i_width) { + case 4: + x264_mc_copy_w4_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + case 8: + x264_mc_copy_w8_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + case 16: + x264_mc_copy_w16_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height ); + break; + } + + } +} + +uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride, + uint8_t *dst, int *i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + uint8_t *src1, *src2; + + /* todo : fixme... */ + int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 
1:0; + + int hpel1x = mvx>>1; + int hpel1y = (mvy+1-correction)>>1; + int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 ); + + + src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1); + + if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */ + { + int hpel2x = (mvx+1)>>1; + int hpel2y = (mvy+correction)>>1; + int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 ); + + src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1); + + switch(i_width) { + case 4: + x264_pixel_avg_w4_mmxext( dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 8: + x264_pixel_avg_w8_mmxext( dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + break; + case 16: + default: + x264_pixel_avg_w16_mmxext(dst, *i_dst_stride, src1, i_src_stride, + src2, i_src_stride, i_height ); + } + return dst; + + } + else + { + *i_dst_stride = i_src_stride; + return src1; + } +} + + +void x264_mc_mmxext_init( x264_mc_functions_t *pf ) +{ + pf->mc_luma = mc_luma_mmx; + pf->get_ref = get_ref_mmx; +} +void x264_mc_sse2_init( x264_mc_functions_t *pf ) +{ + /* todo: use sse2 */ + pf->mc_luma = mc_luma_mmx; + pf->get_ref = get_ref_mmx; +} + +#if 0 +void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) +{ + *int_h = mc_hh_w16; + *int_v = mc_hv_w16; + *int_hv = mc_hc_w16; +} + +void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv) +{ + *int_h = mc_hh_w16; + *int_v = mc_hv_w16; + *int_hv = mc_hc_w16; +} +#endif diff --git a/common/amd64/mc.h b/common/amd64/mc.h new file mode 100644 index 00000000..69766167 --- /dev/null +++ b/common/amd64/mc.h @@ -0,0 +1,34 @@ +/***************************************************************************** + * mc.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifndef _I386_MC_H +#define _I386_MC_H 1 + +void x264_mc_mmxext_init( x264_mc_functions_t *pf ); +void x264_mc_sse2_init( x264_mc_functions_t *pf ); + +void x264_mc_chroma_sse( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int dx, int dy, + int i_height, int i_width ); +#endif diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm new file mode 100644 index 00000000..9760173f --- /dev/null +++ b/common/amd64/pixel-a.asm @@ -0,0 +1,811 @@ +;***************************************************************************** +;* pixel.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003 x264 project +;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* +;* Authors: Laurent Aimar +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +%macro SAD_INC_2x16P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+8] + movq mm4, [ecx+8] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + movq mm1, [eax+ebx] + movq mm2, [ecx+edx] + movq mm3, [eax+ebx+8] + movq mm4, [ecx+edx+8] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SAD_INC_2x8P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+ebx] + movq mm4, [ecx+edx] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SAD_INC_2x4P 0 + movd mm1, [eax] + movd mm2, [ecx] + movd mm3, [eax+ebx] + movd mm4, [ecx+edx] + + psadbw mm1, mm2 + psadbw mm3, mm4 + paddw mm0, mm1 + paddw mm0, mm3 + + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] +%endmacro + +%macro SSD_INC_1x16P 0 + movq mm1, [eax] + movq mm2, [ecx] + movq mm3, [eax+8] + movq mm4, [ecx+8] + + movq mm5, mm2 + movq mm6, mm4 + psubusb mm2, mm1 + psubusb mm4, mm3 + psubusb mm1, mm5 + psubusb mm3, mm6 + por mm1, mm2 + por mm3, mm4 + + movq mm2, mm1 + movq mm4, mm3 + punpcklbw mm1, mm7 + punpcklbw mm3, mm7 + punpckhbw mm2, mm7 + punpckhbw mm4, mm7 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + pmaddwd mm4, mm4 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 + paddd mm0, mm2 + paddd mm0, mm3 + paddd mm0, mm4 +%endmacro + +%macro SSD_INC_1x8P 0 + movq mm1, [eax] + movq mm2, [ecx] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, 
mm2 ; mm1 = 8bit abs diff + + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 + paddd mm0, mm2 +%endmacro + +%macro SSD_INC_1x4P 0 + movd mm1, [eax] + movd mm2, [ecx] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, mm2 + punpcklbw mm1, mm7 + pmaddwd mm1, mm1 + + add eax, ebx + add ecx, edx + paddd mm0, mm1 +%endmacro + +%macro SSD_INC_8x16P 0 + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P + SSD_INC_1x16P +%endmacro + +%macro SSD_INC_4x8P 0 + SSD_INC_1x8P + SSD_INC_1x8P + SSD_INC_1x8P + SSD_INC_1x8P +%endmacro + +%macro SSD_INC_4x4P 0 + SSD_INC_1x4P + SSD_INC_1x4P + SSD_INC_1x4P + SSD_INC_1x4P +%endmacro + +%macro LOAD_DIFF_4P 5 ; MMP, MMT, MMZ, [pix1], [pix2] + movd %1, %4 + punpcklbw %1, %3 + movd %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset + LOAD_DIFF_4P %1, %5, %6, [%7+%11], [%9+%11] + LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11] + lea %7, [%7+2*%8] + lea %9, [%9+2*%10] + LOAD_DIFF_4P %3, %5, %6, [%7+%11], [%9+%11] + LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11] + lea %7, [%7+2*%8] + lea %9, [%9+2*%10] +%endmacro + +%macro HADAMARD4_SUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro HADAMARD4x4 4 + HADAMARD4_SUB_BADC %1, %2, %3, %4 + HADAMARD4_SUB_BADC %1, %3, %2, %4 +%endmacro + +%macro SBUTTERFLYwd 3 + movq %3, %1 + punpcklwd %1, %2 + punpckhwd %3, %2 +%endmacro + +%macro SBUTTERFLYdq 3 + movq %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +%macro TRANSPOSE4x4 5 ; abcd-t -> adtc + SBUTTERFLYwd %1, %2, %5 + SBUTTERFLYwd %3, %4, %2 + SBUTTERFLYdq %1, %3, %4 + SBUTTERFLYdq %5, %2, %3 +%endmacro + +%macro MMX_ABS 2 ; mma, mmt + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + +%macro MMX_ABS_SUM 3 ; mma, mmt, mms + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 + paddusw %3, %1 +%endmacro + + +%macro MMX_SUM_MM 2 ; mmv, mmt + movq %2, %1 + psrlq %1, 32 + paddusw %1, %2 + movq %2, %1 + psrlq %1, 16 + paddusw %1, %2 + movd eax,%1 + and eax,0xffff + shr eax,1 +%endmacro + +%macro HADAMARD4x4_FIRST 0 + HADAMARD4x4 mm0, mm1, mm2, mm3 + TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 + HADAMARD4x4 mm0, mm3, mm4, mm2 + MMX_ABS mm0, mm7 + MMX_ABS_SUM mm3, mm7, mm0 + MMX_ABS_SUM mm4, mm7, mm0 + MMX_ABS_SUM mm2, mm7, mm0 +%endmacro + +%macro HADAMARD4x4_NEXT 0 + HADAMARD4x4 mm1, mm2, mm3, mm4 + TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5 + HADAMARD4x4 mm1, mm4, mm5, mm3 + MMX_ABS_SUM mm1, mm7, mm0 + MMX_ABS_SUM mm4, mm7, mm0 + MMX_ABS_SUM mm5, mm7, mm0 + MMX_ABS_SUM mm3, mm7, mm0 +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal x264_pixel_sad_16x16_mmxext +cglobal x264_pixel_sad_16x8_mmxext +cglobal x264_pixel_sad_8x16_mmxext +cglobal x264_pixel_sad_8x8_mmxext +cglobal x264_pixel_sad_8x4_mmxext +cglobal x264_pixel_sad_4x8_mmxext +cglobal x264_pixel_sad_4x4_mmxext + +cglobal x264_pixel_ssd_16x16_mmxext +cglobal x264_pixel_ssd_16x8_mmxext +cglobal x264_pixel_ssd_8x16_mmxext +cglobal x264_pixel_ssd_8x8_mmxext +cglobal x264_pixel_ssd_8x4_mmxext +cglobal x264_pixel_ssd_4x8_mmxext +cglobal x264_pixel_ssd_4x4_mmxext + +cglobal x264_pixel_satd_4x4_mmxext +cglobal 
x264_pixel_satd_4x8_mmxext +cglobal x264_pixel_satd_8x4_mmxext +cglobal x264_pixel_satd_8x8_mmxext +cglobal x264_pixel_satd_16x8_mmxext +cglobal x264_pixel_satd_8x16_mmxext +cglobal x264_pixel_satd_16x16_mmxext + +%macro SAD_START 0 + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm0, mm0 +%endmacro +%macro SAD_END 0 + movd eax, mm0 + + pop ebx + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_16x16_mmxext: + SAD_START + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_16x8_mmxext: + SAD_START + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_INC_2x16P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x16_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x8_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_8x4_mmxext: + SAD_START + SAD_INC_2x8P + SAD_INC_2x8P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_4x8_mmxext: + SAD_START + SAD_INC_2x4P + SAD_INC_2x4P + SAD_INC_2x4P + SAD_INC_2x4P + SAD_END + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sad_4x4_mmxext: + SAD_START + SAD_INC_2x4P + SAD_INC_2x4P + SAD_END + + + +%macro SSD_START 0 + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 ; zero + pxor mm0, mm0 ; mm0 holds the sum +%endmacro + +%macro SSD_END 0 + movq mm1, mm0 + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + + pop ebx + ret +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) 
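+; SSD kernels: each SSD_INC_* step computes the per-byte absolute
+; difference with two saturating subtractions and a por, widens it to
+; 16 bits against mm7 (zero), and uses pmaddwd to square and add
+; adjacent pairs into the mm0 accumulator; SSD_END then folds the two
+; 32-bit halves of mm0 into eax.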
+;----------------------------------------------------------------------------- +x264_pixel_ssd_16x16_mmxext: + SSD_START + SSD_INC_8x16P + SSD_INC_8x16P + SSD_END + +ALIGN 16 +x264_pixel_ssd_16x8_mmxext: + SSD_START + SSD_INC_8x16P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x16_mmxext: + SSD_START + SSD_INC_4x8P + SSD_INC_4x8P + SSD_INC_4x8P + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x8_mmxext: + SSD_START + SSD_INC_4x8P + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_8x4_mmxext: + SSD_START + SSD_INC_4x8P + SSD_END + +ALIGN 16 +x264_pixel_ssd_4x8_mmxext: + SSD_START + SSD_INC_4x4P + SSD_INC_4x4P + SSD_END + +ALIGN 16 +x264_pixel_ssd_4x4_mmxext: + SSD_START + SSD_INC_4x4P + SSD_END + + + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_4x4_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_4P mm0, mm6, mm7, [eax], [ecx] + LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx], [ecx+edx] + LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx] + add eax, ebx + add ecx, edx + LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx] + + HADAMARD4x4_FIRST + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_4x8_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x4_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + mov eax, [esp+ 8] ; pix1 + mov ecx, [esp+16] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x8_mmxext: + push ebx + + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + + pxor mm7, mm7 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + mov eax, [esp+ 8] ; pix1 + mov ecx, [esp+16] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + 
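+    ; fourth and last 4x4 sub-block (rows 4-7 of columns 4-7); mm0 holds
+    ; the running sum of absolute Hadamard coefficients, which MMX_SUM_MM
+    ; folds into eax (and halves) below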
LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_16x8_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_8x16_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_satd_16x16_mmxext: + push ebx + push ebp + + mov eax, [esp+12] ; pix1 + mov ebx, [esp+16] ; stride1 + mov ecx, [esp+20] ; pix2 + mov edx, [esp+24] ; stride2 + + pxor mm7, mm7 + xor ebp, ebp + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, 
mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + mov ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add ebp, eax + + mov eax, [esp+12] ; pix1 + mov ecx, [esp+20] ; pix2 + + LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_FIRST + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12 + HADAMARD4x4_NEXT + + MMX_SUM_MM mm0, mm7 + add eax, ebp + + pop ebp + pop ebx + ret + diff --git a/common/amd64/pixel.h b/common/amd64/pixel.h new file mode 100644 index 00000000..43916c0a --- /dev/null +++ b/common/amd64/pixel.h @@ -0,0 +1,51 @@ +/***************************************************************************** + * mc.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifndef _I386_PIXEL_H +#define _I386_PIXEL_H 1 + +int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); + +#endif diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm new file mode 100644 index 00000000..3237ebb6 --- /dev/null +++ b/common/amd64/predict-a.asm @@ -0,0 +1,141 @@ +;***************************************************************************** +;* predict-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005 x264 project +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +BITS 32 + +;============================================================================= +; Macros and other preprocessor constants +;============================================================================= + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +;============================================================================= +; Read only data +;============================================================================= + +SECTION .rodata data align=16 + +SECTION .data + +;============================================================================= +; Macros +;============================================================================= + +%macro SAVE_0_1 1 + movq [%1] , mm0 + movq [%1 + 8] , mm1 +%endmacro + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal predict_8x8_v_mmx +cglobal predict_16x16_v_mmx + +;----------------------------------------------------------------------------- +; +; void predict_8x8_v_mmx( uint8_t *src, int i_stride ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_8x8_v_mmx : + + ;push edi + ;push esi + + mov edx , [esp + 4] + mov ecx , [esp + 8] + sub edx , ecx ; esi <-- line -1 + + movq mm0 , [edx] + movq [edx + ecx] , mm0 ; 0 + movq [edx + 2 * ecx] , mm0 ; 1 + movq [edx + 4 * ecx] , mm0 ; 3 + movq [edx + 8 * ecx] , mm0 ; 7 + add edx , ecx ; esi <-- line 0 + movq [edx + 2 * ecx] , mm0 ; 2 + movq [edx + 4 * ecx] , mm0 ; 4 + lea edx , [edx + 4 * ecx] ; esi <-- line 4 + movq [edx + ecx] , mm0 ; 5 + movq [edx + 2 * ecx] , mm0 ; 6 + + ;pop esi + ;pop edi + + ret + +;----------------------------------------------------------------------------- +; +; void predict_16x16_v_mmx( uint8_t *src, int i_stride ) +; +;----------------------------------------------------------------------------- + +ALIGN 16 +predict_16x16_v_mmx : + + ;push edi + ;push esi + + mov edx, [esp + 4] + mov ecx, [esp + 8] + sub edx, ecx ; esi <-- line -1 + + movq mm0, [edx] + movq mm1, [edx + 8] + mov eax, ecx + shl eax, 1 + add eax, ecx ; eax <-- 3* stride + + SAVE_0_1 (edx + ecx) ; 0 + SAVE_0_1 (edx + 2 * ecx) ; 1 + SAVE_0_1 (edx + eax) ; 2 + SAVE_0_1 (edx + 4 * ecx) ; 3 + SAVE_0_1 (edx + 2 * eax) ; 5 + SAVE_0_1 (edx + 8 * ecx) ; 7 + SAVE_0_1 (edx + 4 * eax) ; 11 + add edx, ecx ; esi <-- line 0 + SAVE_0_1 (edx + 4 * ecx) ; 4 + SAVE_0_1 (edx + 2 * eax) ; 6 + SAVE_0_1 (edx + 8 * ecx) ; 8 + SAVE_0_1 (edx + 4 * eax) ; 12 + lea edx, [edx + 8 * ecx] ; esi <-- line 8 + SAVE_0_1 (edx + ecx) ; 9 + SAVE_0_1 (edx + 2 * ecx) ; 10 + lea edx, [edx + 4 * ecx] ; esi <-- line 12 + SAVE_0_1 (edx + ecx) ; 13 + SAVE_0_1 (edx + 2 * ecx) ; 14 + SAVE_0_1 (edx + eax) ; 15 + + + ;pop esi + ;pop edi + + ret diff --git a/common/amd64/predict.c b/common/amd64/predict.c new file mode 100644 index 00000000..5422f15c --- /dev/null +++ b/common/amd64/predict.c @@ -0,0 +1,444 @@ +/***************************************************************************** + * predict.c: h264 encoder + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the 
terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +/* XXX predict4x4 are inspired from ffmpeg h264 decoder + */ + +#ifdef HAVE_STDINT_H +#include +#else +#include +#endif +#include +#include + +#include "x264.h" /* for keyword inline */ +#include "common/predict.h" +#include "predict.h" + +static inline int clip_uint8( int a ) +{ + if (a&(~255)) + return (-a)>>31; + else + return a; +} + +/**************************************************************************** + * 16x16 prediction for intra block DC, H, V, P + ****************************************************************************/ +static void predict_16x16_dc( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + /* calculate DC value */ + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * i_stride]; + dc += src[i - i_stride]; + } + dc = (( dc + 16 ) >> 5) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_left( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[-1 + i * i_stride]; + } + dc = (( dc + 8 ) >> 4) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_top( uint8_t *src, int i_stride ) +{ + uint32_t dc = 0; + int i; + + for( i = 0; i < 16; i++ ) + { + dc += src[i - i_stride]; + } + dc = (( dc + 8 ) >> 4) * 0x01010101; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = dc; + *p++ = dc; + *p++ = dc; + *p++ = dc; + + src += i_stride; + } +} +static void predict_16x16_dc_128( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 16; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = 0x80808080; + *p++ = 0x80808080; + *p++ = 0x80808080; + *p++ = 0x80808080; + + src += i_stride; + } +} +static void predict_16x16_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 16; i++ ) + { + const uint32_t v = 0x01010101 * src[-1]; + uint32_t *p = (uint32_t*)src; + + *p++ = v; + *p++ = v; + *p++ = v; + *p++ = v; + + src += i_stride; + + } +} + +extern predict_16x16_v_mmx( uint8_t *src, int i_stride ); + +#if 0 +static void predict_16x16_v( uint8_t *src, int i_stride ) +{ + int i; + + asm volatile( + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" :: "r"(&src[-i_stride]) ); + + for( i = 0; i < 16; i++ ) + { + asm volatile( + "movq %%mm0, (%0)\n" + "movq %%mm1, 8(%0)\n" :: "r"(src) ); + src += i_stride; + } +} +#endif + +/**************************************************************************** + * 8x8 prediction for intra chroma block DC, H, V, P + ****************************************************************************/ +static void predict_8x8_dc_128( uint8_t *src, int i_stride ) +{ + int y; + + 
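+    /* neither the left nor the top neighbour is available here, so the
+     * whole 8x8 chroma block is filled with mid-grey: 128 replicated
+     * into every 32-bit store as 0x80808080 */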
for( y = 0; y < 8; y++ ) + { + uint32_t *p = (uint32_t*)src; + + *p++ = 0x80808080; + *p++ = 0x80808080; + + src += i_stride; + } +} +static void predict_8x8_dc_left( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc0 = 0, dc1 = 0; + + for( y = 0; y < 4; y++ ) + { + dc0 += src[y * i_stride - 1]; + dc1 += src[(y+4) * i_stride - 1]; + } + dc0 = (( dc0 + 2 ) >> 2)*0x01010101; + dc1 = (( dc1 + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc0; + + src += i_stride; + } + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc1; + *p++ = dc1; + + src += i_stride; + } + +} +static void predict_8x8_dc_top( uint8_t *src, int i_stride ) +{ + int y, x; + uint32_t dc0 = 0, dc1 = 0; + + for( x = 0; x < 4; x++ ) + { + dc0 += src[x - i_stride]; + dc1 += src[x + 4 - i_stride]; + } + dc0 = (( dc0 + 2 ) >> 2)*0x01010101; + dc1 = (( dc1 + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 8; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc1; + + src += i_stride; + } +} +static void predict_8x8_dc( uint8_t *src, int i_stride ) +{ + int y; + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + uint32_t dc0, dc1, dc2, dc3; + int i; + + /* First do : + s0 s1 + s2 + s3 + */ + for( i = 0; i < 4; i++ ) + { + s0 += src[i - i_stride]; + s1 += src[i + 4 - i_stride]; + s2 += src[-1 + i * i_stride]; + s3 += src[-1 + (i+4)*i_stride]; + } + /* now calculate + dc0 dc1 + dc2 dc3 + */ + dc0 = (( s0 + s2 + 4 ) >> 3)*0x01010101; + dc1 = (( s1 + 2 ) >> 2)*0x01010101; + dc2 = (( s3 + 2 ) >> 2)*0x01010101; + dc3 = (( s1 + s3 + 4 ) >> 3)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc0; + *p++ = dc1; + + src += i_stride; + } + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p++ = dc2; + *p++ = dc3; + + src += i_stride; + } +} + +static void predict_8x8_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 8; i++ ) + { + uint32_t v = 0x01010101 * src[-1]; + uint32_t *p = (uint32_t*)src; + + *p++ = v; + *p++ = v; + + src += i_stride; + } +} + +extern void predict_8x8_v_mmx( uint8_t *src, int i_stride ); + +#if 0 +static void predict_8x8_v( uint8_t *src, int i_stride ) +{ + int i; + + asm volatile( "movq (%0), %%mm0\n" :: "r"(&src[-i_stride]) ); + + for( i = 0; i < 8; i++ ) + { + asm volatile( "movq %%mm0, (%0)\n" :: "r"(src) ); + src += i_stride; + } +} +#endif + + +/**************************************************************************** + * 4x4 prediction for intra luma block DC, H, V, P + ****************************************************************************/ +static void predict_4x4_dc_128( uint8_t *src, int i_stride ) +{ + int y; + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = 0x80808080; + + src += i_stride; + } +} +static void predict_4x4_dc_left( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+ + src[-1+2*i_stride] + src[-1+3*i_stride] + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + + src += i_stride; + } +} +static void predict_4x4_dc_top( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[0 - i_stride] + src[1 - i_stride] + + src[2 - i_stride] + src[3 - i_stride] + 2 ) >> 2)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + src += i_stride; + } +} +static void predict_4x4_dc( uint8_t *src, int i_stride ) +{ + int y; + uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+ + 
src[-1+2*i_stride] + src[-1+3*i_stride] + + src[0 - i_stride] + src[1 - i_stride] + + src[2 - i_stride] + src[3 - i_stride] + 4 ) >> 3)*0x01010101; + + for( y = 0; y < 4; y++ ) + { + uint32_t *p = (uint32_t*)src; + *p = dc; + + src += i_stride; + } +} +static void predict_4x4_h( uint8_t *src, int i_stride ) +{ + int i; + + for( i = 0; i < 4; i++ ) + { + uint32_t *p = (uint32_t*)src; + *p = 0x01010101*src[-1]; + + src += i_stride; + } +} +static void predict_4x4_v( uint8_t *src, int i_stride ) +{ + uint32_t top = *((uint32_t*)&src[-i_stride]); + int i; + + for( i = 0; i < 4; i++ ) + { + uint32_t *p = (uint32_t*)src; + + *p = top; + + src += i_stride; + } +} + +/**************************************************************************** + * Exported functions: + ****************************************************************************/ +void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) +{ + pf[I_PRED_16x16_V ] = predict_16x16_v_mmx; + pf[I_PRED_16x16_H ] = predict_16x16_h; + pf[I_PRED_16x16_DC] = predict_16x16_dc; + pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left; + pf[I_PRED_16x16_DC_TOP ]= predict_16x16_dc_top; + pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128; +} + +void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] ) +{ + pf[I_PRED_CHROMA_V ] = predict_8x8_v_mmx; + pf[I_PRED_CHROMA_H ] = predict_8x8_h; + pf[I_PRED_CHROMA_DC] = predict_8x8_dc; + pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left; + pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top; + pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128; +} + +void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] ) +{ + pf[I_PRED_4x4_V] = predict_4x4_v; + pf[I_PRED_4x4_H] = predict_4x4_h; + pf[I_PRED_4x4_DC] = predict_4x4_dc; + pf[I_PRED_4x4_DC_LEFT]= predict_4x4_dc_left; + pf[I_PRED_4x4_DC_TOP] = predict_4x4_dc_top; + pf[I_PRED_4x4_DC_128] = predict_4x4_dc_128; +} + diff --git a/common/amd64/predict.h b/common/amd64/predict.h new file mode 100644 index 00000000..b00b1e59 --- /dev/null +++ b/common/amd64/predict.h @@ -0,0 +1,31 @@ +/***************************************************************************** + * predict.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003 Laurent Aimar + * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ + * + * Authors: Laurent Aimar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef _I386_PREDICT_H +#define _I386_PREDICT_H 1 + +void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] ); +void x264_predict_8x8_init_mmxext ( x264_predict_t pf[7] ); +void x264_predict_4x4_init_mmxext ( x264_predict_t pf[12] ); + +#endif -- 2.40.0
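
A note on the quarter-pel dispatch in mc_luma_mmx() / get_ref_mmx() above: they never filter on the fly but pick from four pre-interpolated planes, src[0] holding full-pel pixels and src[1], src[2], src[3] the horizontal, vertical and centre half-pel planes (the layout the index math implies), via filter = (hpelx & 1) + ((hpely & 1) << 1); an odd quarter-pel component is then produced by averaging the two nearest planes. The scalar sketch below mirrors that selection, including the diagonal-case correction; mc_luma_ref() is an illustrative name, and the (a+b+1)>>1 average is the pavgb-style rounding the x264_pixel_avg_w*_mmxext kernels are expected to perform.

#include <stdint.h>

static void mc_luma_ref( uint8_t *src[4], int i_src, uint8_t *dst, int i_dst,
                         int mvx, int mvy, int i_width, int i_height )
{
    /* same plane/offset selection as mc_luma_mmx() */
    int correction = ( ((mvx&3) == 3 && (mvy&3) == 1) ||
                       ((mvx&3) == 1 && (mvy&3) == 3) ) ? 1 : 0;
    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    uint8_t *src1 = src[(hpel1x&1) + ((hpel1y&1)<<1)]
                  + (hpel1y>>1) * i_src + (hpel1x>>1);
    int x, y;

    if( (mvx|mvy) & 1 )         /* quarter-pel: average two half-pel planes */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        uint8_t *src2 = src[(hpel2x&1) + ((hpel2y&1)<<1)]
                      + (hpel2y>>1) * i_src + (hpel2x>>1);
        for( y = 0; y < i_height; y++, src1 += i_src, src2 += i_src, dst += i_dst )
            for( x = 0; x < i_width; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
    }
    else                        /* integer or half-pel position: plain copy */
    {
        for( y = 0; y < i_height; y++, src1 += i_src, dst += i_dst )
            for( x = 0; x < i_width; x++ )
                dst[x] = src1[x];
    }
}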