From: Loren Merritt Date: Sun, 16 Mar 2008 19:54:58 +0000 (-0600) Subject: merge x86_32 and x86_64 asm, with macros to abstract calling convention and register... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c17218e8a37ca1ed93a0852b73acc5d4cc046bb8;p=libx264 merge x86_32 and x86_64 asm, with macros to abstract calling convention and register names --- diff --git a/.gitignore b/.gitignore index e5121211..582b9759 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ config.h config.mak x264 +checkasm gtk/test gtk/x264_gtk_encode gtk/x264_icon.h diff --git a/Makefile b/Makefile index daf26d64..3f3a16ba 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,8 @@ include config.mak +all: default + SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \ common/frame.c common/dct.c common/cpu.c common/cabac.c \ common/common.c common/mdate.c common/set.c \ @@ -18,30 +20,26 @@ SRCS += common/visualize.c common/display-x11.c endif # MMX/SSE optims -ifeq ($(ARCH),X86) ifneq ($(AS),) -SRCS += common/i386/mc-c.c common/i386/predict-c.c -ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \ - common/i386/pixel-a.asm common/i386/mc-a.asm \ - common/i386/mc-a2.asm common/i386/predict-a.asm \ - common/i386/pixel-sse2.asm common/i386/quant-a.asm \ - common/i386/deblock-a.asm +X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \ + pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \ + cpu-32.asm dct-32.asm +X86SRC = $(X86SRC0:%=common/x86/%) + +ifeq ($(ARCH),X86) +SRCS += common/x86/mc-c.c common/x86/predict-c.c +ASMSRC = $(X86SRC) common/x86/pixel-32.asm OBJASM = $(ASMSRC:%.asm=%.o) -ASFLAGS += -Icommon/i386/ -endif +ASFLAGS += -Icommon/x86/ +$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-32.asm endif -# MMX/SSE optims ifeq ($(ARCH),X86_64) -ifneq ($(AS),) -SRCS += common/i386/mc-c.c common/i386/predict-c.c -ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \ - common/amd64/pixel-a.asm common/amd64/mc-a.asm \ - common/amd64/mc-a2.asm common/amd64/predict-a.asm \ - common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \ - common/amd64/deblock-a.asm +SRCS += common/x86/mc-c.c common/x86/predict-c.c +ASMSRC = $(X86SRC:-32.asm=-64.asm) OBJASM = $(ASMSRC:%.asm=%.o) -ASFLAGS += -Icommon/amd64 +ASFLAGS += -Icommon/x86/ -DARCH_X86_64 +$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-64.asm endif endif @@ -69,7 +67,6 @@ OBJCLI = $(SRCCLI:%.c=%.o) DEP = depend .PHONY: all default fprofiled clean distclean install install-gtk uninstall dox test testclean -all: default default: $(DEP) x264$(EXE) @@ -89,8 +86,6 @@ libx264gtk.a: muxers.o libx264.a checkasm: tools/checkasm.o libx264.a $(CC) -o $@ $+ $(LDFLAGS) -common/amd64/*.o: common/amd64/amd64inc.asm -common/i386/*.o: common/i386/i386inc.asm %.o: %.asm $(AS) $(ASFLAGS) -o $@ $< # delete local/anonymous symbols, so they don't show up in oprofile diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm deleted file mode 100644 index 805afea9..00000000 --- a/common/amd64/dct-a.asm +++ /dev/null @@ -1,520 +0,0 @@ -;***************************************************************************** -;* dct.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ -;* -;* Authors: Laurent Aimar (initial version) -;* Min Chen (converted to nasm) -;* Loren Merritt (dct8) -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General 
Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. -;***************************************************************************** - -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2004.04.28 portab all 4x4 function to nasm (CM) * -;* * -;***************************************************************************** - -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -%macro MMX_ZERO 1 - pxor %1, %1 -%endmacro - -%macro MMX_LOAD_DIFF_4P 5 - movd %1, %4 - punpcklbw %1, %3 - movd %2, %5 - punpcklbw %2, %3 - psubw %1, %2 -%endmacro - -%macro MMX_LOAD_DIFF_8P 5 - movq %1, %4 - punpcklbw %1, %3 - movq %2, %5 - punpcklbw %2, %3 - psubw %1, %2 -%endmacro - -%macro MMX_SUMSUB_BA 2 - paddw %1, %2 - paddw %2, %2 - psubw %2, %1 -%endmacro - -%macro MMX_SUMSUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro MMX_SUMSUB2_AB 3 - movq %3, %1 - paddw %1, %1 - paddw %1, %2 - psubw %3, %2 - psubw %3, %2 -%endmacro - -%macro MMX_SUMSUBD2_AB 4 - movq %4, %1 - movq %3, %2 - psraw %2, 1 - psraw %4, 1 - paddw %1, %2 - psubw %4, %3 -%endmacro - -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 -%endmacro - -;----------------------------------------------------------------------------- -; input ABCD output ADTC -;----------------------------------------------------------------------------- -%macro MMX_TRANSPOSE 5 - SBUTTERFLY q, wd, %1, %2, %5 - SBUTTERFLY q, wd, %3, %4, %2 - SBUTTERFLY q, dq, %1, %3, %4 - SBUTTERFLY q, dq, %5, %2, %3 -%endmacro - -;----------------------------------------------------------------------------- -; input ABCDEFGH output AFHDTECB -;----------------------------------------------------------------------------- -%macro SSE2_TRANSPOSE8x8 9 - SBUTTERFLY dqa, wd, %1, %2, %9 - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, wd, %5, %6, %4 - SBUTTERFLY dqa, wd, %7, %8, %6 - SBUTTERFLY dqa, dq, %1, %3, %8 - SBUTTERFLY dqa, dq, %9, %2, %3 - SBUTTERFLY dqa, dq, %5, %7, %2 - SBUTTERFLY dqa, dq, %4, %6, %7 - SBUTTERFLY dqa, qdq, %1, %5, %6 - SBUTTERFLY dqa, qdq, %9, %4, %5 - SBUTTERFLY dqa, qdq, %8, %2, %4 - SBUTTERFLY dqa, qdq, %3, %7, %2 -%endmacro - -%macro MMX_STORE_DIFF_4P 5 - paddw %1, %3 - psraw %1, 6 - movd %2, %5 - punpcklbw %2, %4 - paddsw %1, %2 - packuswb %1, %1 - movd %5, %1 -%endmacro - -%macro MMX_STORE_DIFF_8P 4 - psraw %1, 6 - movq %2, %4 - punpcklbw %2, %3 - paddsw %1, %2 - packuswb %1, %1 - movq %4, %1 -%endmacro - -;============================================================================= -; Constants -;============================================================================= - -SECTION_RODATA -pw_1: times 8 dw 1 -pw_32: times 8 dw 32 - 
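The common/x86/x86inc.asm layer that the Makefile hunks above start depending on does not appear in this part of the diff, so the following is only a rough sketch of the kind of calling-convention abstraction the commit subject describes; the names ARG1, ARG2 and PROLOGUE are hypothetical, not the actual x264 macros. The point is that a single symbolic argument name expands to an incoming register on x86_64 and to a stack load on x86_32, so a file like dct-a.asm can be written once and assembled for either ABI.

    %ifdef ARCH_X86_64
        %define ARG1 rdi                ; System V AMD64: first two integer
        %define ARG2 rsi                ; arguments arrive in registers
        %macro PROLOGUE 0
        %endmacro
    %else
        %define ARG1 eax                ; x86_32: arguments live on the stack,
        %define ARG2 ecx                ; so copy them into scratch registers
        %macro PROLOGUE 0
            mov ARG1, [esp+4]
            mov ARG2, [esp+8]
        %endmacro
    %endif

    ; a function body then uses only the symbolic names:
    ;     PROLOGUE
    ;     movq mm0, [ARG1]
    ;     movq [ARG2], mm0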
-;============================================================================= -; Code -;============================================================================= - -SECTION .text - -;----------------------------------------------------------------------------- -; void x264_dct4x4dc_mmx( int16_t d[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_dct4x4dc_mmx - movq mm0, [parm1q+ 0] - movq mm1, [parm1q+ 8] - movq mm2, [parm1q+16] - movq mm3, [parm1q+24] - - MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 - - MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - - movq mm6, [pw_1 GLOBAL] - paddw mm0, mm6 - paddw mm2, mm6 - psraw mm0, 1 - movq [parm1q+ 0],mm0 - psraw mm2, 1 - movq [parm1q+ 8],mm2 - paddw mm3, mm6 - paddw mm4, mm6 - psraw mm3, 1 - movq [parm1q+16],mm3 - psraw mm4, 1 - movq [parm1q+24],mm4 - ret - -;----------------------------------------------------------------------------- -; void x264_idct4x4dc_mmx( int16_t d[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_idct4x4dc_mmx - movq mm0, [parm1q+ 0] - movq mm1, [parm1q+ 8] - movq mm2, [parm1q+16] - movq mm3, [parm1q+24] - - MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 - - MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - - movq [parm1q+ 0], mm0 - movq [parm1q+ 8], mm2 - movq [parm1q+16], mm3 - movq [parm1q+24], mm4 - ret - -;----------------------------------------------------------------------------- -; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -cglobal x264_sub4x4_dct_mmx - MMX_ZERO mm7 - - ; Load 4 lines - MMX_LOAD_DIFF_4P mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE] - - MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 - - MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 - MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 - - ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 - MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1 - - MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 - - MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 - MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 - - movq [parm1q+ 0], mm1 - movq [parm1q+ 8], mm2 - movq [parm1q+16], mm3 - movq [parm1q+24], mm0 - ret - -;----------------------------------------------------------------------------- -; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_add4x4_idct_mmx - ; Load dct coeffs - movq mm0, [parm2q+ 0] ; dct - movq 
mm1, [parm2q+ 8] - movq mm2, [parm2q+16] - movq mm3, [parm2q+24] - - MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 - MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 - - ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 - MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3 - - MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 - MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 - - MMX_ZERO mm7 - movq mm6, [pw_32 GLOBAL] - - MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE] - - ret - - - -; ============================================================================= -; 8x8 Transform -; ============================================================================= - -; in: ABCDEFGH -; out: FBCGEDHI -%macro DCT8_1D 10 - MMX_SUMSUB_BA %8, %1 ; %8=s07, %1=d07 - MMX_SUMSUB_BA %7, %2 ; %7=s16, %2=d16 - MMX_SUMSUB_BA %6, %3 ; %6=s25, %3=d25 - MMX_SUMSUB_BA %5, %4 ; %5=s34, %4=d34 - - MMX_SUMSUB_BA %5, %8 ; %5=a0, %8=a2 - MMX_SUMSUB_BA %6, %7 ; %6=a1, %7=a3 - - movdqa %9, %1 - psraw %9, 1 - paddw %9, %1 - paddw %9, %2 - paddw %9, %3 ; %9=a4 - - movdqa %10, %4 - psraw %10, 1 - paddw %10, %4 - paddw %10, %2 - psubw %10, %3 ; %10=a7 - - MMX_SUMSUB_BA %4, %1 - psubw %1, %3 - psubw %4, %2 - psraw %3, 1 - psraw %2, 1 - psubw %1, %3 ; %1=a5 - psubw %4, %2 ; %4=a6 - - MMX_SUMSUB_BA %6, %5 ; %6=b0, %5=b4 - - movdqa %2, %10 - psraw %2, 2 - paddw %2, %9 ; %2=b1 - psraw %9, 2 - psubw %9, %10 ; %9=b7 - - movdqa %3, %7 - psraw %3, 1 - paddw %3, %8 ; %3=b2 - psraw %8, 1 - psubw %8, %7 ; %8=b6 - - movdqa %7, %4 - psraw %7, 2 - paddw %7, %1 ; %7=b3 - psraw %1, 2 - psubw %4, %1 ; %4=b5 -%endmacro - -;----------------------------------------------------------------------------- -; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_sse2 - MMX_ZERO xmm9 - - MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE] - MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE] - - DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9 - SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0 - DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9 - - movdqa [parm1q+0x00], xmm4 - movdqa [parm1q+0x10], xmm3 - movdqa [parm1q+0x20], xmm8 - movdqa [parm1q+0x30], xmm2 - movdqa [parm1q+0x40], xmm0 - movdqa [parm1q+0x50], xmm6 - movdqa [parm1q+0x60], xmm1 - movdqa [parm1q+0x70], xmm7 - - ret - - -; in: ABCDEFGH -; out: IBHDEACG -%macro IDCT8_1D 10 - MMX_SUMSUB_BA %5, %1 ; %5=a0, %1=a2 - movdqa %10, %3 - 
psraw %3, 1 - psubw %3, %7 ; %3=a4 - psraw %7, 1 - paddw %7, %10 ; %7=a6 - - movdqa %9, %2 - psraw %9, 1 - paddw %9, %2 - paddw %9, %4 - paddw %9, %6 ; %9=a7 - - movdqa %10, %6 - psraw %10, 1 - paddw %10, %6 - paddw %10, %8 - psubw %10, %2 ; %10=a5 - - psubw %2, %4 - psubw %6, %4 - paddw %2, %8 - psubw %6, %8 - psraw %4, 1 - psraw %8, 1 - psubw %2, %4 ; %2=a3 - psubw %6, %8 ; %6=a1 - - MMX_SUMSUB_BA %7, %5 ; %7=b0, %5=b6 - MMX_SUMSUB_BA %3, %1 ; %3=b2, %1=b4 - - movdqa %4, %9 - psraw %4, 2 - paddw %4, %6 ; %4=b1 - psraw %6, 2 - psubw %9, %6 ; %9=b7 - - movdqa %8, %10 - psraw %8, 2 - paddw %8, %2 ; %8=b3 - psraw %2, 2 - psubw %2, %10 ; %2=b5 - - MMX_SUMSUB_BA %9, %7 ; %9=c0, %7=c7 - MMX_SUMSUB_BA %2, %3 ; %2=c1, %3=c6 - MMX_SUMSUB_BA %8, %1 ; %8=c2, %1=c5 - MMX_SUMSUB_BA %4, %5 ; %4=c3, %5=c4 -%endmacro - -;----------------------------------------------------------------------------- -; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) -;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2 - movdqa xmm0, [parm2q+0x00] - movdqa xmm1, [parm2q+0x10] - movdqa xmm2, [parm2q+0x20] - movdqa xmm3, [parm2q+0x30] - movdqa xmm4, [parm2q+0x40] - movdqa xmm5, [parm2q+0x50] - movdqa xmm6, [parm2q+0x60] - movdqa xmm7, [parm2q+0x70] - - IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8 - SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5 - paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end - IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2 - - MMX_ZERO xmm15 - MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE] - MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE] - - ret - - -;----------------------------------------------------------------------------- -; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4], -; uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -%macro SUB_NxN_DCT 6 -cglobal %1 - call %2 - add parm1q, %3 - add parm2q, %4-%5*FENC_STRIDE - add parm3q, %4-%5*FDEC_STRIDE - call %2 - add parm1q, %3 - add parm2q, %4*FENC_STRIDE-%6 - add parm3q, %4*FDEC_STRIDE-%6 - call %2 - add parm1q, %3 - add parm2q, %4-%5*FENC_STRIDE - add parm3q, %4-%5*FDEC_STRIDE - jmp %2 -%endmacro - -;----------------------------------------------------------------------------- -; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) -;----------------------------------------------------------------------------- -%macro ADD_NxN_IDCT 6 -cglobal %1 - call %2 - add parm1q, %4-%5*FDEC_STRIDE - add parm2q, %3 - call %2 - add parm1q, %4*FDEC_STRIDE-%6 - add parm2q, %3 - call %2 - add parm1q, %4-%5*FDEC_STRIDE - add parm2q, %3 - jmp %2 -%endmacro - -SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4, 0, 4 -ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4, 0, 4 - -SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 32, 4, 4, 12 -ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 32, 4, 4, 12 - -SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8 -ADD_NxN_IDCT 
x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8 - - -;----------------------------------------------------------------------------- -; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_field_sse2 - punpcklwd xmm0, [parm2q] - punpckhwd xmm1, [parm2q] - punpcklwd xmm2, [parm2q+16] - punpckhwd xmm3, [parm2q+16] - psrad xmm0, 16 - psrad xmm1, 16 - psrad xmm2, 16 - psrad xmm3, 16 - movq [parm1q ], xmm0 - movdqa [parm1q+16], xmm1 - movdqa [parm1q+32], xmm2 - movhlps xmm0, xmm0 - movdqa [parm1q+48], xmm3 - movq [parm1q+12], xmm0 - movd [parm1q+ 8], xmm1 - ret - diff --git a/common/amd64/deblock-a.asm b/common/amd64/deblock-a.asm deleted file mode 100644 index 70aad4c6..00000000 --- a/common/amd64/deblock-a.asm +++ /dev/null @@ -1,475 +0,0 @@ -;***************************************************************************** -;* deblock-a.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* Authors: Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -BITS 64 - -%include "amd64inc.asm" - -SECTION_RODATA -pb_01: times 16 db 0x01 -pb_03: times 16 db 0x03 -pb_a1: times 16 db 0xa1 - -SECTION .text - -; expands to [base],...,[base+7*stride] -%define PASS8ROWS(base, base3, stride, stride3) \ - [base], [base+stride], [base+stride*2], [base3], \ - [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] - -; in: 8 rows of 4 bytes in %1..%8 -; out: 4 rows of 8 bytes in mm0..mm3 -%macro TRANSPOSE4x8_LOAD 8 - movd mm0, %1 - movd mm2, %2 - movd mm1, %3 - movd mm3, %4 - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - movq mm2, mm0 - punpcklwd mm0, mm1 - punpckhwd mm2, mm1 - - movd mm4, %5 - movd mm6, %6 - movd mm5, %7 - movd mm7, %8 - punpcklbw mm4, mm6 - punpcklbw mm5, mm7 - movq mm6, mm4 - punpcklwd mm4, mm5 - punpckhwd mm6, mm5 - - movq mm1, mm0 - movq mm3, mm2 - punpckldq mm0, mm4 - punpckhdq mm1, mm4 - punpckldq mm2, mm6 - punpckhdq mm3, mm6 -%endmacro - -; in: 4 rows of 8 bytes in mm0..mm3 -; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq mm4, mm0 - movq mm5, mm1 - movq mm6, mm2 - punpckhdq mm4, mm4 - punpckhdq mm5, mm5 - punpckhdq mm6, mm6 - - punpcklbw mm0, mm1 - punpcklbw mm2, mm3 - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - movd %1, mm0 - punpckhdq mm0, mm0 - movd %2, mm0 - movd %3, mm1 - punpckhdq mm1, mm1 - movd %4, mm1 - - punpckhdq mm3, mm3 - punpcklbw mm4, mm5 - punpcklbw mm6, mm3 - movq mm5, mm4 - punpcklwd mm4, mm6 - punpckhwd mm5, mm6 - movd %5, mm4 - punpckhdq mm4, mm4 - movd %6, mm4 - movd %7, mm5 - punpckhdq mm5, mm5 - movd %8, mm5 -%endmacro - -%macro SBUTTERFLY 4 - movq %4, %2 - punpckl%1 %2, %3 - punpckh%1 %4, %3 -%endmacro - -; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 -; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] -%macro TRANSPOSE6x8_MEM 9 - movq mm0, %1 - movq mm1, %3 - movq mm2, %5 - movq mm3, %7 - SBUTTERFLY bw, mm0, %2, mm4 - SBUTTERFLY bw, mm1, %4, mm5 - SBUTTERFLY bw, mm2, %6, mm6 - movq [%9+0x10], mm5 - SBUTTERFLY bw, mm3, %8, mm7 - SBUTTERFLY wd, mm0, mm1, mm5 - SBUTTERFLY wd, mm2, mm3, mm1 - punpckhdq mm0, mm2 - movq [%9+0x00], mm0 - SBUTTERFLY wd, mm4, [%9+0x10], mm3 - SBUTTERFLY wd, mm6, mm7, mm2 - SBUTTERFLY dq, mm4, mm6, mm0 - SBUTTERFLY dq, mm5, mm1, mm7 - punpckldq mm3, mm2 - movq [%9+0x10], mm5 - movq [%9+0x20], mm7 - movq [%9+0x30], mm4 - movq [%9+0x40], mm0 - movq [%9+0x50], mm3 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT 6 - mov%1 %6, %3 - mov%1 %5, %2 - psubusb %6, %2 - psubusb %5, %3 - por %5, %6 - psubusb %5, %4 -%endmacro -%macro DIFF_GT_MMX 5 - DIFF_GT q, %1, %2, %3, %4, %5 -%endmacro -%macro DIFF_GT_SSE2 5 - DIFF_GT dqa, %1, %2, %3, %4, %5 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT2 6 - mov%1 %6, %3 - mov%1 %5, %2 - psubusb %6, %2 - psubusb %5, %3 - psubusb %6, %4 - psubusb %5, %4 - pcmpeqb %5, %6 -%endmacro -%macro DIFF_GT2_MMX 5 - DIFF_GT2 q, %1, %2, %3, %4, %5 -%endmacro -%macro DIFF_GT2_SSE2 5 - DIFF_GT2 dqa, %1, %2, %3, %4, %5 -%endmacro - -; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1 -; out: mm5=beta-1, mm7=mask -; clobbers: mm4,mm6 -%macro LOAD_MASK_MMX 2 - movd mm4, %1 - movd mm5, %2 - pshufw mm4, mm4, 0 - pshufw mm5, mm5, 0 - packuswb mm4, mm4 ; 8x alpha-1 - packuswb mm5, mm5 ; 8x beta-1 - DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1 - DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1 - por mm7, mm4 - DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1 - por mm7, mm4 - pxor mm6, mm6 - pcmpeqb mm7, mm6 -%endmacro -%macro LOAD_MASK_SSE2 2 - movd xmm4, %1 - movd xmm5, %2 - pshuflw xmm4, xmm4, 0 - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm4, xmm4 - punpcklqdq xmm5, xmm5 - packuswb xmm4, xmm4 ; 16x alpha-1 - packuswb xmm5, xmm5 ; 16x beta-1 - DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1 - DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1 - por xmm7, xmm4 - DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1 - por xmm7, xmm4 - pxor xmm6, xmm6 - pcmpeqb xmm7, xmm6 -%endmacro - -; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) -; out: mm1=p0' mm2=q0' -; clobbers: mm0,3-6 -%macro DEBLOCK_P0_Q0 2 - mov%1 %2m5, %2m1 - pxor %2m5, %2m2 ; p0^q0 - pand %2m5, [pb_01 GLOBAL] ; (p0^q0)&1 - pcmpeqb %2m4, %2m4 - pxor %2m3, %2m4 - pavgb %2m3, %2m0 ; (p1 - q1 + 256)>>1 - pavgb %2m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 - pxor %2m4, %2m1 - pavgb %2m4, %2m2 ; (q0 - p0 + 256)>>1 - pavgb %2m3, %2m5 - paddusb %2m3, %2m4 ; d+128+33 - mov%1 %2m6, [pb_a1 GLOBAL] - psubusb %2m6, %2m3 - psubusb %2m3, [pb_a1 GLOBAL] - pminub %2m6, %2m7 - pminub %2m3, %2m7 - psubusb %2m1, %2m6 - psubusb %2m2, %2m3 - paddusb %2m1, %2m3 - paddusb %2m2, %2m6 -%endmacro -%macro DEBLOCK_P0_Q0_MMX 0 - DEBLOCK_P0_Q0 q, m -%endmacro -%macro DEBLOCK_P0_Q0_SSE2 0 - DEBLOCK_P0_Q0 dqa, xm -%endmacro - -; in: mm1=p0 mm2=q0 -; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp -; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -; clobbers: q2, tmp, tc0 -%macro LUMA_Q1_SSE2 6 - movdqa %6, xmm1 - pavgb %6, xmm2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) - pxor %6, %3 - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - movdqa %6, %1 - psubusb %6, %5 - paddusb %5, %1 - pmaxub %2, %6 - pminub %2, %5 - movdqa %4, %2 -%endmacro - - -SECTION .text 
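A side note on the saturating-subtract idiom that DIFF_GT and DIFF_GT2 above rely on: psubusb clamps at zero, so subtracting in both directions and OR-ing the two results gives the per-byte absolute difference without widening to 16 bits, and one further saturating subtract of the threshold leaves a nonzero byte exactly where |a-b| > t. A standalone illustration (the labels pix_a, pix_b and thresh are placeholders, not symbols from the patch):

    ; per-byte test |a - b| > t, the core of DIFF_GT
        movq    mm0, [pix_a]        ; a
        movq    mm1, [pix_b]        ; b
        movq    mm2, mm0
        psubusb mm0, mm1            ; max(a-b, 0)
        psubusb mm1, mm2            ; max(b-a, 0)
        por     mm0, mm1            ; |a-b|  (one of the two operands is zero)
        psubusb mm0, [thresh]       ; nonzero byte  <=>  |a-b| > t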
-;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_luma_sse2 - ; rdi = pix - movsxd rsi, esi ; stride - dec edx ; alpha-1 - dec ecx ; beta-1 - movd xmm8, [r8] ; tc0 - mov r8, rdi - sub r8, rsi - sub r8, rsi - sub r8, rsi ; pix-3*stride - - movdqa xmm0, [r8+rsi] ; p1 - movdqa xmm1, [r8+2*rsi] ; p0 - movdqa xmm2, [rdi] ; q0 - movdqa xmm3, [rdi+rsi] ; q1 - LOAD_MASK_SSE2 edx, ecx - - punpcklbw xmm8, xmm8 - punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - pcmpeqb xmm9, xmm9 - pcmpeqb xmm9, xmm8 - pandn xmm9, xmm7 - pand xmm8, xmm9 - - movdqa xmm3, [r8] ; p2 - DIFF_GT2_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1 - pand xmm6, xmm9 - movdqa xmm7, xmm8 - psubb xmm7, xmm6 - pand xmm6, xmm8 - LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4 - - movdqa xmm4, [rdi+2*rsi] ; q2 - DIFF_GT2_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1 - pand xmm6, xmm9 - pand xmm8, xmm6 - psubb xmm7, xmm6 - movdqa xmm3, [rdi+rsi] - LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6 - - DEBLOCK_P0_Q0_SSE2 - movdqa [r8+2*rsi], xmm1 - movdqa [rdi], xmm2 - - ret - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_sse2 - movsxd r10, esi - lea r11, [r10+r10*2] - lea rax, [rdi-4] - lea r9, [rdi-4+r11] - sub rsp, 0x68 - %define pix_tmp rsp - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp - lea rax, [rax+r10*8] - lea r9, [r9 +r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 - - ; vertical filter - ; alpha, beta, tc0 are still in edx, ecx, r8 - ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them - lea rdi, [pix_tmp+0x30] - mov esi, 0x10 - call x264_deblock_v_luma_sse2 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add rax, 2 - add r9, 2 - movq mm0, [pix_tmp+0x18] - movq mm1, [pix_tmp+0x28] - movq mm2, [pix_tmp+0x38] - movq mm3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) - - shl r10, 3 - sub rax, r10 - sub r9, r10 - shr r10, 3 - movq mm0, [pix_tmp+0x10] - movq mm1, [pix_tmp+0x20] - movq mm2, [pix_tmp+0x30] - movq mm3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) - - add rsp, 0x68 - ret - - -%macro CHROMA_V_START 0 - ; rdi = pix - movsxd rsi, esi ; stride - dec edx ; alpha-1 - dec ecx ; beta-1 - mov rax, rdi - sub rax, rsi - sub rax, rsi -%endmacro - -%macro CHROMA_H_START 0 - movsxd rsi, esi - dec edx - dec ecx - sub rdi, 2 - lea r9, [rsi+rsi*2] - mov rax, rdi - add rdi, r9 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext - CHROMA_V_START - - movq mm0, [rax] - movq mm1, [rax+rsi] - movq mm2, [rdi] - movq mm3, [rdi+rsi] - - LOAD_MASK_MMX edx, ecx - movd mm6, [r8] ; tc0 - punpcklbw mm6, mm6 - pand mm7, mm6 - DEBLOCK_P0_Q0_MMX - - movq [rax+rsi], mm1 - movq [rdi], mm2 - ret - - 
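The (x^y)&1 corrections seen in DEBLOCK_P0_Q0 above, and again in CHROMA_INTRA_P0 below, compensate for pavgb always rounding up: pavgb computes (x+y+1)>>1, and subtracting the low bit of x^y (which is set exactly when x+y is odd) turns that into the truncating average (x+y)>>1. A minimal illustration with x in mm0 and y in mm1, separate from the patch itself:

    ; floor average (x+y)>>1 of two byte vectors, using the pb_01 constant above
        movq    mm2, mm0
        pxor    mm2, mm1                ; x ^ y
        pand    mm2, [pb_01 GLOBAL]     ; (x ^ y) & 1  -- the carry pavgb would add
        pavgb   mm0, mm1                ; (x + y + 1) >> 1
        psubusb mm0, mm2                ; (x + y) >> 1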
-;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext - CHROMA_H_START - - TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9) - movq [rsp-8], mm0 - movq [rsp-16], mm3 - - LOAD_MASK_MMX edx, ecx - movd mm6, [r8] ; tc0 - punpcklbw mm6, mm6 - pand mm7, mm6 - DEBLOCK_P0_Q0_MMX - - movq mm0, [rsp-8] - movq mm3, [rsp-16] - TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9) - ret - - -; in: %1=p0 %2=p1 %3=q1 -; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 -%macro CHROMA_INTRA_P0 3 - movq mm4, %1 - pxor mm4, %3 - pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1 - pavgb %1, %3 - psubusb %1, mm4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) -%endmacro - -%macro CHROMA_INTRA_BODY 0 - LOAD_MASK_MMX edx, ecx - movq mm5, mm1 - movq mm6, mm2 - CHROMA_INTRA_P0 mm1, mm0, mm3 - CHROMA_INTRA_P0 mm2, mm3, mm0 - psubb mm1, mm5 - psubb mm2, mm6 - pand mm1, mm7 - pand mm2, mm7 - paddb mm1, mm5 - paddb mm2, mm6 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext - CHROMA_V_START - - movq mm0, [rax] - movq mm1, [rax+rsi] - movq mm2, [rdi] - movq mm3, [rdi+rsi] - - CHROMA_INTRA_BODY - - movq [rax+rsi], mm1 - movq [rdi], mm2 - ret - -;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext - CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9) - CHROMA_INTRA_BODY - TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9) - ret - diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm deleted file mode 100644 index 4a66c81a..00000000 --- a/common/amd64/mc-a.asm +++ /dev/null @@ -1,618 +0,0 @@ -;***************************************************************************** -;* mc.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $ -;* -;* Authors: Min Chen (converted to nasm) -;* Laurent Aimar (init algorithm) -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2004.05.17 portab mc_copy_w4/8/16 (CM) * -;* * -;***************************************************************************** - -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -;============================================================================= -; Constants -;============================================================================= - -SECTION_RODATA - -pw_4: times 4 dw 4 -pw_8: times 4 dw 8 -pw_32: times 4 dw 32 -pw_64: times 4 dw 64 - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -;============================================================================= -; pixel avg -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src, int src_stride, -; int height ); -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_w4_mmxext -.height_loop: - movd mm0, [parm3q] - movd mm1, [parm3q+parm4q] - pavgb mm0, [parm1q] - pavgb mm1, [parm1q+parm2q] - movd [parm1q], mm0 - movd [parm1q+parm2q], mm1 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -cglobal x264_pixel_avg_w8_mmxext -.height_loop: - movq mm0, [parm3q] - movq mm1, [parm3q+parm4q] - pavgb mm0, [parm1q] - pavgb mm1, [parm1q+parm2q] - movq [parm1q], mm0 - movq [parm1q+parm2q], mm1 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -cglobal x264_pixel_avg_w16_mmxext -.height_loop: - movq mm0, [parm3q ] - movq mm1, [parm3q+8] - movq mm2, [parm3q+parm4q ] - movq mm3, [parm3q+parm4q+8] - pavgb mm0, [parm1q ] - pavgb mm1, [parm1q+8] - pavgb mm2, [parm1q+parm2q ] - pavgb mm3, [parm1q+parm2q+8] - movq [parm1q ], mm0 - movq [parm1q+8], mm1 - movq [parm1q+parm2q ], mm2 - movq [parm1q+parm2q+8], mm3 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -cglobal x264_pixel_avg_w16_sse2 -.height_loop: - movdqu xmm0, [parm3q] - movdqu xmm1, [parm3q+parm4q] - pavgb xmm0, [parm1q] - pavgb xmm1, [parm1q+parm2q] - movdqa [parm1q], xmm0 - movdqa [parm1q+parm2q], xmm1 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -%macro AVGH 2 -cglobal x264_pixel_avg_%1x%2_mmxext - mov temp1d, %2 - jmp x264_pixel_avg_w%1_mmxext -%endmacro - -AVGH 16, 16 -AVGH 16, 8 -AVGH 8, 16 -AVGH 8, 8 -AVGH 8, 4 -AVGH 4, 8 -AVGH 4, 4 -AVGH 4, 2 - -;----------------------------------------------------------------------------- -; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, -; uint8_t *src2, int height ); -;----------------------------------------------------------------------------- -%macro AVG2_START 0 -%ifdef WIN64 - mov temp1d, parm6d - mov temp2q, parm5q -%endif - sub parm5q, parm3q -%endmacro - -cglobal x264_pixel_avg2_w4_mmxext 
- AVG2_START - lea r10, [temp2q+parm4q] -.height_loop: - movd mm0, [parm3q] - movd mm1, [parm3q+parm4q] - pavgb mm0, [parm3q+temp2q] - pavgb mm1, [parm3q+r10] - movd [parm1q], mm0 - movd [parm1q+parm2q], mm1 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -cglobal x264_pixel_avg2_w8_mmxext - AVG2_START - lea r10, [temp2q+parm4q] -.height_loop: - movq mm0, [parm3q] - movq mm1, [parm3q+parm4q] - pavgb mm0, [parm3q+temp2q] - pavgb mm1, [parm3q+r10] - movq [parm1q], mm0 - movq [parm1q+parm2q], mm1 - sub temp1d, 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - -cglobal x264_pixel_avg2_w16_mmxext - AVG2_START -.height_loop: - movq mm0, [parm3q] - movq mm1, [parm3q+8] - pavgb mm0, [parm3q+temp2q] - pavgb mm1, [parm3q+temp2q+8] - movq [parm1q], mm0 - movq [parm1q+8], mm1 - add parm3q, parm4q - add parm1q, parm2q - dec temp1d - jg .height_loop - rep ret - -cglobal x264_pixel_avg2_w20_mmxext - AVG2_START -.height_loop: - movq mm0, [parm3q] - movq mm1, [parm3q+8] - movd mm2, [parm3q+16] - pavgb mm0, [parm3q+temp2q] - pavgb mm1, [parm3q+temp2q+8] - pavgb mm2, [parm3q+temp2q+16] - movq [parm1q], mm0 - movq [parm1q+8], mm1 - movd [parm1q+16], mm2 - add parm3q, parm4q - add parm1q, parm2q - dec temp1d - jg .height_loop - rep ret - - - -;============================================================================= -; weighted prediction -;============================================================================= -; implicit bipred only: -; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 - -%macro BIWEIGHT_4P_MMX 2 - movd mm0, %1 - movd mm1, %2 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - pmullw mm0, mm4 - pmullw mm1, mm5 - paddw mm0, mm1 - paddw mm0, mm6 - psraw mm0, 6 - pmaxsw mm0, mm7 - packuswb mm0, mm0 - movd %1, mm0 -%endmacro - -%macro BIWEIGHT_START_MMX 0 -; mov rdi, rdi ; dst -; movsxd rsi, esi ; i_dst -; mov rdx, rdx ; src -; movsxd rcx, ecx ; i_src -; movsxd r8, r8d ; i_weight_dst -; movsxd r9, r9d ; i_height - mov r11d, parm6d ; i_height - - movd mm4, parm5d - pshufw mm4, mm4, 0 ; weight_dst - movq mm5, [pw_64 GLOBAL] - psubw mm5, mm4 ; weight_src - movq mm6, [pw_32 GLOBAL] ; rounding - pxor mm7, mm7 - - ALIGN 4 - .height_loop -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w16_mmxext - BIWEIGHT_START_MMX - - BIWEIGHT_4P_MMX [parm1q ], [parm3q ] - BIWEIGHT_4P_MMX [parm1q+ 4], [parm3q+ 4] - BIWEIGHT_4P_MMX [parm1q+ 8], [parm3q+ 8] - BIWEIGHT_4P_MMX [parm1q+12], [parm3q+12] - - add parm1q, parm2q - add parm3q, parm4q - dec r11d - jg .height_loop - rep ret - -;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w8_mmxext - BIWEIGHT_START_MMX - - BIWEIGHT_4P_MMX [parm1q ], [parm3q ] - BIWEIGHT_4P_MMX [parm1q+4], [parm3q+4] - - add parm1q, parm2q - add parm3q, parm4q - dec r11d - jg .height_loop - rep ret - -;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ) 
-;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_4x4_mmxext - BIWEIGHT_START_MMX - BIWEIGHT_4P_MMX [parm1q ], [parm3q ] - BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ] - BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2] - add parm1q, parm2q - add parm3q, parm4q - BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2] - ret - - - -;============================================================================= -; pixel copy -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w4_mmx - mov eax, parm5d ; i_height - -ALIGN 4 -.height_loop - mov r10d, [parm3q] - mov r11d, [parm3q+parm4q] - mov [parm1q], r10d - mov [parm1q+parm2q], r11d - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - dec eax - dec eax - jg .height_loop - rep ret - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w8_mmx - mov eax, parm5d ; i_height - - lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride - lea r11, [parm2q+parm2q*2] ; 3 * i_dst_stride - -ALIGN 4 -.height_loop - movq mm0, [parm3q] - movq mm1, [parm3q+parm4q] - movq mm2, [parm3q+parm4q*2] - movq mm3, [parm3q+r10] - movq [parm1q], mm0 - movq [parm1q+parm2q], mm1 - movq [parm1q+parm2q*2], mm2 - movq [parm1q+r11], mm3 - lea parm3q, [parm3q+parm4q*4] - lea parm1q, [parm1q+parm2q*4] - - sub eax, byte 4 - jg .height_loop - rep ret - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w16_mmx - mov eax, parm5d ; i_height - - lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride - lea r11, [parm2q+parm2q*2] ; 3 * i_dst_stride - -ALIGN 4 -.height_loop - movq mm0, [parm3q] - movq mm1, [parm3q+8] - movq mm2, [parm3q+parm4q] - movq mm3, [parm3q+parm4q+8] - movq mm4, [parm3q+parm4q*2] - movq mm5, [parm3q+parm4q*2+8] - movq mm6, [parm3q+r10] - movq mm7, [parm3q+r10+8] - movq [parm1q], mm0 - movq [parm1q+8], mm1 - movq [parm1q+parm2q], mm2 - movq [parm1q+parm2q+8], mm3 - movq [parm1q+parm2q*2], mm4 - movq [parm1q+parm2q*2+8], mm5 - movq [parm1q+r11], mm6 - movq [parm1q+r11+8], mm7 - lea parm3q, [parm3q+parm4q*4] - lea parm1q, [parm1q+parm2q*4] - sub eax, byte 4 - jg .height_loop - rep ret - - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w16_sse2 - mov eax, parm5d ; i_height - -ALIGN 4 -.height_loop - movdqu xmm0, [parm3q] - movdqu xmm1, [parm3q+parm4q] - movdqu [parm1q], xmm0 - movdqu [parm1q+parm2q], xmm1 - sub eax, byte 2 - lea parm3q, [parm3q+parm4q*2] - lea parm1q, [parm1q+parm2q*2] - jg .height_loop - rep ret - - - 
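As a numeric check of the implicit-bipred assumption stated above the weighted-prediction macros (log2_denom = 5, offset = 0, weight1 + weight2 = 64), BIWEIGHT_4P_MMX computes dst = (w_dst*dst + (64 - w_dst)*src + 32) >> 6 per pixel. For example, with i_weight_dst = 40, dst = 100 and src = 200, the result is (40*100 + 24*200 + 32) >> 6 = 8832 >> 6 = 138.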
-;============================================================================= -; chroma MC -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, -; int dx, int dy, -; int i_width, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_chroma_mmxext - mov r10d, parm6d - mov r11d, parm5d - sar r10d, 3 - sar r11d, 3 - imul r10d, parm4d - pxor mm3, mm3 - add r10d, r11d - movsxd r10, r10d - mov r11d, parm8d - add parm3q, r10 ; src += (dx>>3) + (dy>>3) * src_stride - and parm5d, 7 ; dx &= 7 - je .mc1d - and parm6d, 7 ; dy &= 7 - je .mc1d - - movd mm0, parm5d - movd mm1, parm6d - - pshufw mm5, mm0, 0 ; mm5 = dx - pshufw mm6, mm1, 0 ; mm6 = dy - - movq mm4, [pw_8 GLOBAL] - movq mm0, mm4 - - psubw mm4, mm5 ; mm4 = 8-dx - psubw mm0, mm6 ; mm0 = 8-dy - - movq mm7, mm5 - pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB - pmullw mm7, mm6 ; mm7 = dx*dy = cD - pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC - pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA - - mov rax, parm3q - mov r10, parm1q - -ALIGN 4 -.height_loop - - movd mm1, [rax+parm4q] - movd mm0, [rax] - punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw mm0, mm3 - pmullw mm1, mm6 ; 2nd line * cC - pmullw mm0, mm4 ; 1st line * cA - - paddw mm0, mm1 ; mm0 <- result - - movd mm2, [rax+1] - movd mm1, [rax+parm4q+1] - punpcklbw mm2, mm3 - punpcklbw mm1, mm3 - - paddw mm0, [pw_32 GLOBAL] - - pmullw mm2, mm5 ; line * cB - pmullw mm1, mm7 ; line * cD - paddw mm0, mm2 - paddw mm0, mm1 - psrlw mm0, 6 - - packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4 - movd [r10], mm0 - - add rax, parm4q - add r10, parm2q ; i_dst_stride - dec r11d - jnz .height_loop - - sub parm7d, 8 - jnz .finish ; width != 8 so assume 4 - - mov r10, parm1q ; dst - mov rax, parm3q ; src - mov r11d, parm8d ; i_height - add r10, 4 - add rax, 4 - jmp .height_loop - -.finish - rep ret - -ALIGN 4 -.mc1d -%define pel_offset temp1q - mov eax, parm5d - or eax, parm6d - and eax, 7 - cmp parm5d, 0 - mov pel_offset, 1 - cmove pel_offset, parm4q ; pel_offset = dx ? 
1 : src_stride - movd mm6, eax - movq mm5, [pw_8 GLOBAL] - pshufw mm6, mm6, 0 - movq mm7, [pw_4 GLOBAL] - psubw mm5, mm6 - - cmp parm7d, 8 - je .height_loop1_w8 - -ALIGN 4 -.height_loop1_w4 - movd mm0, [parm3q+pel_offset] - movd mm1, [parm3q] - punpcklbw mm0, mm3 - punpcklbw mm1, mm3 - pmullw mm0, mm6 - pmullw mm1, mm5 - paddw mm0, mm7 - paddw mm0, mm1 - psrlw mm0, 3 - packuswb mm0, mm3 - movd [parm1q], mm0 - add parm3q, parm4q - add parm1q, parm2q - dec r11d - jnz .height_loop1_w4 - rep ret - -ALIGN 4 -.height_loop1_w8 - movq mm0, [parm3q+pel_offset] - movq mm1, [parm3q] - movq mm2, mm0 - movq mm4, mm1 - punpcklbw mm0, mm3 - punpcklbw mm1, mm3 - punpckhbw mm2, mm3 - punpckhbw mm4, mm3 - pmullw mm0, mm6 - pmullw mm1, mm5 - pmullw mm2, mm6 - pmullw mm4, mm5 - paddw mm0, mm7 - paddw mm2, mm7 - paddw mm0, mm1 - paddw mm2, mm4 - psrlw mm0, 3 - psrlw mm2, 3 - packuswb mm0, mm2 - movq [parm1q], mm0 - add parm3q, parm4q - add parm1q, parm2q - dec r11d - jnz .height_loop1_w8 - rep ret - - - -;----------------------------------------------------------------------------- -; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) -;----------------------------------------------------------------------------- -cglobal x264_prefetch_fenc_mmxext - mov eax, parm5d - and eax, 3 - imul eax, parm2d - lea parm1q, [parm1q+rax*4+64] - prefetcht0 [parm1q] - prefetcht0 [parm1q+parm2q] - lea parm1q, [parm1q+parm2q*2] - prefetcht0 [parm1q] - prefetcht0 [parm1q+parm2q] - - mov eax, parm5d - and eax, 6 - imul eax, parm4d - lea parm3q, [parm3q+rax+64] - prefetcht0 [parm3q] - prefetcht0 [parm3q+parm4q] - ret - -;----------------------------------------------------------------------------- -; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) -;----------------------------------------------------------------------------- -cglobal x264_prefetch_ref_mmxext - dec parm3d - and parm3d, parm2d - lea parm1q, [parm1q+parm3q*8+64] - lea rax, [parm2q*3] - prefetcht0 [parm1q] - prefetcht0 [parm1q+parm2q] - prefetcht0 [parm1q+parm2q*2] - prefetcht0 [parm1q+rax] - lea parm1q, [parm1q+parm2q*4] - prefetcht0 [parm1q] - prefetcht0 [parm1q+parm2q] - prefetcht0 [parm1q+parm2q*2] - prefetcht0 [parm1q+rax] - ret diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm deleted file mode 100644 index 5851eb3a..00000000 --- a/common/amd64/mc-a2.asm +++ /dev/null @@ -1,320 +0,0 @@ -;***************************************************************************** -;* mc-a2.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -;============================================================================= -; Read only data -;============================================================================= - -SECTION_RODATA - -pw_1: times 4 dw 1 -pw_16: times 4 dw 16 -pw_32: times 4 dw 32 - -;============================================================================= -; Macros -;============================================================================= - -%macro LOAD_ADD 3 - movd %1, %2 - movd mm7, %3 - punpcklbw %1, mm0 - punpcklbw mm7, mm0 - paddw %1, mm7 -%endmacro - -%macro FILT_V 0 - psubw mm1, mm2 ; a-b - psubw mm4, mm5 - psubw mm2, mm3 ; b-c - psubw mm5, mm6 - psllw mm2, 2 - psllw mm5, 2 - psubw mm1, mm2 ; a-5*b+4*c - psubw mm4, mm5 - psllw mm3, 4 - psllw mm6, 4 - paddw mm1, mm3 ; a-5*b+20*c - paddw mm4, mm6 -%endmacro - -%macro FILT_H 0 - psubw mm1, mm2 ; a-b - psubw mm4, mm5 - psraw mm1, 2 ; (a-b)/4 - psraw mm4, 2 - psubw mm1, mm2 ; (a-b)/4-b - psubw mm4, mm5 - paddw mm1, mm3 ; (a-b)/4-b+c - paddw mm4, mm6 - psraw mm1, 2 ; ((a-b)/4-b+c)/4 - psraw mm4, 2 - paddw mm1, mm3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - paddw mm4, mm6 -%endmacro - -%macro FILT_PACK 1 - paddw mm1, mm7 - paddw mm4, mm7 - psraw mm1, %1 - psraw mm4, %1 - packuswb mm1, mm4 -%endmacro - - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -;----------------------------------------------------------------------------- -; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, -; int i_stride, int i_width, int i_height ); -;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_mmxext - -%ifdef WIN64 - push rdi - pushreg rdi - push rsi - pushreg rsi -%endif - push rbp - pushreg rbp - push rbx - pushreg rbx - mov rbp, rsp - setframe rbp, 0 - endprolog - -%ifdef WIN64 - mov rdi, parm1q - mov rsi, parm2q - mov rdx, parm3q - mov rcx, parm4q - movsxd r8, dword [rbp+72] - movsxd r9, dword [rbp+80] - mov ebx, dword [rbp+88] -%else - mov ebx, dword [rbp+24] -%endif - %define dsth rdi - %define dstv rsi - %define dstc rdx - %define src rcx - %define stride r8 - %define width r9 - %define height ebx - %define stride3 r10 - %define stride5 r11 - %define x rax - %define tbuffer rsp + 8 - - lea stride3, [stride*3] - lea stride5, [stride*5] - sub src, stride - sub src, stride - - lea rax, [stride*2 + 24] - sub rsp, rax - - pxor mm0, mm0 - -.loopy: - - xor x, x -ALIGN 16 -.vertical_filter: - - prefetcht0 [src + stride5 + 32] - - LOAD_ADD mm1, [src ], [src + stride5 ] ; a0 - LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0 - LOAD_ADD mm3, [src + stride*2 ], [src + stride3 ] ; c0 - LOAD_ADD mm4, [src + 4], [src + stride5 + 4] ; a1 - LOAD_ADD mm5, [src + stride + 4], [src + stride*4 + 4] ; b1 - LOAD_ADD mm6, [src + stride*2 + 4], [src + stride3 + 4] ; c1 - - FILT_V - - movq mm7, [pw_16 GLOBAL] - movq [tbuffer + x*2], mm1 - movq [tbuffer + x*2 + 8], mm4 - paddw mm1, mm7 - paddw mm4, mm7 - psraw mm1, 5 - psraw mm4, 5 - packuswb mm1, mm4 - movntq [dstv + x], mm1 - - add x, 8 - add src, 8 - cmp x, width - jle 
.vertical_filter - - pshufw mm2, [tbuffer], 0 - movq [tbuffer - 8], mm2 ; pad left - ; no need to pad right, since vertical_filter already did 4 extra pixels - - sub src, x - xor x, x - movq mm7, [pw_32 GLOBAL] -.center_filter: - - movq mm1, [tbuffer + x*2 - 4 ] - movq mm2, [tbuffer + x*2 - 2 ] - movq mm3, [tbuffer + x*2 ] - movq mm4, [tbuffer + x*2 + 4 ] - movq mm5, [tbuffer + x*2 + 6 ] - paddw mm3, [tbuffer + x*2 + 2 ] ; c0 - paddw mm2, mm4 ; b0 - paddw mm1, mm5 ; a0 - movq mm6, [tbuffer + x*2 + 8 ] - paddw mm4, [tbuffer + x*2 + 14] ; a1 - paddw mm5, [tbuffer + x*2 + 12] ; b1 - paddw mm6, [tbuffer + x*2 + 10] ; c1 - - FILT_H - FILT_PACK 6 - movntq [dstc + x], mm1 - - add x, 8 - cmp x, width - jl .center_filter - - lea src, [src + stride*2] - xor x, x -.horizontal_filter: - - movd mm1, [src + x - 2] - movd mm2, [src + x - 1] - movd mm3, [src + x ] - movd mm6, [src + x + 1] - movd mm4, [src + x + 2] - movd mm5, [src + x + 3] - punpcklbw mm1, mm0 - punpcklbw mm2, mm0 - punpcklbw mm3, mm0 - punpcklbw mm6, mm0 - punpcklbw mm4, mm0 - punpcklbw mm5, mm0 - paddw mm3, mm6 ; c0 - paddw mm2, mm4 ; b0 - paddw mm1, mm5 ; a0 - movd mm7, [src + x + 7] - movd mm6, [src + x + 6] - punpcklbw mm7, mm0 - punpcklbw mm6, mm0 - paddw mm4, mm7 ; c1 - paddw mm5, mm6 ; b1 - movd mm7, [src + x + 5] - movd mm6, [src + x + 4] - punpcklbw mm7, mm0 - punpcklbw mm6, mm0 - paddw mm6, mm7 ; a1 - - movq mm7, [pw_1 GLOBAL] - FILT_H - FILT_PACK 1 - movntq [dsth + x], mm1 - - add x, 8 - cmp x, width - jl .horizontal_filter - - sub src, stride - add dsth, stride - add dstv, stride - add dstc, stride - dec height - jg .loopy - - mov rsp, rbp - pop rbx - pop rbp -%ifdef WIN64 - pop rsi - pop rdi -%endif - ret - - - -;----------------------------------------------------------------------------- -; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, -; uint8_t *src, int i_src, int w, int h) -;----------------------------------------------------------------------------- -cglobal x264_plane_copy_mmxext - movsxd parm2q, parm2d - movsxd parm4q, parm4d - add parm5d, 3 - and parm5d, ~3 - sub parm2q, parm5q - sub parm4q, parm5q - ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx - xchg rsi, rdx - mov rax, parm4q -.loopy: - mov ecx, parm5d - sub ecx, 64 - jl .endx -.loopx: - prefetchnta [rsi+256] - movq mm0, [rsi ] - movq mm1, [rsi+ 8] - movq mm2, [rsi+16] - movq mm3, [rsi+24] - movq mm4, [rsi+32] - movq mm5, [rsi+40] - movq mm6, [rsi+48] - movq mm7, [rsi+56] - movntq [rdi ], mm0 - movntq [rdi+ 8], mm1 - movntq [rdi+16], mm2 - movntq [rdi+24], mm3 - movntq [rdi+32], mm4 - movntq [rdi+40], mm5 - movntq [rdi+48], mm6 - movntq [rdi+56], mm7 - add rsi, 64 - add rdi, 64 - sub ecx, 64 - jge .loopx -.endx: - prefetchnta [rsi+256] - add ecx, 64 - shr ecx, 2 - rep movsd - add rdi, rdx - add rsi, rax - sub parm6d, 1 - jg .loopy - emms - ret - diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm deleted file mode 100644 index 0cebcecf..00000000 --- a/common/amd64/pixel-a.asm +++ /dev/null @@ -1,1301 +0,0 @@ -;***************************************************************************** -;* pixel.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ -;* -;* Authors: Laurent Aimar -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the 
License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. -;***************************************************************************** - -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -; sad - -%macro SAD_INC_2x16P 0 - movq mm1, [parm1q] - movq mm2, [parm1q+8] - movq mm3, [parm1q+parm2q] - movq mm4, [parm1q+parm2q+8] - psadbw mm1, [parm3q] - psadbw mm2, [parm3q+8] - psadbw mm3, [parm3q+parm4q] - psadbw mm4, [parm3q+parm4q+8] - lea parm1q, [parm1q+2*parm2q] - paddw mm1, mm2 - paddw mm3, mm4 - lea parm3q, [parm3q+2*parm4q] - paddw mm0, mm1 - paddw mm0, mm3 -%endmacro - -%macro SAD_INC_2x8P 0 - movq mm1, [parm1q] - movq mm2, [parm1q+parm2q] - psadbw mm1, [parm3q] - psadbw mm2, [parm3q+parm4q] - lea parm1q, [parm1q+2*parm2q] - paddw mm0, mm1 - paddw mm0, mm2 - lea parm3q, [parm3q+2*parm4q] -%endmacro - -%macro SAD_INC_2x4P 0 - movd mm1, [parm1q] - movd mm2, [parm3q] - punpckldq mm1, [parm1q+parm2q] - punpckldq mm2, [parm3q+parm4q] - psadbw mm1, mm2 - paddw mm0, mm1 - lea parm1q, [parm1q+2*parm2q] - lea parm3q, [parm3q+2*parm4q] -%endmacro - -; sad x3 / x4 - -%macro SAD_X3_START_1x8P 0 - movq mm3, [parm1q] - movq mm0, [parm2q] - movq mm1, [parm3q] - movq mm2, [parm4q] - psadbw mm0, mm3 - psadbw mm1, mm3 - psadbw mm2, mm3 -%endmacro - -%macro SAD_X3_1x8P 2 - movq mm3, [parm1q+%1] - movq mm4, [parm2q+%2] - movq mm5, [parm3q+%2] - movq mm6, [parm4q+%2] - psadbw mm4, mm3 - psadbw mm5, mm3 - psadbw mm6, mm3 - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 -%endmacro - -%macro SAD_X3_START_2x4P 3 - movd mm3, [parm1q] - movd %1, [parm2q] - movd %2, [parm3q] - movd %3, [parm4q] - punpckldq mm3, [parm1q+FENC_STRIDE] - punpckldq %1, [parm2q+parm5q] - punpckldq %2, [parm3q+parm5q] - punpckldq %3, [parm4q+parm5q] - psadbw %1, mm3 - psadbw %2, mm3 - psadbw %3, mm3 -%endmacro - -%macro SAD_X3_2x16P 1 -%if %1 - SAD_X3_START_1x8P -%else - SAD_X3_1x8P 0, 0 -%endif - SAD_X3_1x8P 8, 8 - SAD_X3_1x8P FENC_STRIDE, parm5q - SAD_X3_1x8P FENC_STRIDE+8, parm5q+8 - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm5q] - lea parm3q, [parm3q+2*parm5q] - lea parm4q, [parm4q+2*parm5q] -%endmacro - -%macro SAD_X3_2x8P 1 -%if %1 - SAD_X3_START_1x8P -%else - SAD_X3_1x8P 0, 0 -%endif - SAD_X3_1x8P FENC_STRIDE, parm5q - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm5q] - lea parm3q, [parm3q+2*parm5q] - lea parm4q, [parm4q+2*parm5q] -%endmacro - -%macro SAD_X3_2x4P 1 -%if %1 - SAD_X3_START_2x4P mm0, mm1, mm2 -%else - SAD_X3_START_2x4P mm4, mm5, mm6 - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 -%endif - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm5q] - lea parm3q, [parm3q+2*parm5q] - lea parm4q, [parm4q+2*parm5q] -%endmacro - -%macro SAD_X4_START_1x8P 0 - movq mm7, [parm1q] - movq mm0, [parm2q] - movq mm1, [parm3q] - movq mm2, [parm4q] - movq mm3, [parm5q] - psadbw mm0, mm7 - psadbw mm1, mm7 - psadbw mm2, mm7 - psadbw mm3, mm7 -%endmacro - -%macro 
SAD_X4_1x8P 2 - movq mm7, [parm1q+%1] - movq mm4, [parm2q+%2] - movq mm5, [parm3q+%2] - movq mm6, [parm4q+%2] - psadbw mm4, mm7 - psadbw mm5, mm7 - psadbw mm6, mm7 - psadbw mm7, [parm5q+%2] - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 - paddw mm3, mm7 -%endmacro - -%macro SAD_X4_START_2x4P 0 - movd mm7, [parm1q] - movd mm0, [parm2q] - movd mm1, [parm3q] - movd mm2, [parm4q] - movd mm3, [parm5q] - punpckldq mm7, [parm1q+FENC_STRIDE] - punpckldq mm0, [parm2q+parm6q] - punpckldq mm1, [parm3q+parm6q] - punpckldq mm2, [parm4q+parm6q] - punpckldq mm3, [parm5q+parm6q] - psadbw mm0, mm7 - psadbw mm1, mm7 - psadbw mm2, mm7 - psadbw mm3, mm7 -%endmacro - -%macro SAD_X4_INC_2x4P 0 - movd mm7, [parm1q] - movd mm4, [parm2q] - movd mm5, [parm3q] - punpckldq mm7, [parm1q+FENC_STRIDE] - punpckldq mm4, [parm2q+parm6q] - punpckldq mm5, [parm3q+parm6q] - psadbw mm4, mm7 - psadbw mm5, mm7 - paddw mm0, mm4 - paddw mm1, mm5 - movd mm4, [parm4q] - movd mm5, [parm5q] - punpckldq mm4, [parm4q+parm6q] - punpckldq mm5, [parm5q+parm6q] - psadbw mm4, mm7 - psadbw mm5, mm7 - paddw mm2, mm4 - paddw mm3, mm5 -%endmacro - -%macro SAD_X4_2x16P 1 -%if %1 - SAD_X4_START_1x8P -%else - SAD_X4_1x8P 0, 0 -%endif - SAD_X4_1x8P 8, 8 - SAD_X4_1x8P FENC_STRIDE, parm6q - SAD_X4_1x8P FENC_STRIDE+8, parm6q+8 - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm6q] - lea parm3q, [parm3q+2*parm6q] - lea parm4q, [parm4q+2*parm6q] - lea parm5q, [parm5q+2*parm6q] -%endmacro - -%macro SAD_X4_2x8P 1 -%if %1 - SAD_X4_START_1x8P -%else - SAD_X4_1x8P 0, 0 -%endif - SAD_X4_1x8P FENC_STRIDE, parm6q - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm6q] - lea parm3q, [parm3q+2*parm6q] - lea parm4q, [parm4q+2*parm6q] - lea parm5q, [parm5q+2*parm6q] -%endmacro - -%macro SAD_X4_2x4P 1 -%if %1 - SAD_X4_START_2x4P -%else - SAD_X4_INC_2x4P -%endif - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm6q] - lea parm3q, [parm3q+2*parm6q] - lea parm4q, [parm4q+2*parm6q] - lea parm5q, [parm5q+2*parm6q] -%endmacro - -%macro SAD_X3_END 0 - movd [parm6q+0], mm0 - movd [parm6q+4], mm1 - movd [parm6q+8], mm2 - ret -%endmacro - -%macro SAD_X4_END 0 - mov rax, parm7q - movd [rax+0], mm0 - movd [rax+4], mm1 - movd [rax+8], mm2 - movd [rax+12], mm3 - ret -%endmacro - -; ssd - -%macro SSD_INC_1x16P 0 - movq mm1, [parm1q] - movq mm2, [parm3q] - movq mm3, [parm1q+8] - movq mm4, [parm3q+8] - - movq mm5, mm2 - movq mm6, mm4 - psubusb mm2, mm1 - psubusb mm4, mm3 - psubusb mm1, mm5 - psubusb mm3, mm6 - por mm1, mm2 - por mm3, mm4 - - movq mm2, mm1 - movq mm4, mm3 - punpcklbw mm1, mm7 - punpcklbw mm3, mm7 - punpckhbw mm2, mm7 - punpckhbw mm4, mm7 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - pmaddwd mm4, mm4 - - add parm1q, parm2q - add parm3q, parm4q - paddd mm0, mm1 - paddd mm0, mm2 - paddd mm0, mm3 - paddd mm0, mm4 -%endmacro - -%macro SSD_INC_1x8P 0 - movq mm1, [parm1q] - movq mm2, [parm3q] - - movq mm5, mm2 - psubusb mm2, mm1 - psubusb mm1, mm5 - por mm1, mm2 ; mm1 = 8bit abs diff - - movq mm2, mm1 - punpcklbw mm1, mm7 - punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - - add parm1q, parm2q - add parm3q, parm4q - paddd mm0, mm1 - paddd mm0, mm2 -%endmacro - -%macro SSD_INC_1x4P 0 - movd mm1, [parm1q] - movd mm2, [parm3q] - - movq mm5, mm2 - psubusb mm2, mm1 - psubusb mm1, mm5 - por mm1, mm2 - punpcklbw mm1, mm7 - pmaddwd mm1, mm1 - - add parm1q, parm2q - add parm3q, parm4q - paddd mm0, mm1 -%endmacro - -; satd - -%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2] - movd %1, %3 - movd %2, %4 - punpcklbw %1, 
%2 - punpcklbw %2, %2 - psubw %1, %2 -%endmacro - -%macro HADAMARD4_SUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro HADAMARD4x4 4 - HADAMARD4_SUB_BADC %1, %2, %3, %4 - HADAMARD4_SUB_BADC %1, %3, %2, %4 -%endmacro - -%macro SBUTTERFLYwd 3 - movq %3, %1 - punpcklwd %1, %2 - punpckhwd %3, %2 -%endmacro - -%macro SBUTTERFLYdq 3 - movq %3, %1 - punpckldq %1, %2 - punpckhdq %3, %2 -%endmacro - -%macro TRANSPOSE4x4 5 ; abcd-t -> adtc - SBUTTERFLYwd %1, %2, %5 - SBUTTERFLYwd %3, %4, %2 - SBUTTERFLYdq %1, %3, %4 - SBUTTERFLYdq %5, %2, %3 -%endmacro - -%macro MMX_ABS 2 ; mma, tmp - pxor %2, %2 - psubw %2, %1 - pmaxsw %1, %2 -%endmacro - -%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1 - pxor %3, %3 - pxor %4, %4 - psubw %3, %1 - psubw %4, %2 - pmaxsw %1, %3 - pmaxsw %2, %4 -%endmacro - -%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block) - HADAMARD4x4 mm4, mm5, mm6, mm7 - TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1 - HADAMARD4x4 mm4, mm7, %1, mm6 - MMX_ABS_TWO mm4, mm7, mm3, mm5 - MMX_ABS_TWO %1, mm6, mm3, mm5 - paddw %1, mm4 - paddw mm6, mm7 - pavgw %1, mm6 -%endmacro - -; in: r10=3*stride1, r11=3*stride2 -; in: %2 = horizontal offset -; in: %3 = whether we need to increment pix1 and pix2 -; clobber: mm3..mm7 -; out: %1 = satd -%macro LOAD_DIFF_HADAMARD_SUM 3 - LOAD_DIFF_4P mm4, mm3, [parm1q+%2], [parm3q+%2] - LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%2], [parm3q+parm4q+%2] - LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%2], [parm3q+2*parm4q+%2] - LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%2], [parm3q+r11+%2] -%if %3 - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] -%endif - HADAMARD4x4_SUM %1 -%endmacro - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -%macro SAD_START 0 - pxor mm0, mm0 -%endmacro - -%macro SAD_END 0 - movd eax, mm0 - ret -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SAD 2 -cglobal x264_pixel_sad_%1x%2_mmxext - SAD_START -%rep %2/2 - SAD_INC_2x%1P -%endrep - SAD_END -%endmacro - -SAD 16, 16 -SAD 16, 8 -SAD 8, 16 -SAD 8, 8 -SAD 8, 4 -SAD 4, 8 -SAD 4, 4 - -;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) -;----------------------------------------------------------------------------- -%macro SAD_X 3 -cglobal x264_pixel_sad_x%1_%2x%3_mmxext - SAD_X%1_2x%2P 1 -%rep %3/2-1 - SAD_X%1_2x%2P 0 -%endrep - SAD_X%1_END -%endmacro - -SAD_X 3, 16, 16 -SAD_X 3, 16, 8 -SAD_X 3, 8, 16 -SAD_X 3, 8, 8 -SAD_X 3, 8, 4 -SAD_X 3, 4, 8 -SAD_X 3, 4, 4 -SAD_X 4, 16, 16 -SAD_X 4, 16, 8 -SAD_X 4, 8, 16 -SAD_X 4, 8, 8 -SAD_X 4, 8, 4 -SAD_X 4, 4, 8 -SAD_X 4, 4, 4 - - -%macro SSD_START 0 - pxor mm7, mm7 ; zero - pxor mm0, mm0 ; mm0 holds the sum -%endmacro - -%macro SSD_END 0 - movq mm1, mm0 - psrlq mm1, 32 - paddd mm0, mm1 - movd eax, mm0 - ret -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SSD 2 -cglobal x264_pixel_ssd_%1x%2_mmx - SSD_START -%rep 
%2 - SSD_INC_1x%1P -%endrep - SSD_END -%endmacro - -SSD 16, 16 -SSD 16, 8 -SSD 8, 16 -SSD 8, 8 -SSD 8, 4 -SSD 4, 8 -SSD 4, 4 - - - -%macro SATD_START 0 - lea r10, [3*parm2q] ; 3*stride1 - lea r11, [3*parm4q] ; 3*stride2 -%endmacro - -%macro SATD_END 0 - pshufw mm1, mm0, 01001110b - paddw mm0, mm1 - pshufw mm1, mm0, 10110001b - paddw mm0, mm1 - movd eax, mm0 - and eax, 0xffff - ret -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_4x4_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_4x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x4_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x16_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - paddw mm0, mm2 - - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - paddw mm1, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) 
-;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x16_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - - LOAD_DIFF_HADAMARD_SUM mm2, 0, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 0 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 0 - paddw mm0, mm2 - paddw mm0, mm1 - - pxor mm3, mm3 - pshufw mm1, mm0, 01001110b - paddw mm0, mm1 - punpcklwd mm0, mm3 - pshufw mm1, mm0, 01001110b - paddd mm0, mm1 - movd eax, mm0 - ret - - -; in: parm1 = fenc -; out: mm0..mm3 = hadamard coefs -ALIGN 16 -load_hadamard: - pxor mm7, mm7 - movd mm0, [parm1q+0*FENC_STRIDE] - movd mm4, [parm1q+1*FENC_STRIDE] - movd mm3, [parm1q+2*FENC_STRIDE] - movd mm1, [parm1q+3*FENC_STRIDE] - punpcklbw mm0, mm7 - punpcklbw mm4, mm7 - punpcklbw mm3, mm7 - punpcklbw mm1, mm7 - HADAMARD4x4 mm0, mm4, mm3, mm1 - TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2 - HADAMARD4x4 mm0, mm1, mm2, mm3 - ret - -%macro SCALAR_SUMSUB 4 - add %1, %2 - add %3, %4 - add %2, %2 - add %4, %4 - sub %2, %1 - sub %4, %3 -%endmacro - -%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op - pxor %7, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b - paddw %1, %4 - paddw %2, %5 - paddw %3, %6 - punpcklwd %1, %7 - punpcklwd %2, %7 - punpcklwd %3, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b - %8 %1, %4 - %8 %2, %5 - %8 %3, %6 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_4x4_mmxext -%define top_1d rsp-8 ; +8 -%define left_1d rsp-16 ; +8 - call load_hadamard - - movzx r8d, byte [parm2q-1+0*FDEC_STRIDE] - movzx r9d, byte [parm2q-1+1*FDEC_STRIDE] - movzx r10d, byte [parm2q-1+2*FDEC_STRIDE] - movzx r11d, byte [parm2q-1+3*FDEC_STRIDE] - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 1x4 hadamard - mov [left_1d+0], r8w - mov [left_1d+2], r9w - mov [left_1d+4], r10w - mov [left_1d+6], r11w - mov eax, r8d ; dc - - movzx r8d, byte [parm2q-FDEC_STRIDE+0] - movzx r9d, byte [parm2q-FDEC_STRIDE+1] - movzx r10d, byte [parm2q-FDEC_STRIDE+2] - movzx r11d, byte [parm2q-FDEC_STRIDE+3] - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 4x1 hadamard - lea rax, [rax + r8 + 4] ; dc - mov [top_1d+0], r8w - mov [top_1d+2], r9w - mov [top_1d+4], r10w - mov [top_1d+6], r11w - and eax, -8 - shl eax, 1 - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d] - movd mm5, eax - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 
1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - paddw mm4, mm7 - paddw mm5, mm7 - movq mm1, mm5 - psrlq mm1, 16 ; 4x3 sum - paddw mm0, mm1 - - SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw - movd [parm3q+0], mm0 ; i4x4_v satd - movd [parm3q+4], mm4 ; i4x4_h satd - movd [parm3q+8], mm5 ; i4x4_dc satd - ret - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_16x16_mmxext - sub rsp, 96 -%define sums rsp+64 ; size 24 -%define top_1d rsp+32 ; size 32 -%define left_1d rsp ; size 32 - - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 - - ; 1D hadamards - xor ecx, ecx - mov eax, 12 -.loop_edge: - ; left - shl eax, 5 ; log(FDEC_STRIDE) - movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE] - movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE] - movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE] - movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE] - shr eax, 5 - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d - add ecx, r8d - mov [left_1d+2*rax+0], r8w - mov [left_1d+2*rax+2], r9w - mov [left_1d+2*rax+4], r10w - mov [left_1d+2*rax+6], r11w - - ; top - movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0] - movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1] - movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2] - movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3] - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d - add ecx, r8d - mov [top_1d+2*rax+0], r8w - mov [top_1d+2*rax+2], r9w - mov [top_1d+2*rax+4], r10w - mov [top_1d+2*rax+6], r11w - sub eax, 4 - jge .loop_edge - - ; dc - shr ecx, 1 - add ecx, 8 - and ecx, -16 - - ; 2D hadamards - xor eax, eax -.loop_y: - xor esi, esi -.loop_x: - call load_hadamard - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d+8*rax] - movd mm5, ecx - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d+8*rsi] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - pavgw mm4, mm7 - pavgw mm5, mm7 - paddw mm0, [sums+0] ; i4x4_v satd - paddw mm4, [sums+8] ; i4x4_h satd - paddw mm5, [sums+16] ; i4x4_dc satd - movq [sums+0], mm0 - movq [sums+8], mm4 - movq [sums+16], mm5 - - add parm1q, 4 - inc esi - cmp esi, 4 - jl .loop_x - add parm1q, 4*FENC_STRIDE-16 - inc eax - cmp eax, 4 - jl .loop_y - -; horizontal sum - movq mm2, [sums+16] - movq mm1, [sums+8] - movq mm0, [sums+0] - movq mm7, mm2 - SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd - psrld mm0, 1 - pslld mm7, 16 - psrld mm7, 16 - paddd mm0, mm2 - psubd mm0, mm7 - movd [parm3q+8], mm2 ; i16x16_dc satd - movd [parm3q+4], mm1 ; i16x16_h satd - movd [parm3q+0], mm0 ; i16x16_v satd - add rsp, 96 - ret - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_8x8c_mmxext - sub rsp, 64 -%define sums rsp+32 ; size 24 -%define top_1d rsp+16 ; size 16 -%define left_1d rsp ; size 16 - - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 - - ; 1D hadamards - mov eax, 4 -.loop_edge: - 
; left - shl eax, 5 ; log(FDEC_STRIDE) - movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE] - movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE] - movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE] - movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE] - shr eax, 5 - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d - mov [left_1d+2*rax+0], r8w - mov [left_1d+2*rax+2], r9w - mov [left_1d+2*rax+4], r10w - mov [left_1d+2*rax+6], r11w - - ; top - movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0] - movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1] - movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2] - movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3] - SCALAR_SUMSUB r8d, r9d, r10d, r11d - SCALAR_SUMSUB r8d, r10d, r9d, r11d - mov [top_1d+2*rax+0], r8w - mov [top_1d+2*rax+2], r9w - mov [top_1d+2*rax+4], r10w - mov [top_1d+2*rax+6], r11w - sub eax, 4 - jge .loop_edge - - ; dc - movzx r8d, word [left_1d+0] - movzx r9d, word [top_1d+0] - movzx r10d, word [left_1d+8] - movzx r11d, word [top_1d+8] - add r8d, r9d - lea r9, [r10 + r11] - lea r8, [2*r8 + 8] - lea r9, [2*r9 + 8] - lea r10, [4*r10 + 8] - lea r11, [4*r11 + 8] - and r8d, -16 ; tl - and r9d, -16 ; br - and r10d, -16 ; bl - and r11d, -16 ; tr - shl r9, 16 - mov r9w, r10w - shl r9, 16 - mov r9w, r11w - shl r9, 16 - mov r9w, r8w - - ; 2D hadamards - xor eax, eax -.loop_y: - xor esi, esi -.loop_x: - call load_hadamard - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d+8*rax] - movzx ecx, r9w - shr r9, 16 - movd mm5, ecx - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d+8*rsi] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - pavgw mm4, mm7 - pavgw mm5, mm7 - paddw mm0, [sums+16] ; i4x4_v satd - paddw mm4, [sums+8] ; i4x4_h satd - paddw mm5, [sums+0] ; i4x4_dc satd - movq [sums+16], mm0 - movq [sums+8], mm4 - movq [sums+0], mm5 - - add parm1q, 4 - inc esi - cmp esi, 2 - jl .loop_x - add parm1q, 4*FENC_STRIDE-8 - inc eax - cmp eax, 2 - jl .loop_y - -; horizontal sum - movq mm0, [sums+0] - movq mm1, [sums+8] - movq mm2, [sums+16] - movq mm7, mm0 - psrlq mm7, 15 - paddw mm2, mm7 - SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd - psrld mm2, 1 - movd [parm3q+0], mm0 ; i8x8c_dc satd - movd [parm3q+4], mm1 ; i8x8c_h satd - movd [parm3q+8], mm2 ; i8x8c_v satd - add rsp, 64 - ret - - - -; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) -; { -; int nmv=0, i, j; -; *(uint32_t*)(masks+width) = 0; -; for( i=0; i -;* Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -SECTION_RODATA - -pw_1: times 8 dw 1 -ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 -ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 -mask_ff: times 16 db 0xff - times 16 db 0 -sw_64: dq 64 - -SECTION .text - -%macro HADDD 2 ; sum junk - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%endmacro - -%macro HADDW 2 - pmaddwd %1, [pw_1 GLOBAL] - HADDD %1, %2 -%endmacro - -%macro SAD_END_SSE2 0 - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - movd eax, xmm0 - ret -%endmacro - -%macro SAD_W16 1 -;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_%1 - movdqu xmm0, [rdx] - movdqu xmm1, [rdx+rcx] - lea rdx, [rdx+2*rcx] - movdqu xmm2, [rdx] - movdqu xmm3, [rdx+rcx] - lea rdx, [rdx+2*rcx] - psadbw xmm0, [rdi] - psadbw xmm1, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm4, [rdx] - paddw xmm0, xmm1 - psadbw xmm2, [rdi] - psadbw xmm3, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm5, [rdx+rcx] - lea rdx, [rdx+2*rcx] - paddw xmm2, xmm3 - movdqu xmm6, [rdx] - movdqu xmm7, [rdx+rcx] - lea rdx, [rdx+2*rcx] - paddw xmm0, xmm2 - psadbw xmm4, [rdi] - psadbw xmm5, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm1, [rdx] - paddw xmm4, xmm5 - psadbw xmm6, [rdi] - psadbw xmm7, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm2, [rdx+rcx] - lea rdx, [rdx+2*rcx] - paddw xmm6, xmm7 - movdqu xmm3, [rdx] - paddw xmm0, xmm4 - movdqu xmm4, [rdx+rcx] - lea rdx, [rdx+2*rcx] - paddw xmm0, xmm6 - psadbw xmm1, [rdi] - psadbw xmm2, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm5, [rdx] - paddw xmm1, xmm2 - psadbw xmm3, [rdi] - psadbw xmm4, [rdi+rsi] - lea rdi, [rdi+2*rsi] - movdqu xmm6, [rdx+rcx] - lea rdx, [rdx+2*rcx] - paddw xmm3, xmm4 - movdqu xmm7, [rdx] - paddw xmm0, xmm1 - movdqu xmm1, [rdx+rcx] - paddw xmm0, xmm3 - psadbw xmm5, [rdi] - psadbw xmm6, [rdi+rsi] - lea rdi, [rdi+2*rsi] - paddw xmm5, xmm6 - psadbw xmm7, [rdi] - psadbw xmm1, [rdi+rsi] - paddw xmm7, xmm1 - paddw xmm0, xmm5 - paddw xmm0, xmm7 - SAD_END_SSE2 - -;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x8_%1 - movdqu xmm0, [rdx] - movdqu xmm2, [rdx+rcx] - lea rdx, [rdx+2*rcx] - movdqu xmm3, [rdx] - movdqu xmm4, [rdx+rcx] - psadbw xmm0, [rdi] - psadbw xmm2, [rdi+rsi] - lea rdi, [rdi+2*rsi] - psadbw xmm3, [rdi] - psadbw xmm4, [rdi+rsi] - lea rdi, [rdi+2*rsi] - lea rdx, [rdx+2*rcx] - paddw xmm0, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm3 - movdqu xmm1, [rdx] - movdqu xmm2, [rdx+rcx] - lea rdx, [rdx+2*rcx] - movdqu xmm3, [rdx] - movdqu xmm4, [rdx+rcx] - psadbw xmm1, [rdi] - psadbw xmm2, [rdi+rsi] - lea rdi, [rdi+2*rsi] - psadbw xmm3, [rdi] - psadbw xmm4, [rdi+rsi] - lea rdi, [rdi+2*rsi] - lea rdx, [rdx+2*rcx] - paddw xmm1, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm1 - paddw xmm0, xmm3 - SAD_END_SSE2 -%endmacro - -SAD_W16 sse2 -%ifdef HAVE_SSE3 -%define movdqu lddqu -SAD_W16 sse3 -%undef movdqu -%endif - - -; sad x3 / x4 - -%macro 
SAD_X3_START_1x16P 0 - movdqa xmm3, [parm1q] - movdqu xmm0, [parm2q] - movdqu xmm1, [parm3q] - movdqu xmm2, [parm4q] - psadbw xmm0, xmm3 - psadbw xmm1, xmm3 - psadbw xmm2, xmm3 -%endmacro - -%macro SAD_X3_1x16P 2 - movdqa xmm3, [parm1q+%1] - movdqu xmm4, [parm2q+%2] - movdqu xmm5, [parm3q+%2] - movdqu xmm6, [parm4q+%2] - psadbw xmm4, xmm3 - psadbw xmm5, xmm3 - psadbw xmm6, xmm3 - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm6 -%endmacro - -%macro SAD_X3_2x16P 1 -%if %1 - SAD_X3_START_1x16P -%else - SAD_X3_1x16P 0, 0 -%endif - SAD_X3_1x16P FENC_STRIDE, parm5q - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm5q] - lea parm3q, [parm3q+2*parm5q] - lea parm4q, [parm4q+2*parm5q] -%endmacro - -%macro SAD_X4_START_1x16P 0 - movdqa xmm7, [parm1q] - movdqu xmm0, [parm2q] - movdqu xmm1, [parm3q] - movdqu xmm2, [parm4q] - movdqu xmm3, [parm5q] - psadbw xmm0, xmm7 - psadbw xmm1, xmm7 - psadbw xmm2, xmm7 - psadbw xmm3, xmm7 -%endmacro - -%macro SAD_X4_1x16P 2 - movdqa xmm7, [parm1q+%1] - movdqu xmm4, [parm2q+%2] - movdqu xmm5, [parm3q+%2] - movdqu xmm6, [parm4q+%2] - movdqu xmm8, [parm5q+%2] - psadbw xmm4, xmm7 - psadbw xmm5, xmm7 - psadbw xmm6, xmm7 - psadbw xmm8, xmm7 - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm6 - paddw xmm3, xmm8 -%endmacro - -%macro SAD_X4_2x16P 1 -%if %1 - SAD_X4_START_1x16P -%else - SAD_X4_1x16P 0, 0 -%endif - SAD_X4_1x16P FENC_STRIDE, parm6q - add parm1q, 2*FENC_STRIDE - lea parm2q, [parm2q+2*parm6q] - lea parm3q, [parm3q+2*parm6q] - lea parm4q, [parm4q+2*parm6q] - lea parm5q, [parm5q+2*parm6q] -%endmacro - -%macro SAD_X3_END 0 - movhlps xmm4, xmm0 - movhlps xmm5, xmm1 - movhlps xmm6, xmm2 - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm6 - movd [parm6q+0], xmm0 - movd [parm6q+4], xmm1 - movd [parm6q+8], xmm2 - ret -%endmacro - -%macro SAD_X4_END 0 - mov rax, parm7q - psllq xmm1, 32 - psllq xmm3, 32 - paddw xmm0, xmm1 - paddw xmm2, xmm3 - movhlps xmm1, xmm0 - movhlps xmm3, xmm2 - paddw xmm0, xmm1 - paddw xmm2, xmm3 - movq [rax+0], xmm0 - movq [rax+8], xmm2 - ret -%endmacro - -;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) -;----------------------------------------------------------------------------- -%macro SAD_X 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4 - SAD_X%1_2x%2P 1 -%rep %3/2-1 - SAD_X%1_2x%2P 0 -%endrep - SAD_X%1_END -%endmacro - -SAD_X 3, 16, 16, sse2 -SAD_X 3, 16, 8, sse2 -SAD_X 4, 16, 16, sse2 -SAD_X 4, 16, 8, sse2 - -%ifdef HAVE_SSE3 -%define movdqu lddqu -SAD_X 3, 16, 16, sse3 -SAD_X 3, 16, 8, sse3 -SAD_X 4, 16, 16, sse3 -SAD_X 4, 16, 8, sse3 -%undef movdqu -%endif - - -; Core2 (Conroe) can load unaligned data just as quickly as aligned data... -; unless the unaligned data spans the border between 2 cachelines, in which -; case it's really slow. The exact numbers may differ, but all Intel cpus -; have a large penalty for cacheline splits. -; (8-byte alignment exactly half way between two cachelines is ok though.) -; LDDQU was supposed to fix this, but it only works on Pentium 4. -; So in the split case we load aligned data and explicitly perform the -; alignment between registers. Like on archs that have only aligned loads, -; except complicated by the fact that PALIGNR takes only an immediate, not -; a variable alignment. 
-; It is also possible to hoist the realignment to the macroblock level (keep -; 2 copies of the reference frame, offset by 32 bytes), but the extra memory -; needed for that method makes it often slower. - -; sad 16x16 costs on Core2: -; good offsets: 49 cycles (50/64 of all mvs) -; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles) -; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles) -; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) - -; computed jump assumes this loop is exactly 64 bytes -%macro SAD16_CACHELINE_LOOP 1 ; alignment -ALIGN 16 -sad_w16_align%1: - movdqa xmm1, [rdx+16] - movdqa xmm2, [rdx+rcx+16] - palignr xmm1, [rdx], %1 - palignr xmm2, [rdx+rcx], %1 - psadbw xmm1, [rdi] - psadbw xmm2, [rdi+rsi] - paddw xmm0, xmm1 - paddw xmm0, xmm2 - lea rdx, [rdx+2*rcx] - lea rdi, [rdi+2*rsi] - dec eax - jg sad_w16_align%1 - ret -%endmacro - -%macro SAD16_CACHELINE_FUNC 1 ; height -cglobal x264_pixel_sad_16x%1_cache64_ssse3 - mov eax, parm3d - and eax, 0x37 - cmp eax, 0x30 - jle x264_pixel_sad_16x%1_sse2 - mov eax, parm3d - and eax, 15 - shl eax, 6 -%ifdef __PIC__ - lea r10, [sad_w16_align1 - 64 GLOBAL] - add r10, rax -%else - lea r10, [sad_w16_align1 - 64 + rax] -%endif - and parm3q, ~15 - mov eax, %1/2 - pxor xmm0, xmm0 - call r10 - SAD_END_SSE2 -%endmacro - -%macro SAD8_CACHELINE_FUNC 1 ; height -cglobal x264_pixel_sad_8x%1_cache64_mmxext - mov eax, parm3d - and eax, 0x3f - cmp eax, 0x38 - jle x264_pixel_sad_8x%1_mmxext - and eax, 7 - shl eax, 3 - movd mm6, [sw_64 GLOBAL] - movd mm7, eax - psubw mm6, mm7 - and parm3q, ~7 - mov eax, %1/2 - pxor mm0, mm0 -.loop: - movq mm1, [parm3q+8] - movq mm2, [parm3q+parm4q+8] - movq mm3, [parm3q] - movq mm4, [parm3q+parm4q] - psllq mm1, mm6 - psllq mm2, mm6 - psrlq mm3, mm7 - psrlq mm4, mm7 - por mm1, mm3 - por mm2, mm4 - psadbw mm1, [parm1q] - psadbw mm2, [parm1q+parm2q] - paddw mm0, mm1 - paddw mm0, mm2 - lea parm3q, [parm3q+2*parm4q] - lea parm1q, [parm1q+2*parm2q] - dec eax - jg .loop - movd eax, mm0 - ret -%endmacro - - -; sad_x3/x4_cache64: check each mv. -; if they're all within a cacheline, use normal sad_x3/x4. -; otherwise, send them individually to sad_cache64. 
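(As an aside, the split test used by the cache64 entry points above and by CHECK_SPLIT below amounts to roughly the following C sketch. The helper name and signature are invented here purely for illustration; x264 performs the check with a masked compare in asm and, on the slow path, rounds the pointer down and realigns with palignr or mmx shifts.)

    #include <stdint.h>

    /* Illustration only: approximate C equivalent of the CHECK_SPLIT test.
     * 'width' is the SAD block width in bytes (8 or 16). */
    static int sad_ref_splits_cacheline( intptr_t ref, int width )
    {
        int off = ref & 63;        /* offset within a 64-byte cacheline */
        if( off + width <= 64 )
            return 0;              /* the load fits in one line */
        if( width == 16 && !(off & 7) )
            return 0;              /* 8-aligned halfway split is cheap enough */
        return 1;                  /* take the aligned-load + realign path */
    }

In words: take the slow path only when the unaligned load actually crosses a 64-byte line, except that a 16-byte load split exactly at an 8-byte-aligned midpoint is tolerated, matching the parenthetical note above.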
-%macro CHECK_SPLIT 2 ; pix, width - mov eax, %1 - and eax, 0x37|%2 - cmp eax, 0x30|%2 - jg .split -%endmacro - -%macro SADX3_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver -cglobal x264_pixel_sad_x3_%1x%2_cache64_%4 - CHECK_SPLIT parm2d, %1 - CHECK_SPLIT parm3d, %1 - CHECK_SPLIT parm4d, %1 - jmp x264_pixel_sad_x3_%1x%2_%3 -.split: - push parm4q - push parm3q - mov parm3q, parm2q - mov parm2q, FENC_STRIDE - mov parm4q, parm5q - mov parm5q, parm1q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [parm6q], eax - pop parm3q - mov parm1q, parm5q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [parm6q+4], eax - pop parm3q - mov parm1q, parm5q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [parm6q+8], eax - ret -%endmacro - -%macro SADX4_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver -cglobal x264_pixel_sad_x4_%1x%2_cache64_%4 - CHECK_SPLIT parm2d, %1 - CHECK_SPLIT parm3d, %1 - CHECK_SPLIT parm4d, %1 - CHECK_SPLIT parm5d, %1 - jmp x264_pixel_sad_x4_%1x%2_%3 -.split: - mov r11, parm7q - push parm5q - push parm4q - push parm3q - mov parm3q, parm2q - mov parm2q, FENC_STRIDE - mov parm4q, parm6q - mov parm5q, parm1q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [r11], eax - pop parm3q - mov parm1q, parm5q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [r11+4], eax - pop parm3q - mov parm1q, parm5q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [r11+8], eax - pop parm3q - mov parm1q, parm5q - call x264_pixel_sad_%1x%2_cache64_%4 - mov [r11+12], eax - ret -%endmacro - -%macro SADX34_CACHELINE_FUNC 4 - SADX3_CACHELINE_FUNC %1, %2, %3, %4 - SADX4_CACHELINE_FUNC %1, %2, %3, %4 -%endmacro - -cextern x264_pixel_sad_8x16_mmxext -cextern x264_pixel_sad_8x8_mmxext -cextern x264_pixel_sad_8x4_mmxext -cextern x264_pixel_sad_x3_8x16_mmxext -cextern x264_pixel_sad_x3_8x8_mmxext -cextern x264_pixel_sad_x4_8x16_mmxext -cextern x264_pixel_sad_x4_8x8_mmxext - -; instantiate the aligned sads - -SAD8_CACHELINE_FUNC 4 -SAD8_CACHELINE_FUNC 8 -SAD8_CACHELINE_FUNC 16 -SADX34_CACHELINE_FUNC 8, 16, mmxext, mmxext -SADX34_CACHELINE_FUNC 8, 8, mmxext, mmxext - -%ifdef HAVE_SSE3 - -SAD16_CACHELINE_FUNC 8 -SAD16_CACHELINE_FUNC 16 -%assign i 1 -%rep 15 -SAD16_CACHELINE_LOOP i -%assign i i+1 -%endrep - -SADX34_CACHELINE_FUNC 16, 16, sse2, ssse3 -SADX34_CACHELINE_FUNC 16, 8, sse2, ssse3 - -%endif ; HAVE_SSE3 - - -; ssd - -%macro SSD_INC_2x16P_SSE2 0 - movdqu xmm1, [rdi] - movdqu xmm2, [rdx] - movdqu xmm3, [rdi+rsi] - movdqu xmm4, [rdx+rcx] - - movdqa xmm5, xmm1 - movdqa xmm6, xmm3 - psubusb xmm1, xmm2 - psubusb xmm3, xmm4 - psubusb xmm2, xmm5 - psubusb xmm4, xmm6 - por xmm1, xmm2 - por xmm3, xmm4 - - movdqa xmm2, xmm1 - movdqa xmm4, xmm3 - punpcklbw xmm1, xmm7 - punpckhbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - punpckhbw xmm4, xmm7 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - pmaddwd xmm4, xmm4 - - lea rdi, [rdi+2*rsi] - lea rdx, [rdx+2*rcx] - - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm0, xmm1 - paddd xmm0, xmm3 -%endmacro - -%macro SSD_START_SSE2 0 - pxor xmm7, xmm7 ; zero - pxor xmm0, xmm0 ; mm0 holds the sum -%endmacro - -%macro SSD_END_SSE2 0 - HADDD xmm0, xmm1 - movd eax, xmm0 - ret -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssd_16x16_sse2 - SSD_START_SSE2 -%rep 8 - SSD_INC_2x16P_SSE2 -%endrep - SSD_END_SSE2 - -;----------------------------------------------------------------------------- 
-; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssd_16x8_sse2 - SSD_START_SSE2 -%rep 4 - SSD_INC_2x16P_SSE2 -%endrep - SSD_END_SSE2 - - - -%macro SUMSUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro HADAMARD1x4 4 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %1, %3, %2, %4 -%endmacro - -%macro HADAMARD1x8 8 - SUMSUB_BADC %1, %5, %2, %6 - SUMSUB_BADC %3, %7, %4, %8 - SUMSUB_BADC %1, %3, %2, %4 - SUMSUB_BADC %5, %7, %6, %8 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %5, %6, %7, %8 -%endmacro - -;;; row transform not used, because phaddw is much slower than paddw on a Conroe -;%macro PHSUMSUB 3 -; movdqa %3, %1 -; phaddw %1, %2 -; phsubw %3, %2 -;%endmacro - -;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC -; PHSUMSUB %1, %2, %5 -; PHSUMSUB %3, %4, %2 -; PHSUMSUB %1, %3, %4 -; PHSUMSUB %5, %2, %3 -;%endmacro - -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 -%endmacro - -%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers - mov%1 %5, %3 - punpckh%2 %3, %4 - punpckl%2 %5, %4 -%endmacro - -%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC - SBUTTERFLY dqa, dq, %1, %2, %5 - SBUTTERFLY dqa, dq, %3, %4, %2 - SBUTTERFLY dqa, qdq, %1, %3, %4 - SBUTTERFLY dqa, qdq, %5, %2, %3 -%endmacro - -%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD - SBUTTERFLY dqa, wd, %1, %2, %5 - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, dq, %1, %3, %4 - SBUTTERFLY2 dqa, dq, %5, %2, %3 - SBUTTERFLY dqa, qdq, %1, %3, %2 - SBUTTERFLY2 dqa, qdq, %4, %5, %3 -%endmacro - -%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB - SBUTTERFLY dqa, wd, %1, %2, %9 - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, wd, %5, %6, %4 - SBUTTERFLY dqa, wd, %7, %8, %6 - SBUTTERFLY dqa, dq, %1, %3, %8 - SBUTTERFLY dqa, dq, %9, %2, %3 - SBUTTERFLY dqa, dq, %5, %7, %2 - SBUTTERFLY dqa, dq, %4, %6, %7 - SBUTTERFLY dqa, qdq, %1, %5, %6 - SBUTTERFLY dqa, qdq, %9, %4, %5 - SBUTTERFLY dqa, qdq, %8, %2, %4 - SBUTTERFLY dqa, qdq, %3, %7, %2 -%endmacro - -%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2] - movq %1, %3 - movq %2, %4 - punpcklbw %1, %2 - punpcklbw %2, %2 - psubw %1, %2 -%endmacro - -%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp - LOAD_DIFF_8P %1, %5, [parm1q], [parm3q] - LOAD_DIFF_8P %2, %6, [parm1q+parm2q], [parm3q+parm4q] - LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q] - LOAD_DIFF_8P %4, %6, [parm1q+r10], [parm3q+r11] -%endmacro - -%macro SUM1x8_SSE2 3 ; 01 junk sum - pxor %2, %2 - psubw %2, %1 - pmaxsw %1, %2 - paddusw %3, %1 -%endmacro - -%macro SUM4x4_SSE2 4 ; 02 13 junk sum - pxor %3, %3 - psubw %3, %1 - pmaxsw %1, %3 - - pxor %3, %3 - psubw %3, %2 - pmaxsw %2, %3 - - paddusw %4, %1 - paddusw %4, %2 -%endmacro - -%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum - pxor %3, %3 - pxor %6, %6 - psubw %3, %1 - psubw %6, %4 - pmaxsw %1, %3 - pmaxsw %4, %6 - pxor %3, %3 - pxor %6, %6 - psubw %3, %2 - psubw %6, %5 - pmaxsw %2, %3 - pmaxsw %5, %6 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 -%endmacro - -%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . 
sum - pabsw %1, %1 - pabsw %2, %2 - pabsw %4, %4 - pabsw %5, %5 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 -%endmacro - -%macro SATD_TWO_SSE2 0 - LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] - HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 - TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4 - HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 - SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6 -%endmacro - -%macro SATD_START 0 - pxor xmm6, xmm6 - lea r10, [3*parm2q] - lea r11, [3*parm4q] -%endmacro - -%macro SATD_END 0 - psrlw xmm6, 1 - HADDW xmm6, xmm7 - movd eax, xmm6 - ret -%endmacro - -%macro SATDS 1 -;----------------------------------------------------------------------------- -; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x16_%1 - SATD_START - mov r8, rdi - mov r9, rdx - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - lea rdi, [r8+8] - lea rdx, [r9+8] - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x16_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x8_%1 - SATD_START - mov r8, rdi - mov r9, rdx - SATD_TWO_SSE2 - SATD_TWO_SSE2 - lea rdi, [r8+8] - lea rdx, [r9+8] - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x8_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x4_%1 - SATD_START - SATD_TWO_SSE2 - SATD_END - - -;----------------------------------------------------------------------------- -; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sa8d_8x8_%1 - lea r10, [3*parm2q] - lea r11, [3*parm4q] - LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8 - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] - LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8 - - HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 - TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 - HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 - - pxor xmm10, xmm10 - SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 - SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 - psrlw xmm10, 1 - HADDW xmm10, xmm0 - movd eax, xmm10 - add r8d, eax ; preserve rounding for 16x16 - add eax, 1 - shr eax, 1 - ret - -;----------------------------------------------------------------------------- -; int 
x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -;; violates calling convention -cglobal x264_pixel_sa8d_16x16_%1 - xor r8d, r8d - call x264_pixel_sa8d_8x8_%1 ; pix[0] - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] - call x264_pixel_sa8d_8x8_%1 ; pix[8*stride] - lea r10, [3*parm2q-2] - lea r11, [3*parm4q-2] - shl r10, 2 - shl r11, 2 - sub parm1q, r10 - sub parm3q, r11 - call x264_pixel_sa8d_8x8_%1 ; pix[8] - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] - call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8] - mov eax, r8d - add eax, 1 - shr eax, 1 - ret -%endmacro ; SATDS - -%define SUM8x4 SUM8x4_SSE2 -SATDS sse2 -%ifdef HAVE_SSE3 -%define SUM8x4 SUM8x4_SSSE3 -SATDS ssse3 -%endif - - - -;----------------------------------------------------------------------------- -; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_sa8d_x3_8x8_core_sse2 - ; 8x8 hadamard - pxor xmm4, xmm4 - movq xmm0, [parm1q+0*FENC_STRIDE] - movq xmm7, [parm1q+1*FENC_STRIDE] - movq xmm6, [parm1q+2*FENC_STRIDE] - movq xmm3, [parm1q+3*FENC_STRIDE] - movq xmm5, [parm1q+4*FENC_STRIDE] - movq xmm1, [parm1q+5*FENC_STRIDE] - movq xmm8, [parm1q+6*FENC_STRIDE] - movq xmm2, [parm1q+7*FENC_STRIDE] - punpcklbw xmm0, xmm4 - punpcklbw xmm7, xmm4 - punpcklbw xmm6, xmm4 - punpcklbw xmm3, xmm4 - punpcklbw xmm5, xmm4 - punpcklbw xmm1, xmm4 - punpcklbw xmm8, xmm4 - punpcklbw xmm2, xmm4 - HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2 - TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4 - HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 - - ; dc - movzx edi, word [parm2q+0] - add di, word [parm2q+16] - add edi, 8 - and edi, -16 - shl edi, 2 - - pxor xmm15, xmm15 - movdqa xmm8, xmm2 - movdqa xmm9, xmm3 - movdqa xmm10, xmm4 - movdqa xmm11, xmm5 - SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15 - movdqa xmm8, xmm6 - movdqa xmm9, xmm7 - SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15 - movdqa xmm8, xmm1 - SUM1x8_SSE2 xmm8, xmm10, xmm15 - movdqa xmm14, xmm15 ; 7x8 sum - - movdqa xmm8, [parm2q+0] ; left edge - movd xmm9, edi - psllw xmm8, 3 - psubw xmm8, xmm0 - psubw xmm9, xmm0 - SUM1x8_SSE2 xmm8, xmm10, xmm14 - SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum - punpcklwd xmm0, xmm1 - punpcklwd xmm2, xmm3 - punpcklwd xmm4, xmm5 - punpcklwd xmm6, xmm7 - punpckldq xmm0, xmm2 - punpckldq xmm4, xmm6 - punpcklqdq xmm0, xmm4 ; transpose - movdqa xmm1, [parm2q+16] ; top edge - movdqa xmm2, xmm15 - psllw xmm1, 3 - psrldq xmm2, 2 ; 8x7 sum - psubw xmm0, xmm1 ; 8x1 sum - SUM1x8_SSE2 xmm0, xmm1, xmm2 - - HADDW xmm14, xmm3 - movd eax, xmm14 - add eax, 2 - shr eax, 2 - mov [parm3q+4], eax ; i8x8_h sa8d - HADDW xmm15, xmm4 - movd eax, xmm15 - add eax, 2 - shr eax, 2 - mov [parm3q+8], eax ; i8x8_dc sa8d - HADDW xmm2, xmm5 - movd eax, xmm2 - add eax, 2 - shr eax, 2 - mov [parm3q+0], eax ; i8x8_v sa8d - - ret - - - -;----------------------------------------------------------------------------- -; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_4x4x2_core_sse2 - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm2, xmm2 - pxor xmm3, xmm3 - pxor xmm4, xmm4 - movdqa xmm8, [pw_1 GLOBAL] -%rep 4 - movq xmm5, [parm1q] - 
movq xmm6, [parm3q] - punpcklbw xmm5, xmm0 - punpcklbw xmm6, xmm0 - paddw xmm1, xmm5 - paddw xmm2, xmm6 - movdqa xmm7, xmm5 - pmaddwd xmm5, xmm5 - pmaddwd xmm7, xmm6 - pmaddwd xmm6, xmm6 - paddd xmm3, xmm5 - paddd xmm4, xmm7 - paddd xmm3, xmm6 - add parm1q, parm2q - add parm3q, parm4q -%endrep - ; PHADDW xmm1, xmm2 - ; PHADDD xmm3, xmm4 - pshufd xmm5, xmm3, 0xB1 - pmaddwd xmm1, xmm8 - pmaddwd xmm2, xmm8 - pshufd xmm6, xmm4, 0xB1 - packssdw xmm1, xmm2 - paddd xmm3, xmm5 - pshufd xmm1, xmm1, 0xD8 - paddd xmm4, xmm6 - pmaddwd xmm1, xmm8 - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - punpckhdq xmm5, xmm4 - movq [parm5q+ 0], xmm1 - movq [parm5q+ 8], xmm3 - psrldq xmm1, 8 - movq [parm5q+16], xmm1 - movq [parm5q+24], xmm5 - ret - -;----------------------------------------------------------------------------- -; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_end4_sse2 - movdqa xmm0, [parm1q+ 0] - movdqa xmm1, [parm1q+16] - movdqa xmm2, [parm1q+32] - movdqa xmm3, [parm1q+48] - movdqa xmm4, [parm1q+64] - paddd xmm0, [parm2q+ 0] - paddd xmm1, [parm2q+16] - paddd xmm2, [parm2q+32] - paddd xmm3, [parm2q+48] - paddd xmm4, [parm2q+64] - paddd xmm0, xmm1 - paddd xmm1, xmm2 - paddd xmm2, xmm3 - paddd xmm3, xmm4 - movdqa xmm5, [ssim_c1 GLOBAL] - movdqa xmm6, [ssim_c2 GLOBAL] - TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 - -; s1=mm0, s2=mm3, ss=mm4, s12=mm2 - movdqa xmm1, xmm3 - pslld xmm3, 16 - pmaddwd xmm1, xmm0 ; s1*s2 - por xmm0, xmm3 - pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2 - pslld xmm1, 1 - pslld xmm2, 7 - pslld xmm4, 6 - psubd xmm2, xmm1 ; covar*2 - psubd xmm4, xmm0 ; vars - paddd xmm0, xmm5 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm4, xmm6 - cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1) - cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1) - cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2) - cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2) - mulps xmm1, xmm2 - mulps xmm0, xmm4 - divps xmm1, xmm0 ; ssim - - neg parm3q -%ifdef __PIC__ - lea rax, [mask_ff + 16 GLOBAL] - movdqu xmm3, [rax + parm3q*4] -%else - movdqu xmm3, [mask_ff + parm3q*4 + 16] -%endif - pand xmm1, xmm3 - movhlps xmm0, xmm1 - addps xmm0, xmm1 - pshuflw xmm1, xmm0, 0xE - addss xmm0, xmm1 - ret - diff --git a/common/cpu.c b/common/cpu.c index 1de9fb16..a486793e 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -48,8 +48,10 @@ uint32_t x264_cpu_detect( void ) int max_extended_cap; int cache; +#ifndef ARCH_X86_64 if( !x264_cpu_cpuid_test() ) return 0; +#endif x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 ); if( eax == 0 ) diff --git a/common/dct.c b/common/dct.c index 69e72588..ae95ab5f 100644 --- a/common/dct.c +++ b/common/dct.c @@ -23,7 +23,7 @@ #include "common.h" #ifdef HAVE_MMX -# include "i386/dct.h" +# include "x86/dct.h" #endif #ifdef ARCH_PPC # include "ppc/dct.h" @@ -597,11 +597,9 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) if( cpu&X264_CPU_MMX ) pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmx; #endif -#ifdef ARCH_X86_64 if( cpu&X264_CPU_SSE2 ) pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; #endif -#endif #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) diff --git a/common/frame.c b/common/frame.c index b5eec92d..4c4cc910 100644 --- a/common/frame.c +++ b/common/frame.c @@ -751,10 +751,9 @@ void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta 
); void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -#ifdef ARCH_X86_64 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -#else +#ifdef ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); @@ -789,17 +788,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; - -#ifdef ARCH_X86_64 +#ifdef ARCH_X86 + pf->deblock_v_luma = x264_deblock_v_luma_mmxext; + pf->deblock_h_luma = x264_deblock_h_luma_mmxext; +#endif if( cpu&X264_CPU_SSE2 ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; } -#else - pf->deblock_v_luma = x264_deblock_v_luma_mmxext; - pf->deblock_h_luma = x264_deblock_h_luma_mmxext; -#endif } #endif diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm deleted file mode 100644 index cfa64f31..00000000 --- a/common/i386/dct-a.asm +++ /dev/null @@ -1,807 +0,0 @@ -;***************************************************************************** -;* dct.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ -;* -;* Authors: Laurent Aimar (initial version) -;* Min Chen (converted to nasm) -;* Christian Heine (dct8/idct8 functions) -;* Loren Merritt (misc) -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2004.04.28 portab all 4x4 function to nasm (CM) * -;* 2005.08.24 added mmxext optimized dct8/idct8 functions (CH) * -;* * -;***************************************************************************** - -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -%macro MMX_ZERO 1 - pxor %1, %1 -%endmacro - -%macro MMX_LOAD_DIFF_4P 5 - movd %1, %4 - punpcklbw %1, %3 - movd %2, %5 - punpcklbw %2, %3 - psubw %1, %2 -%endmacro - -%macro MMX_SUMSUB_BA 2 - paddw %1, %2 - paddw %2, %2 - psubw %2, %1 -%endmacro - -%macro MMX_SUMSUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro MMX_SUMSUB2_AB 3 - movq %3, %1 - paddw %1, %1 - paddw %1, %2 - psubw %3, %2 - psubw %3, %2 -%endmacro - -%macro MMX_SUMSUBD2_AB 4 - movq %4, %1 - movq %3, %2 - psraw %2, 1 - psraw %4, 1 - paddw %1, %2 - psubw %4, %3 -%endmacro - -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 -%endmacro - -;----------------------------------------------------------------------------- -; input ABCD output ADTC -;----------------------------------------------------------------------------- -%macro MMX_TRANSPOSE 5 - SBUTTERFLY q, wd, %1, %2, %5 - SBUTTERFLY q, wd, %3, %4, %2 - SBUTTERFLY q, dq, %1, %3, %4 - SBUTTERFLY q, dq, %5, %2, %3 -%endmacro - -%macro MMX_STORE_DIFF_4P 5 - paddw %1, %3 - psraw %1, 6 - movd %2, %5 - punpcklbw %2, %4 - paddsw %1, %2 - packuswb %1, %1 - movd %5, %1 -%endmacro - -;============================================================================= -; Local Data (Read Only) -;============================================================================= - -SECTION_RODATA - -;----------------------------------------------------------------------------- -; Various memory constants (trigonometric values or rounding values) -;----------------------------------------------------------------------------- - -ALIGN 16 -pw_32: times 8 dw 32 -pw_1: times 4 dw 1 - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -;----------------------------------------------------------------------------- -; void __cdecl x264_dct4x4dc_mmx( int16_t d[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_dct4x4dc_mmx - mov eax, [esp+ 4] - movq mm0, [eax+ 0] - movq mm1, [eax+ 8] - movq mm2, [eax+16] - movq mm3, [eax+24] - - picgetgot edx - - MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 - - MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - - movq mm6, [pw_1 GLOBAL] - paddw mm0, mm6 - paddw mm2, mm6 - psraw mm0, 1 - movq [eax+ 0], mm0 - psraw mm2, 1 - movq [eax+ 8], mm2 - paddw mm3, mm6 - paddw mm4, mm6 - psraw mm3, 1 - movq [eax+16], mm3 - psraw mm4, 1 - movq [eax+24], mm4 - ret - 
-;----------------------------------------------------------------------------- -; void __cdecl x264_idct4x4dc_mmx( int16_t d[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_idct4x4dc_mmx - mov eax, [esp+ 4] - movq mm0, [eax+ 0] - movq mm1, [eax+ 8] - movq mm2, [eax+16] - movq mm3, [eax+24] - - MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 - - MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - - movq [eax+ 0], mm0 - movq [eax+ 8], mm2 - movq [eax+16], mm3 - movq [eax+24], mm4 - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -cglobal x264_sub4x4_dct_mmx - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+12] ; pix2 - - MMX_ZERO mm7 - - ; Load 4 lines - MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax+0*FENC_STRIDE], [ecx+0*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+1*FENC_STRIDE], [ecx+1*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*FENC_STRIDE], [ecx+2*FDEC_STRIDE] - MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+3*FENC_STRIDE], [ecx+3*FDEC_STRIDE] - - MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 - - MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 - MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 - - ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 - MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1 - - MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 - - MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 - MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 - - mov eax, [esp+ 4] ; dct - movq [eax+ 0], mm1 - movq [eax+ 8], mm2 - movq [eax+16], mm3 - movq [eax+24], mm0 - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_add4x4_idct_mmx - ; Load dct coeffs - mov eax, [esp+ 8] ; dct - movq mm0, [eax+ 0] - movq mm1, [eax+ 8] - movq mm2, [eax+16] - movq mm3, [eax+24] - - mov eax, [esp+ 4] ; p_dst - - picgetgot edx - - MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 - MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 - - ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 - MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3 - - MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 - MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 - - MMX_ZERO mm7 - movq mm6, [pw_32 GLOBAL] - - MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+2*FDEC_STRIDE] - MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE] - - ret - - - -; ============================================================================= -; 8x8 Transform -; 
============================================================================= - -; ----------------------------------------------------------------------------- -; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2) -; ----------------------------------------------------------------------------- -%macro MMX_LOAD_DIFF_8P 7 - movq %1, %5 - movq %2, %1 - punpcklbw %1, %7 - punpckhbw %2, %7 - movq %3, %6 - movq %4, %3 - punpcklbw %3, %7 - punpckhbw %4, %7 - psubw %1, %3 - psubw %2, %4 -%endmacro - -%macro MMX_LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4 - movq %2, %3 - movq %1, %4 - MMX_SUMSUB_BA %1, %2 -%endmacro - -%macro MMX_STORE_DIFF_8P 4 - psraw %1, 6 - movq %3, %2 - punpcklbw %3, %4 - paddsw %1, %3 - packuswb %1, %1 - movq %2, %1 -%endmacro - - -;----------------------------------------------------------------------------- -; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 ); -;----------------------------------------------------------------------------- -ALIGN 16 -x264_pixel_sub_8x8_mmx: - - mov edx, [esp+ 4] ; diff - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+12] ; pix2 - - MMX_ZERO mm7 - - %assign disp 0 - %rep 8 - MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [eax], [ecx], mm7 - movq [edx+disp], mm0 - movq [edx+disp+8], mm1 - add eax, FENC_STRIDE - add ecx, FDEC_STRIDE - %assign disp disp+16 - %endrep - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] ); -;----------------------------------------------------------------------------- -ALIGN 16 -x264_ydct8_mmx: - - mov eax, [esp+04] ; dest - - ;------------------------------------------------------------------------- - ; vertical dct ( compute 4 columns at a time -> 2 loops ) - ;------------------------------------------------------------------------- - - %assign disp 0 - %rep 2 - - MMX_LOADSUMSUB mm2, mm3, [eax+disp+0*16], [eax+disp+7*16] ; mm2 = s07, mm3 = d07 - MMX_LOADSUMSUB mm1, mm5, [eax+disp+1*16], [eax+disp+6*16] ; mm1 = s16, mm5 = d16 - MMX_LOADSUMSUB mm0, mm6, [eax+disp+2*16], [eax+disp+5*16] ; mm0 = s25, mm6 = d25 - MMX_LOADSUMSUB mm4, mm7, [eax+disp+3*16], [eax+disp+4*16] ; mm4 = s34, mm7 = d34 - - MMX_SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2 - MMX_SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3 - MMX_SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm1 = dst4 - - movq [eax+disp+0*16], mm0 - movq [eax+disp+4*16], mm4 - - movq mm0, mm1 ; a3 - psraw mm0, 1 ; a3>>1 - paddw mm0, mm2 ; a2 + (a3>>1) - psraw mm2, 1 ; a2>>1 - psubw mm2, mm1 ; (a2>>1) - a3 - - movq [eax+disp+2*16], mm0 - movq [eax+disp+6*16], mm2 - - movq mm0, mm6 - psraw mm0, 1 - paddw mm0, mm6 ; d25+(d25>>1) - movq mm1, mm3 - psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1)) - psubw mm1, mm0 - - movq mm0, mm5 - psraw mm0, 1 - paddw mm0, mm5 ; d16+(d16>>1) - movq mm2, mm3 - paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1)) - psubw mm2, mm0 - - movq mm0, mm3 - psraw mm0, 1 - paddw mm0, mm3 ; d07+(d07>>1) - paddw mm0, mm5 - paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1)) - - movq mm3, mm7 - psraw mm3, 1 - paddw mm3, mm7 ; d34+(d34>>1) - paddw mm3, mm5 - psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1)) - - movq mm7, mm3 - psraw mm7, 2 - paddw mm7, mm0 ; a4 + (a7>>2) - - movq mm6, mm2 - psraw mm6, 2 - paddw mm6, mm1 ; a5 + (a6>>2) - - psraw mm0, 2 - psraw mm1, 2 - psubw mm0, mm3 ; (a4>>2) - a7 - psubw mm2, mm1 ; a6 - (a5>>2) - - movq [eax+disp+1*16], mm7 - movq [eax+disp+3*16], mm6 - movq [eax+disp+5*16], mm2 - movq [eax+disp+7*16], mm0 - - %assign disp disp+8 - %endrep - - ret - 
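The a0..a7 names in the comments above are the standard H.264 8-point forward-transform butterfly; x264_ydct8_mmx applies it vertically, four columns per pass, and x264_sub8x8_dct8_mmx further down obtains the horizontal pass by transposing and running ydct8 a second time. A scalar sketch of that 1D butterfly for one strided column, using the same intermediates the asm comments name (the function name and the stride parameter are illustrative, not the in-tree reference):

    #include <stdint.h>

    /* One 1D 8-point H.264 forward-DCT butterfly over a column of int16_t
     * spaced 'stride' elements apart; a0..a7 match the x264_ydct8_mmx comments. */
    static void dct8_1d_sketch( int16_t *d, int stride )
    {
        int s07 = d[0*stride] + d[7*stride], d07 = d[0*stride] - d[7*stride];
        int s16 = d[1*stride] + d[6*stride], d16 = d[1*stride] - d[6*stride];
        int s25 = d[2*stride] + d[5*stride], d25 = d[2*stride] - d[5*stride];
        int s34 = d[3*stride] + d[4*stride], d34 = d[3*stride] - d[4*stride];

        int a0 = s07 + s34;
        int a1 = s16 + s25;
        int a2 = s07 - s34;
        int a3 = s16 - s25;
        int a4 = d16 + d25 + (d07 + (d07>>1));
        int a5 = d07 - d34 - (d25 + (d25>>1));
        int a6 = d07 + d34 - (d16 + (d16>>1));
        int a7 = d16 - d25 + (d34 + (d34>>1));

        d[0*stride] =  a0 + a1;
        d[4*stride] =  a0 - a1;
        d[2*stride] =  a2 + (a3>>1);
        d[6*stride] = (a2>>1) - a3;
        d[1*stride] =  a4 + (a7>>2);
        d[3*stride] =  a5 + (a6>>2);
        d[5*stride] =  a6 - (a5>>2);
        d[7*stride] = (a4>>2) - a7;
    }

For a block laid out as int16_t d[8][8], the vertical pass is dct8_1d_sketch(&d[0][j], 8) for each column j; the MMX version simply processes four such columns per register pass, which is why its loop steps by 8 bytes and repeats twice.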
-;----------------------------------------------------------------------------- -; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] ); -;----------------------------------------------------------------------------- -ALIGN 16 -x264_yidct8_mmx: - - mov eax, [esp+04] ; dest - - ;------------------------------------------------------------------------- - ; vertical idct ( compute 4 columns at a time -> 2 loops ) - ;------------------------------------------------------------------------- - - %assign disp 0 - %rep 2 - - movq mm1, [eax+disp+1*16] ; mm1 = d1 - movq mm3, [eax+disp+3*16] ; mm3 = d3 - movq mm5, [eax+disp+5*16] ; mm5 = d5 - movq mm7, [eax+disp+7*16] ; mm7 = d7 - - movq mm4, mm7 - psraw mm4, 1 - movq mm0, mm5 - psubw mm0, mm7 - psubw mm0, mm4 - psubw mm0, mm3 ; mm0 = e1 - - movq mm6, mm3 - psraw mm6, 1 - movq mm2, mm7 - psubw mm2, mm6 - psubw mm2, mm3 - paddw mm2, mm1 ; mm2 = e3 - - movq mm4, mm5 - psraw mm4, 1 - paddw mm4, mm5 - paddw mm4, mm7 - psubw mm4, mm1 ; mm4 = e5 - - movq mm6, mm1 - psraw mm6, 1 - paddw mm6, mm1 - paddw mm6, mm5 - paddw mm6, mm3 ; mm6 = e7 - - movq mm1, mm0 - movq mm3, mm4 - movq mm5, mm2 - movq mm7, mm6 - psraw mm6, 2 - psraw mm3, 2 - psraw mm5, 2 - psraw mm0, 2 - paddw mm1, mm6 ; mm1 = f1 - paddw mm3, mm2 ; mm3 = f3 - psubw mm5, mm4 ; mm5 = f5 - psubw mm7, mm0 ; mm7 = f7 - - movq mm2, [eax+disp+2*16] ; mm2 = d2 - movq mm6, [eax+disp+6*16] ; mm6 = d6 - movq mm4, mm2 - movq mm0, mm6 - psraw mm4, 1 - psraw mm6, 1 - psubw mm4, mm0 ; mm4 = a4 - paddw mm6, mm2 ; mm6 = a6 - - movq mm2, [eax+disp+0*16] ; mm2 = d0 - movq mm0, [eax+disp+4*16] ; mm0 = d4 - MMX_SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2 - - MMX_SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6 - ; mm4 = f2, mm2 = f4 - - MMX_SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7 - ; mm5 = g1, mm4 = g6 - MMX_SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5 - ; mm1 = g3, mm0 = g4 - - movq [eax+disp+0*16], mm7 - movq [eax+disp+1*16], mm5 - movq [eax+disp+2*16], mm3 - movq [eax+disp+3*16], mm1 - movq [eax+disp+4*16], mm0 - movq [eax+disp+5*16], mm2 - movq [eax+disp+6*16], mm4 - movq [eax+disp+7*16], mm6 - - %assign disp disp+8 - %endrep - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] ); -;----------------------------------------------------------------------------- -ALIGN 16 -x264_pixel_add_8x8_mmx: - mov eax, [esp+4] ; dst - mov edx, [esp+8] ; src - - MMX_ZERO mm7 - - %assign disp 0 - %rep 8 - movq mm0, [eax] - movq mm2, [edx+disp] - movq mm3, [edx+disp+8] - movq mm1, mm0 - psraw mm2, 6 - psraw mm3, 6 - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - paddw mm0, mm2 - paddw mm1, mm3 - packuswb mm0, mm1 - movq [eax], mm0 - add eax, FDEC_STRIDE - %assign disp disp+16 - %endrep - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] ); -;----------------------------------------------------------------------------- -ALIGN 16 -x264_transpose_8x8_mmx: - mov eax, [esp+4] - - movq mm0, [eax ] - movq mm1, [eax+ 16] - movq mm2, [eax+ 32] - movq mm3, [eax+ 48] - MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4 - movq [eax ], mm0 - movq [eax+ 16], mm3 - movq [eax+ 32], mm4 - movq [eax+ 48], mm2 - - movq mm0, [eax+ 72] - movq mm1, [eax+ 88] - movq mm2, [eax+104] - movq mm3, [eax+120] - MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4 - movq [eax+ 72], mm0 - movq [eax+ 88], mm3 - movq [eax+104], mm4 - movq [eax+120], mm2 - - movq mm0, [eax+ 8] - 
movq mm1, [eax+ 24] - movq mm2, [eax+ 40] - movq mm3, [eax+ 56] - MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4 - movq mm1, [eax+ 64] - movq mm5, [eax+ 80] - movq mm6, [eax+ 96] - movq mm7, [eax+112] - - movq [eax+ 64], mm0 - movq [eax+ 80], mm3 - movq [eax+ 96], mm4 - movq [eax+112], mm2 - MMX_TRANSPOSE mm1, mm5, mm6, mm7, mm4 - movq [eax+ 8], mm1 - movq [eax+ 24], mm7 - movq [eax+ 40], mm4 - movq [eax+ 56], mm6 - - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_mmx - push dword [esp+12] - push dword [esp+12] - push dword [esp+12] - call x264_pixel_sub_8x8_mmx - call x264_ydct8_mmx - call x264_transpose_8x8_mmx - add esp, 12 - jmp x264_ydct8_mmx - -;----------------------------------------------------------------------------- -; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] ) -;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_mmx - mov eax, [esp+8] - add word [eax], 32 - push eax - call x264_yidct8_mmx - call x264_transpose_8x8_mmx - call x264_yidct8_mmx - add esp, 4 - jmp x264_pixel_add_8x8_mmx - -%macro IDCT8_1D 8 - movdqa %1, %3 - movdqa %5, %7 - psraw %3, 1 - psraw %7, 1 - psubw %3, %5 - paddw %7, %1 - movdqa %5, %2 - psraw %5, 1 - paddw %5, %2 - paddw %5, %4 - paddw %5, %6 - movdqa %1, %6 - psraw %1, 1 - paddw %1, %6 - paddw %1, %8 - psubw %1, %2 - psubw %2, %4 - psubw %6, %4 - paddw %2, %8 - psubw %6, %8 - psraw %4, 1 - psraw %8, 1 - psubw %2, %4 - psubw %6, %8 - movdqa %4, %5 - movdqa %8, %1 - psraw %4, 2 - psraw %8, 2 - paddw %4, %6 - paddw %8, %2 - psraw %6, 2 - psraw %2, 2 - psubw %5, %6 - psubw %2, %1 - movdqa %1, [eax+0x00] - movdqa %6, [eax+0x40] - MMX_SUMSUB_BA %6, %1 - MMX_SUMSUB_BA %7, %6 - MMX_SUMSUB_BA %3, %1 - MMX_SUMSUB_BA %5, %7 - MMX_SUMSUB_BA %2, %3 - MMX_SUMSUB_BA %8, %1 - MMX_SUMSUB_BA %4, %6 -%endmacro - -%macro TRANSPOSE8 9 - movdqa [%9], %8 - SBUTTERFLY dqa, wd, %1, %2, %8 - movdqa [%9+16], %8 - movdqa %8, [%9] - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, wd, %5, %6, %4 - SBUTTERFLY dqa, wd, %7, %8, %6 - SBUTTERFLY dqa, dq, %1, %3, %8 - movdqa [%9], %8 - movdqa %8, [16+%9] - SBUTTERFLY dqa, dq, %8, %2, %3 - SBUTTERFLY dqa, dq, %5, %7, %2 - SBUTTERFLY dqa, dq, %4, %6, %7 - SBUTTERFLY dqa, qdq, %1, %5, %6 - SBUTTERFLY dqa, qdq, %8, %4, %5 - movdqa [%9+16], %8 - movdqa %8, [%9] - SBUTTERFLY dqa, qdq, %8, %2, %4 - SBUTTERFLY dqa, qdq, %3, %7, %2 - movdqa %7, [%9+16] -%endmacro - -;----------------------------------------------------------------------------- -; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) -;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2 - mov ecx, [esp+4] - mov eax, [esp+8] - movdqa xmm1, [eax+0x10] - movdqa xmm2, [eax+0x20] - movdqa xmm3, [eax+0x30] - movdqa xmm5, [eax+0x50] - movdqa xmm6, [eax+0x60] - movdqa xmm7, [eax+0x70] - IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 - TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax - picgetgot edx - paddw xmm4, [pw_32 GLOBAL] - movdqa [eax+0x00], xmm4 - movdqa [eax+0x40], xmm2 - IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1 - movdqa [eax+0x60], xmm6 - movdqa [eax+0x70], xmm7 - pxor xmm7, xmm7 - MMX_STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm0, 
[ecx+FDEC_STRIDE*1], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7 - movdqa xmm0, [eax+0x60] - movdqa xmm1, [eax+0x70] - MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7 - MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7 - ret - -;----------------------------------------------------------------------------- -; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4], -; uint8_t *pix1, uint8_t *pix2 ) -;----------------------------------------------------------------------------- -%macro SUB_NxN_DCT 4 -cglobal %1 - mov edx, [esp+12] - mov ecx, [esp+ 8] - mov eax, [esp+ 4] - add edx, %4 - add ecx, %4 - add eax, %3 - push edx - push ecx - push eax - call %2 - add dword [esp+0], %3 - add dword [esp+4], %4*FENC_STRIDE-%4 - add dword [esp+8], %4*FDEC_STRIDE-%4 - call %2 - add dword [esp+0], %3 - add dword [esp+4], %4 - add dword [esp+8], %4 - call %2 - add esp, 12 - jmp %2 -%endmacro - -;----------------------------------------------------------------------------- -; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) -;----------------------------------------------------------------------------- -%macro ADD_NxN_IDCT 4 -cglobal %1 - mov ecx, [esp+8] - mov eax, [esp+4] - add ecx, %3 - add eax, %4 - push ecx - push eax - call %2 - add dword [esp+0], %4*FDEC_STRIDE-%4 - add dword [esp+4], %3 - call %2 - add dword [esp+0], %4 - add dword [esp+4], %3 - call %2 - add esp, 8 - jmp %2 -%endmacro - -SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4 -ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4 - -SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 128, 8 -ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 128, 8 - -SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8 -ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8 - -ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8 - -;----------------------------------------------------------------------------- -; void __cdecl x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_field_mmx - mov edx, [esp+8] - mov ecx, [esp+4] - punpcklwd mm0, [edx] - punpckhwd mm1, [edx] - punpcklwd mm2, [edx+8] - punpckhwd mm3, [edx+8] - punpcklwd mm4, [edx+16] - punpckhwd mm5, [edx+16] - punpcklwd mm6, [edx+24] - punpckhwd mm7, [edx+24] - psrad mm0, 16 - psrad mm1, 16 - psrad mm2, 16 - psrad mm3, 16 - psrad mm4, 16 - psrad mm5, 16 - psrad mm6, 16 - psrad mm7, 16 - movq [ecx ], mm0 - movq [ecx+16], mm2 - movq [ecx+24], mm3 - movq [ecx+32], mm4 - movq [ecx+40], mm5 - movq [ecx+48], mm6 - movq [ecx+56], mm7 - movq [ecx+12], mm1 - movd [ecx+ 8], mm2 - ret diff --git a/common/i386/deblock-a.asm b/common/i386/deblock-a.asm deleted file mode 100644 index f4c52c43..00000000 --- a/common/i386/deblock-a.asm +++ /dev/null @@ -1,503 +0,0 @@ -;***************************************************************************** -;* deblock-a.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* Authors: Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the 
Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. -;***************************************************************************** - -BITS 32 - -%include "i386inc.asm" - -SECTION_RODATA -pb_01: times 8 db 0x01 -pb_03: times 8 db 0x03 -pb_a1: times 8 db 0xa1 - -SECTION .text - -; expands to [base],...,[base+7*stride] -%define PASS8ROWS(base, base3, stride, stride3) \ - [base], [base+stride], [base+stride*2], [base3], \ - [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] - -; in: 8 rows of 4 bytes in %1..%8 -; out: 4 rows of 8 bytes in mm0..mm3 -%macro TRANSPOSE4x8_LOAD 8 - movd mm0, %1 - movd mm2, %2 - movd mm1, %3 - movd mm3, %4 - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - movq mm2, mm0 - punpcklwd mm0, mm1 - punpckhwd mm2, mm1 - - movd mm4, %5 - movd mm6, %6 - movd mm5, %7 - movd mm7, %8 - punpcklbw mm4, mm6 - punpcklbw mm5, mm7 - movq mm6, mm4 - punpcklwd mm4, mm5 - punpckhwd mm6, mm5 - - movq mm1, mm0 - movq mm3, mm2 - punpckldq mm0, mm4 - punpckhdq mm1, mm4 - punpckldq mm2, mm6 - punpckhdq mm3, mm6 -%endmacro - -; in: 4 rows of 8 bytes in mm0..mm3 -; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq mm4, mm0 - movq mm5, mm1 - movq mm6, mm2 - punpckhdq mm4, mm4 - punpckhdq mm5, mm5 - punpckhdq mm6, mm6 - - punpcklbw mm0, mm1 - punpcklbw mm2, mm3 - movq mm1, mm0 - punpcklwd mm0, mm2 - punpckhwd mm1, mm2 - movd %1, mm0 - punpckhdq mm0, mm0 - movd %2, mm0 - movd %3, mm1 - punpckhdq mm1, mm1 - movd %4, mm1 - - punpckhdq mm3, mm3 - punpcklbw mm4, mm5 - punpcklbw mm6, mm3 - movq mm5, mm4 - punpcklwd mm4, mm6 - punpckhwd mm5, mm6 - movd %5, mm4 - punpckhdq mm4, mm4 - movd %6, mm4 - movd %7, mm5 - punpckhdq mm5, mm5 - movd %8, mm5 -%endmacro - -%macro SBUTTERFLY 4 - movq %4, %2 - punpckl%1 %2, %3 - punpckh%1 %4, %3 -%endmacro - -; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 -; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] -%macro TRANSPOSE6x8_MEM 9 - movq mm0, %1 - movq mm1, %3 - movq mm2, %5 - movq mm3, %7 - SBUTTERFLY bw, mm0, %2, mm4 - SBUTTERFLY bw, mm1, %4, mm5 - SBUTTERFLY bw, mm2, %6, mm6 - movq [%9+0x10], mm5 - SBUTTERFLY bw, mm3, %8, mm7 - SBUTTERFLY wd, mm0, mm1, mm5 - SBUTTERFLY wd, mm2, mm3, mm1 - punpckhdq mm0, mm2 - movq [%9+0x00], mm0 - SBUTTERFLY wd, mm4, [%9+0x10], mm3 - SBUTTERFLY wd, mm6, mm7, mm2 - SBUTTERFLY dq, mm4, mm6, mm0 - SBUTTERFLY dq, mm5, mm1, mm7 - punpckldq mm3, mm2 - movq [%9+0x10], mm5 - movq [%9+0x20], mm7 - movq [%9+0x30], mm4 - movq [%9+0x40], mm0 - movq [%9+0x50], mm3 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT_MMX 5 - movq %5, %2 - movq %4, %1 - psubusb %5, %1 - psubusb %4, %2 - por %4, %5 - psubusb %4, %3 -%endmacro - -%macro DIFF_GT2_MMX 5 - movq %5, %2 - movq %4, %1 - psubusb %5, %1 - psubusb %4, %2 - psubusb %5, %3 - psubusb %4, %3 - pcmpeqb %4, %5 -%endmacro - -; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1 -; out: mm5=beta-1, mm7=mask -; clobbers: mm4,mm6 -%macro LOAD_MASK_MMX 2 - movd mm4, %1 - movd mm5, %2 - pshufw mm4, mm4, 0 - pshufw mm5, mm5, 0 - packuswb mm4, mm4 ; 8x alpha-1 - packuswb mm5, mm5 ; 8x beta-1 - DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1 - DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1 - por mm7, mm4 - DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1 - por mm7, mm4 - pxor mm6, mm6 - pcmpeqb mm7, mm6 -%endmacro - -; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) -; out: mm1=p0' mm2=q0' -; clobbers: mm0,3-6 -%macro DEBLOCK_P0_Q0_MMX 0 - movq mm5, mm1 - pxor mm5, mm2 ; p0^q0 - pand mm5, [pb_01 GLOBAL] ; (p0^q0)&1 - pcmpeqb mm4, mm4 - pxor mm3, mm4 - pavgb mm3, mm0 ; (p1 - q1 + 256)>>1 - pavgb mm3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 - pxor mm4, mm1 - pavgb mm4, mm2 ; (q0 - p0 + 256)>>1 - pavgb mm3, mm5 - paddusb mm3, mm4 ; d+128+33 - movq mm6, [pb_a1 GLOBAL] - psubusb mm6, mm3 - psubusb mm3, [pb_a1 GLOBAL] - pminub mm6, mm7 - pminub mm3, mm7 - psubusb mm1, mm6 - psubusb mm2, mm3 - paddusb mm1, mm3 - paddusb mm2, mm6 -%endmacro - -; in: mm1=p0 mm2=q0 -; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp -; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -; clobbers: q2, tmp, tc0 -%macro LUMA_Q1_MMX 6 - movq %6, mm1 - pavgb %6, mm2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) - pxor %6, %3 - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - movq %6, %1 - psubusb %6, %5 - paddusb %5, %1 - pmaxub %2, %6 - pminub %2, %5 - movq %4, %2 -%endmacro - - -SECTION .text - -;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v8_luma_mmxext - picpush ebx - picgetgot ebx - push edi - push esi - mov edi, [picesp+12] ; pix - mov esi, [picesp+16] ; stride - mov edx, [picesp+20] ; alpha - mov ecx, [picesp+24] ; beta - dec edx - dec ecx - mov eax, edi - sub eax, esi - sub eax, esi - sub eax, esi ; pix-3*stride - sub esp, 16 - - movq mm0, [eax+esi] ; p1 - movq mm1, [eax+2*esi] ; p0 - movq mm2, [edi] ; q0 - movq mm3, [edi+esi] ; q1 - LOAD_MASK_MMX edx, ecx - - mov ecx, [picesp+44] ; tc0, use only the low 16 bits - movd mm4, [ecx] - punpcklbw mm4, mm4 - punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0] - movq [esp+8], mm4 ; tc - pcmpeqb mm3, mm3 - pcmpgtb mm4, mm3 - pand mm4, mm7 - movq [esp+0], mm4 ; mask - - movq mm3, 
[eax] ; p2 - DIFF_GT2_MMX mm1, mm3, mm5, mm6, mm7 ; |p2-p0| > beta-1 - pand mm6, mm4 - pand mm4, [esp+8] ; tc - movq mm7, mm4 - psubb mm7, mm6 - pand mm6, mm4 - LUMA_Q1_MMX mm0, mm3, [eax], [eax+esi], mm6, mm4 - - movq mm4, [edi+2*esi] ; q2 - DIFF_GT2_MMX mm2, mm4, mm5, mm6, mm3 ; |q2-q0| > beta-1 - movq mm5, [esp+0] ; mask - pand mm6, mm5 - movq mm5, [esp+8] ; tc - pand mm5, mm6 - psubb mm7, mm6 - movq mm3, [edi+esi] - LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6 - - DEBLOCK_P0_Q0_MMX - movq [eax+2*esi], mm1 - movq [edi], mm2 - - add esp, 16 - pop esi - pop edi - picpop ebx - ret - - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_mmxext - push ebx - push ebp - mov eax, [esp+12] ; pix - mov ebx, [esp+16] ; stride - lea ebp, [ebx+ebx*2] - sub eax, 4 - lea ecx, [eax+ebp] - sub esp, 96 -%define pix_tmp esp - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp - lea eax, [eax+ebx*8] - lea ecx, [ecx+ebx*8] - TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp+8 - - ; vertical filter - push dword [esp+124] ; tc0 - push dword [esp+124] ; beta - push dword [esp+124] ; alpha - push dword 16 - push dword pix_tmp - add dword [esp], 0x40 ; pix_tmp+0x30 - call x264_deblock_v8_luma_mmxext - - add dword [esp ], 8 ; pix_tmp+0x38 - add dword [esp+16], 2 ; tc0+2 - call x264_deblock_v8_luma_mmxext - add esp, 20 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov eax, [esp+108] ; pix - sub eax, 2 - lea ecx, [eax+ebp] - - movq mm0, [pix_tmp+0x10] - movq mm1, [pix_tmp+0x20] - movq mm2, [pix_tmp+0x30] - movq mm3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp) - - lea eax, [eax+ebx*8] - lea ecx, [ecx+ebx*8] - movq mm0, [pix_tmp+0x18] - movq mm1, [pix_tmp+0x28] - movq mm2, [pix_tmp+0x38] - movq mm3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp) - - add esp, 96 - pop ebp - pop ebx - ret - - -%macro CHROMA_V_START 0 - push edi - push esi - mov edi, [esp+12] ; pix - mov esi, [esp+16] ; stride - mov edx, [esp+20] ; alpha - mov ecx, [esp+24] ; beta - dec edx - dec ecx - mov eax, edi - sub eax, esi - sub eax, esi -%endmacro - -%macro CHROMA_H_START 0 - push edi - push esi - push ebp - mov edi, [esp+16] - mov esi, [esp+20] - mov edx, [esp+24] - mov ecx, [esp+28] - dec edx - dec ecx - sub edi, 2 - mov ebp, esi - add ebp, esi - add ebp, esi - mov eax, edi - add edi, ebp -%endmacro - -%macro CHROMA_END 0 - pop esi - pop edi - ret -%endmacro - -;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext - CHROMA_V_START - push ebx - mov ebx, [esp+32] ; tc0 - - movq mm0, [eax] - movq mm1, [eax+esi] - movq mm2, [edi] - movq mm3, [edi+esi] - - LOAD_MASK_MMX edx, ecx - movd mm6, [ebx] - punpcklbw mm6, mm6 - pand mm7, mm6 - picgetgot ebx ; no need to push ebx, it's already been done - DEBLOCK_P0_Q0_MMX - - movq [eax+esi], mm1 - movq [edi], mm2 - - pop ebx - CHROMA_END - - -;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int 
beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext - CHROMA_H_START - push ebx - mov ebx, [esp+36] ; tc0 - sub esp, 16 - - TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) - movq [esp+8], mm0 - movq [esp+0], mm3 - - LOAD_MASK_MMX edx, ecx - movd mm6, [ebx] - punpcklbw mm6, mm6 - pand mm7, mm6 - picgetgot ebx ; no need to push ebx, it's already been done - DEBLOCK_P0_Q0_MMX - - movq mm0, [esp+8] - movq mm3, [esp+0] - TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) - - add esp, 16 - pop ebx - pop ebp - CHROMA_END - - -; in: %1=p0 %2=p1 %3=q1 -; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 -%macro CHROMA_INTRA_P0 3 - movq mm4, %1 - pxor mm4, %3 - pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1 - pavgb %1, %3 - psubusb %1, mm4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) -%endmacro - -%macro CHROMA_INTRA_BODY 0 - LOAD_MASK_MMX edx, ecx - movq mm5, mm1 - movq mm6, mm2 - CHROMA_INTRA_P0 mm1, mm0, mm3 - CHROMA_INTRA_P0 mm2, mm3, mm0 - psubb mm1, mm5 - psubb mm2, mm6 - pand mm1, mm7 - pand mm2, mm7 - paddb mm1, mm5 - paddb mm2, mm6 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext - CHROMA_V_START - picpush ebx - picgetgot ebx - movq mm0, [eax] - movq mm1, [eax+esi] - movq mm2, [edi] - movq mm3, [edi+esi] - CHROMA_INTRA_BODY - movq [eax+esi], mm1 - movq [edi], mm2 - picpop ebx - CHROMA_END - -;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext - CHROMA_H_START - picpush ebx - picgetgot ebx - TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) - CHROMA_INTRA_BODY - TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) - picpop ebx - pop ebp ; needed because of CHROMA_H_START - CHROMA_END - diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm deleted file mode 100644 index 3a34e2ea..00000000 --- a/common/i386/mc-a.asm +++ /dev/null @@ -1,633 +0,0 @@ -;***************************************************************************** -;* mc.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $ -;* -;* Authors: Min Chen (converted to nasm) -;* Laurent Aimar (init algorithm) -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -;***************************************************************************** -;* * -;* Revision history: * -;* * -;* 2004.05.17 portab mc_copy_w4/8/16 (CM) * -;* * -;***************************************************************************** - -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -;============================================================================= -; Constants -;============================================================================= - -SECTION_RODATA - -ALIGN 16 -pw_4: times 4 dw 4 -pw_8: times 4 dw 8 -pw_32: times 4 dw 32 -pw_64: times 4 dw 64 - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -;============================================================================= -; pixel avg -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src, int src_stride, -; int height ); -;----------------------------------------------------------------------------- -%macro AVG_START 1 -cglobal %1 - push ebx - mov eax, [esp+12] ; dst - mov ebx, [esp+16] ; dst_stride - mov ecx, [esp+20] ; src - mov edx, [esp+24] ; src_stride - ; esi = height -.height_loop: -%endmacro - -%macro AVG_END 0 - sub esi, 2 - lea eax, [eax+ebx*2] - lea ecx, [ecx+edx*2] - jg .height_loop - pop ebx - pop esi - ret -%endmacro - -AVG_START x264_pixel_avg_w4_mmxext - movd mm0, [ecx] - movd mm1, [ecx+edx] - pavgb mm0, [eax] - pavgb mm1, [eax+ebx] - movd [eax], mm0 - movd [eax+ebx], mm1 -AVG_END - -AVG_START x264_pixel_avg_w8_mmxext - movq mm0, [ecx] - movq mm1, [ecx+edx] - pavgb mm0, [eax] - pavgb mm1, [eax+ebx] - movq [eax], mm0 - movq [eax+ebx], mm1 -AVG_END - -AVG_START x264_pixel_avg_w16_mmxext - movq mm0, [ecx] - movq mm1, [ecx+8] - movq mm2, [ecx+edx] - movq mm3, [ecx+edx+8] - pavgb mm0, [eax] - pavgb mm1, [eax+8] - pavgb mm2, [eax+ebx] - pavgb mm3, [eax+ebx+8] - movq [eax], mm0 - movq [eax+8], mm1 - movq [eax+ebx], mm2 - movq [eax+ebx+8], mm3 -AVG_END - -AVG_START x264_pixel_avg_w16_sse2 - movdqu xmm0, [ecx] - movdqu xmm1, [ecx+edx] - pavgb xmm0, [eax] - pavgb xmm1, [eax+ebx] - movdqa [eax], xmm0 - movdqa [eax+ebx], xmm1 -AVG_END - -%macro AVGH 2 -cglobal x264_pixel_avg_%1x%2_mmxext - push esi - mov esi, %2 - jmp x264_pixel_avg_w%1_mmxext -%endmacro - -AVGH 16, 16 -AVGH 16, 8 -AVGH 8, 16 -AVGH 8, 8 -AVGH 8, 4 -AVGH 4, 8 -AVGH 4, 4 -AVGH 4, 2 - -%macro AVG2_START 1 -cglobal %1 - push ebx - push esi - push edi - push ebp - mov eax, [esp+20] ; dst - mov ebx, [esp+24] ; dst_stride - mov ecx, [esp+28] ; src1 - mov edx, [esp+32] ; src_stride - mov edi, [esp+36] ; src2 - mov esi, [esp+40] ; height - sub edi, ecx - lea ebp, [edi+edx] -.height_loop: -%endmacro - -%macro AVG2_END 0 - sub esi, 2 - lea eax, [eax+ebx*2] - lea ecx, [ecx+edx*2] - jg .height_loop - pop ebp - pop edi - pop esi - pop ebx - ret -%endmacro - -AVG2_START x264_pixel_avg2_w4_mmxext - movd mm0, [ecx] - movd mm1, [ecx+edx] - pavgb mm0, [ecx+edi] - pavgb mm1, [ecx+ebp] - movd [eax], mm0 - movd [eax+ebx], mm1 -AVG2_END - -AVG2_START 
x264_pixel_avg2_w8_mmxext - movq mm0, [ecx] - movq mm1, [ecx+edx] - pavgb mm0, [ecx+edi] - pavgb mm1, [ecx+ebp] - movq [eax], mm0 - movq [eax+ebx], mm1 -AVG2_END - -AVG2_START x264_pixel_avg2_w16_mmxext - movq mm0, [ecx] - movq mm1, [ecx+8] - movq mm2, [ecx+edx] - movq mm3, [ecx+edx+8] - pavgb mm0, [ecx+edi] - pavgb mm1, [ecx+edi+8] - pavgb mm2, [ecx+ebp] - pavgb mm3, [ecx+ebp+8] - movq [eax], mm0 - movq [eax+8], mm1 - movq [eax+ebx], mm2 - movq [eax+ebx+8], mm3 -AVG2_END - -AVG2_START x264_pixel_avg2_w20_mmxext - movq mm0, [ecx] - movq mm1, [ecx+8] - movd mm2, [ecx+16] - movq mm3, [ecx+edx] - movq mm4, [ecx+edx+8] - movd mm5, [ecx+edx+16] - pavgb mm0, [ecx+edi] - pavgb mm1, [ecx+edi+8] - pavgb mm2, [ecx+edi+16] - pavgb mm3, [ecx+ebp] - pavgb mm4, [ecx+ebp+8] - pavgb mm5, [ecx+ebp+16] - movq [eax], mm0 - movq [eax+8], mm1 - movd [eax+16], mm2 - movq [eax+ebx], mm3 - movq [eax+ebx+8], mm4 - movd [eax+ebx+16], mm5 -AVG2_END - - - -;============================================================================= -; weighted prediction -;============================================================================= -; implicit bipred only: -; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 - -%macro BIWEIGHT_4P_MMX 2 - movd mm0, %1 - movd mm1, %2 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - pmullw mm0, mm4 - pmullw mm1, mm5 - paddw mm0, mm1 - paddw mm0, mm6 - psraw mm0, 6 - pmaxsw mm0, mm7 - packuswb mm0, mm0 - movd %1, mm0 -%endmacro - -%macro BIWEIGHT_START_MMX 0 - push edi - push esi - picgetgot ecx - movq mm5, [pw_64 GLOBAL] - movq mm6, [pw_32 GLOBAL] ; rounding - mov edi, [esp+12] ; dst - mov esi, [esp+16] ; i_dst - mov edx, [esp+20] ; src - mov ecx, [esp+24] ; i_src - pshufw mm4, [esp+28], 0 ; weight_dst - pxor mm7, mm7 - psubw mm5, mm4 ; weight_src -%endmacro - -%macro BIWEIGHT_END_MMX 0 - pop esi - pop edi - ret -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w16_mmxext - BIWEIGHT_START_MMX - mov eax, [esp+32] ; i_height - ALIGN 4 - .height_loop - - BIWEIGHT_4P_MMX [edi ], [edx ] - BIWEIGHT_4P_MMX [edi+ 4], [edx+ 4] - BIWEIGHT_4P_MMX [edi+ 8], [edx+ 8] - BIWEIGHT_4P_MMX [edi+12], [edx+12] - - add edi, esi - add edx, ecx - dec eax - jg .height_loop - BIWEIGHT_END_MMX - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w8_mmxext - BIWEIGHT_START_MMX - mov eax, [esp+32] - ALIGN 4 - .height_loop - - BIWEIGHT_4P_MMX [edi ], [edx ] - BIWEIGHT_4P_MMX [edi+4 ], [edx+4 ] - BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ] - BIWEIGHT_4P_MMX [edi+esi+4], [edx+ecx+4] - - lea edi, [edi+esi*2] - lea edx, [edx+ecx*2] - sub eax, byte 2 - jg .height_loop - BIWEIGHT_END_MMX - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_4x4_mmxext - BIWEIGHT_START_MMX - BIWEIGHT_4P_MMX [edi ], [edx ] - BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ] - BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2] - add edi, esi - add edx, ecx - 
BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2] - BIWEIGHT_END_MMX - - - -;============================================================================= -; pixel copy -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmx( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w4_mmx - push ebx - push esi - push edi - - mov esi, [esp+24] ; src - mov edi, [esp+16] ; dst - mov ebx, [esp+28] ; i_src_stride - mov edx, [esp+20] ; i_dst_stride - mov ecx, [esp+32] ; i_height -ALIGN 4 -.height_loop - mov eax, [esi] - mov [edi], eax - mov eax, [esi+ebx] - mov [edi+edx], eax - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - dec ecx - dec ecx - jg .height_loop - - pop edi - pop esi - pop ebx - ret - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w8_mmx( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w8_mmx - push ebx - push esi - push edi - - mov esi, [esp+24] ; src - mov edi, [esp+16] ; dst - mov ebx, [esp+28] ; i_src_stride - mov edx, [esp+20] ; i_dst_stride - mov ecx, [esp+32] ; i_height -ALIGN 4 -.height_loop - movq mm0, [esi] - movq [edi], mm0 - movq mm1, [esi+ebx] - movq [edi+edx], mm1 - movq mm2, [esi+ebx*2] - movq [edi+edx*2], mm2 - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - movq mm3, [esi+ebx] - movq [edi+edx], mm3 - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - - sub ecx, byte 4 - jg .height_loop - - pop edi - pop esi - pop ebx - ret - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_mmx( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w16_mmx - push ebx - push esi - push edi - - mov esi, [esp+24] ; src - mov edi, [esp+16] ; dst - mov ebx, [esp+28] ; i_src_stride - mov edx, [esp+20] ; i_dst_stride - mov ecx, [esp+32] ; i_height - -ALIGN 4 -.height_loop - movq mm0, [esi] - movq mm1, [esi+8] - movq [edi], mm0 - movq [edi+8], mm1 - movq mm2, [esi+ebx] - movq mm3, [esi+ebx+8] - movq [edi+edx], mm2 - movq [edi+edx+8], mm3 - movq mm4, [esi+ebx*2] - movq mm5, [esi+ebx*2+8] - movq [edi+edx*2], mm4 - movq [edi+edx*2+8], mm5 - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - movq mm6, [esi+ebx] - movq mm7, [esi+ebx+8] - movq [edi+edx], mm6 - movq [edi+edx+8], mm7 - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - sub ecx, byte 4 - jg .height_loop - - pop edi - pop esi - pop ebx - ret - - -;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w16_sse2 - push ebx - push esi - push edi - - mov esi, [esp+24] ; src - mov edi, [esp+16] ; dst - mov ebx, [esp+28] ; i_src_stride - mov edx, [esp+20] ; i_dst_stride - mov ecx, [esp+32] ; i_height - -ALIGN 4 -.height_loop - movdqu xmm0, [esi] - movdqu xmm1, [esi+ebx] - movdqu [edi], xmm0 - movdqu [edi+edx], xmm1 - dec ecx - dec ecx - lea esi, [esi+ebx*2] - lea edi, [edi+edx*2] - jg .height_loop - - pop edi - 
pop esi - pop ebx - ret - - - -;============================================================================= -; chroma MC -;============================================================================= - -;----------------------------------------------------------------------------- -; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, -; int dx, int dy, -; int i_width, int i_height ) -;----------------------------------------------------------------------------- - -cglobal x264_mc_chroma_mmxext - picpush ebx - picgetgot ebx - push edi - - mov ecx, [picesp+4+24] - mov edx, [picesp+4+20] - mov eax, ecx - mov edi, edx - sar ecx, 3 - sar edx, 3 - imul ecx, [picesp+4+16] - add ecx, edx - add [picesp+4+12], ecx ; src += (dx>>3) + (dy>>3) * src_stride - - pxor mm3, mm3 - - and edi, 7 - and eax, 7 - movd mm5, edi - movd mm6, eax - pshufw mm5, mm5, 0 ; mm5 = dx&7 - pshufw mm6, mm6, 0 ; mm6 = dy&7 - - movq mm4, [pw_8 GLOBAL] - movq mm0, mm4 - - psubw mm4, mm5 ; mm4 = 8-dx - psubw mm0, mm6 ; mm0 = 8-dy - - movq mm7, mm5 - pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB - pmullw mm7, mm6 ; mm7 = dx*dy = cD - pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC - pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA - - mov eax, [picesp+4+12] ; src - mov edi, [picesp+4+4] ; dst - mov ecx, [picesp+4+16] ; i_src_stride - mov edx, [picesp+4+32] ; i_height - -ALIGN 4 -.height_loop - - movd mm1, [eax+ecx] - movd mm0, [eax] - punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw mm0, mm3 - pmullw mm1, mm6 ; 2nd line * cC - pmullw mm0, mm4 ; 1st line * cA - - paddw mm0, mm1 ; mm0 <- result - - movd mm2, [eax+1] - movd mm1, [eax+ecx+1] - punpcklbw mm2, mm3 - punpcklbw mm1, mm3 - - paddw mm0, [pw_32 GLOBAL] - - pmullw mm2, mm5 ; line * cB - pmullw mm1, mm7 ; line * cD - paddw mm0, mm2 - paddw mm0, mm1 - - psrlw mm0, 6 - packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4 - movd [edi], mm0 - - add eax, ecx - add edi, [picesp+4+8] - - dec edx - jnz .height_loop - - sub [picesp+4+28], dword 8 - jnz .finish ; width != 8 so assume 4 - - mov edi, [picesp+4+4] ; dst - mov eax, [picesp+4+12] ; src - mov edx, [picesp+4+32] ; i_height - add edi, 4 - add eax, 4 - jmp .height_loop - -.finish - pop edi - picpop ebx - ret - - - -; prefetches tuned for 64 byte cachelines (K7/K8/Core2) -; TODO add 32 and 128 byte versions for P3/P4 - -;----------------------------------------------------------------------------- -; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) -;----------------------------------------------------------------------------- -cglobal x264_prefetch_fenc_mmxext - mov eax, [esp+20] - mov ecx, [esp+8] - mov edx, [esp+4] - and eax, 3 - imul eax, ecx - lea edx, [edx+eax*4+64] - prefetcht0 [edx] - prefetcht0 [edx+ecx] - lea edx, [edx+ecx*2] - prefetcht0 [edx] - prefetcht0 [edx+ecx] - - mov eax, [esp+20] - mov ecx, [esp+16] - mov edx, [esp+12] - and eax, 6 - imul eax, ecx - lea edx, [edx+eax+64] - prefetcht0 [edx] - prefetcht0 [edx+ecx] - ret - -;----------------------------------------------------------------------------- -; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) -;----------------------------------------------------------------------------- -cglobal x264_prefetch_ref_mmxext - mov eax, [esp+12] - mov ecx, [esp+8] - mov edx, [esp+4] - sub eax, 1 - and eax, ecx - lea edx, [edx+eax*8+64] - lea eax, [ecx*3] - prefetcht0 [edx] - prefetcht0 [edx+ecx] - prefetcht0 [edx+ecx*2] - prefetcht0 [edx+eax] - lea edx, 
[edx+ecx*4] - prefetcht0 [edx] - prefetcht0 [edx+ecx] - prefetcht0 [edx+ecx*2] - prefetcht0 [edx+eax] - ret diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm deleted file mode 100644 index 21f4c7c9..00000000 --- a/common/i386/pixel-a.asm +++ /dev/null @@ -1,1835 +0,0 @@ -;***************************************************************************** -;* pixel.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ -;* -;* Authors: Laurent Aimar -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. -;***************************************************************************** - -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -; sad - -%macro SAD_INC_2x16P 0 - movq mm1, [eax] - movq mm2, [eax+8] - movq mm3, [eax+ebx] - movq mm4, [eax+ebx+8] - psadbw mm1, [ecx] - psadbw mm2, [ecx+8] - psadbw mm3, [ecx+edx] - psadbw mm4, [ecx+edx+8] - lea eax, [eax+2*ebx] - paddw mm1, mm2 - paddw mm3, mm4 - lea ecx, [ecx+2*edx] - paddw mm0, mm1 - paddw mm0, mm3 -%endmacro - -%macro SAD_INC_2x8P 0 - movq mm1, [eax] - movq mm2, [eax+ebx] - psadbw mm1, [ecx] - psadbw mm2, [ecx+edx] - lea eax, [eax+2*ebx] - paddw mm0, mm1 - paddw mm0, mm2 - lea ecx, [ecx+2*edx] -%endmacro - -%macro SAD_INC_2x4P 0 - movd mm1, [eax] - movd mm2, [ecx] - punpckldq mm1, [eax+ebx] - punpckldq mm2, [ecx+edx] - psadbw mm1, mm2 - paddw mm0, mm1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] -%endmacro - -; sad x3 / x4 - -%macro SAD_X3_START 0 - push edi - push esi - mov edi, [esp+12] - mov eax, [esp+16] - mov ecx, [esp+20] - mov edx, [esp+24] - mov esi, [esp+28] -%endmacro - -%macro SAD_X3_START_1x8P 0 - movq mm3, [edi] - movq mm0, [eax] - movq mm1, [ecx] - movq mm2, [edx] - psadbw mm0, mm3 - psadbw mm1, mm3 - psadbw mm2, mm3 -%endmacro - -%macro SAD_X3_1x8P 2 - movq mm3, [edi+%1] - movq mm4, [eax+%2] - movq mm5, [ecx+%2] - movq mm6, [edx+%2] - psadbw mm4, mm3 - psadbw mm5, mm3 - psadbw mm6, mm3 - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 -%endmacro - -%macro SAD_X3_START_2x4P 3 - movd mm3, [edi] - movd %1, [eax] - movd %2, [ecx] - movd %3, [edx] - punpckldq mm3, [edi+FENC_STRIDE] - punpckldq %1, [eax+esi] - punpckldq %2, [ecx+esi] - punpckldq %3, [edx+esi] - psadbw %1, mm3 - psadbw %2, mm3 - psadbw %3, mm3 -%endmacro - -%macro SAD_X3_2x16P 1 -%if %1 - SAD_X3_START - SAD_X3_START_1x8P -%else - SAD_X3_1x8P 0, 0 -%endif - SAD_X3_1x8P 8, 8 - SAD_X3_1x8P FENC_STRIDE, esi - SAD_X3_1x8P FENC_STRIDE+8, esi+8 - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X3_2x8P 1 
-%if %1 - SAD_X3_START - SAD_X3_START_1x8P -%else - SAD_X3_1x8P 0, 0 -%endif - SAD_X3_1x8P FENC_STRIDE, esi - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X3_2x4P 1 -%if %1 - SAD_X3_START - SAD_X3_START_2x4P mm0, mm1, mm2 -%else - SAD_X3_START_2x4P mm4, mm5, mm6 - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 -%endif - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X4_START 0 - push edi - push esi - push ebx - mov edi, [esp+16] - mov eax, [esp+20] - mov ebx, [esp+24] - mov ecx, [esp+28] - mov edx, [esp+32] - mov esi, [esp+36] -%endmacro - -%macro SAD_X4_START_1x8P 0 - movq mm7, [edi] - movq mm0, [eax] - movq mm1, [ebx] - movq mm2, [ecx] - movq mm3, [edx] - psadbw mm0, mm7 - psadbw mm1, mm7 - psadbw mm2, mm7 - psadbw mm3, mm7 -%endmacro - -%macro SAD_X4_1x8P 2 - movq mm7, [edi+%1] - movq mm4, [eax+%2] - movq mm5, [ebx+%2] - movq mm6, [ecx+%2] - psadbw mm4, mm7 - psadbw mm5, mm7 - psadbw mm6, mm7 - psadbw mm7, [edx+%2] - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm6 - paddw mm3, mm7 -%endmacro - -%macro SAD_X4_START_2x4P 0 - movd mm7, [edi] - movd mm0, [eax] - movd mm1, [ebx] - movd mm2, [ecx] - movd mm3, [edx] - punpckldq mm7, [edi+FENC_STRIDE] - punpckldq mm0, [eax+esi] - punpckldq mm1, [ebx+esi] - punpckldq mm2, [ecx+esi] - punpckldq mm3, [edx+esi] - psadbw mm0, mm7 - psadbw mm1, mm7 - psadbw mm2, mm7 - psadbw mm3, mm7 -%endmacro - -%macro SAD_X4_INC_2x4P 0 - movd mm7, [edi] - movd mm4, [eax] - movd mm5, [ebx] - punpckldq mm7, [edi+FENC_STRIDE] - punpckldq mm4, [eax+esi] - punpckldq mm5, [ebx+esi] - psadbw mm4, mm7 - psadbw mm5, mm7 - paddw mm0, mm4 - paddw mm1, mm5 - movd mm4, [ecx] - movd mm5, [edx] - punpckldq mm4, [ecx+esi] - punpckldq mm5, [edx+esi] - psadbw mm4, mm7 - psadbw mm5, mm7 - paddw mm2, mm4 - paddw mm3, mm5 -%endmacro - -%macro SAD_X4_2x16P 1 -%if %1 - SAD_X4_START - SAD_X4_START_1x8P -%else - SAD_X4_1x8P 0, 0 -%endif - SAD_X4_1x8P 8, 8 - SAD_X4_1x8P FENC_STRIDE, esi - SAD_X4_1x8P FENC_STRIDE+8, esi+8 - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ebx, [ebx+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X4_2x8P 1 -%if %1 - SAD_X4_START - SAD_X4_START_1x8P -%else - SAD_X4_1x8P 0, 0 -%endif - SAD_X4_1x8P FENC_STRIDE, esi - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ebx, [ebx+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X4_2x4P 1 -%if %1 - SAD_X4_START - SAD_X4_START_2x4P -%else - SAD_X4_INC_2x4P -%endif - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ebx, [ebx+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X3_END 0 - mov eax, [esp+32] - movd [eax+0], mm0 - movd [eax+4], mm1 - movd [eax+8], mm2 - pop esi - pop edi - ret -%endmacro - -%macro SAD_X4_END 0 - mov eax, [esp+40] - movd [eax+0], mm0 - movd [eax+4], mm1 - movd [eax+8], mm2 - movd [eax+12], mm3 - pop ebx - pop esi - pop edi - ret -%endmacro - -; ssd - -%macro SSD_INC_1x16P 0 - movq mm1, [eax] - movq mm2, [ecx] - movq mm3, [eax+8] - movq mm4, [ecx+8] - - movq mm5, mm2 - movq mm6, mm4 - psubusb mm2, mm1 - psubusb mm4, mm3 - psubusb mm1, mm5 - psubusb mm3, mm6 - por mm1, mm2 - por mm3, mm4 - - movq mm2, mm1 - movq mm4, mm3 - punpcklbw mm1, mm7 - punpcklbw mm3, mm7 - punpckhbw mm2, mm7 - punpckhbw mm4, mm7 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - pmaddwd mm4, mm4 - - add eax, ebx - add ecx, edx - paddd mm0, mm1 - paddd mm0, mm2 - paddd mm0, mm3 
- paddd mm0, mm4 -%endmacro - -%macro SSD_INC_1x8P 0 - movq mm1, [eax] - movq mm2, [ecx] - - movq mm5, mm2 - psubusb mm2, mm1 - psubusb mm1, mm5 - por mm1, mm2 ; mm1 = 8bit abs diff - - movq mm2, mm1 - punpcklbw mm1, mm7 - punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - - add eax, ebx - add ecx, edx - paddd mm0, mm1 - paddd mm0, mm2 -%endmacro - -%macro SSD_INC_1x4P 0 - movd mm1, [eax] - movd mm2, [ecx] - - movq mm5, mm2 - psubusb mm2, mm1 - psubusb mm1, mm5 - por mm1, mm2 - punpcklbw mm1, mm7 - pmaddwd mm1, mm1 - - add eax, ebx - add ecx, edx - paddd mm0, mm1 -%endmacro - -; satd - -%macro SUMSUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro HADAMARD4x4 4 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %1, %3, %2, %4 -%endmacro - -%macro SBUTTERFLYwd 3 - movq %3, %1 - punpcklwd %1, %2 - punpckhwd %3, %2 -%endmacro - -%macro SBUTTERFLYdq 3 - movq %3, %1 - punpckldq %1, %2 - punpckhdq %3, %2 -%endmacro - -%macro TRANSPOSE4x4 5 ; abcd-t -> adtc - SBUTTERFLYwd %1, %2, %5 - SBUTTERFLYwd %3, %4, %2 - SBUTTERFLYdq %1, %3, %4 - SBUTTERFLYdq %5, %2, %3 -%endmacro - -%macro MMX_ABS 2 ; mma, tmp - pxor %2, %2 - psubw %2, %1 - pmaxsw %1, %2 -%endmacro - -%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1 - pxor %3, %3 - pxor %4, %4 - psubw %3, %1 - psubw %4, %2 - pmaxsw %1, %3 - pmaxsw %2, %4 -%endmacro - -%macro HADAMARD4x4_SUM 1 ; %1 - dest (row sum of one block) - HADAMARD4x4 mm4, mm5, mm6, mm7 - TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1 - HADAMARD4x4 mm4, mm7, %1, mm6 - MMX_ABS_TWO mm4, mm7, mm3, mm5 - MMX_ABS_TWO %1, mm6, mm3, mm5 - paddw %1, mm4 - paddw mm6, mm7 - pavgw %1, mm6 -%endmacro - -%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy - movd %1, [eax+ebx*%4+%3] - movd %2, [ecx+edx*%4+%3] - punpcklbw %1, %2 - punpcklbw %2, %2 - psubw %1, %2 -%endmacro - -; in: %2 = horizontal offset -; in: %3 = whether we need to increment pix1 and pix2 -; clobber: mm3..mm7 -; out: %1 = satd -%macro LOAD_DIFF_HADAMARD_SUM 3 -%if %3 - LOAD_DIFF_4P mm4, mm3, %2, 0 - LOAD_DIFF_4P mm5, mm3, %2, 1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - LOAD_DIFF_4P mm6, mm3, %2, 0 - LOAD_DIFF_4P mm7, mm3, %2, 1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] -%else - LOAD_DIFF_4P mm4, mm3, %2, 0 - LOAD_DIFF_4P mm6, mm3, %2, 2 - add eax, ebx - add ecx, edx - LOAD_DIFF_4P mm5, mm3, %2, 0 - LOAD_DIFF_4P mm7, mm3, %2, 2 -%endif - HADAMARD4x4_SUM %1 -%endmacro - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -%macro SAD_START 0 - push ebx - - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 - - pxor mm0, mm0 -%endmacro -%macro SAD_END 0 - movd eax, mm0 - - pop ebx - ret -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SAD 2 -cglobal x264_pixel_sad_%1x%2_mmxext - SAD_START -%rep %2/2 - SAD_INC_2x%1P -%endrep - SAD_END -%endmacro - -SAD 16, 16 -SAD 16, 8 -SAD 8, 16 -SAD 8, 8 -SAD 8, 4 -SAD 4, 8 -SAD 4, 4 - -;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) 
-;----------------------------------------------------------------------------- -%macro SAD_X 3 -cglobal x264_pixel_sad_x%1_%2x%3_mmxext - SAD_X%1_2x%2P 1 -%rep %3/2-1 - SAD_X%1_2x%2P 0 -%endrep - SAD_X%1_END -%endmacro - -SAD_X 3, 16, 16 -SAD_X 3, 16, 8 -SAD_X 3, 8, 16 -SAD_X 3, 8, 8 -SAD_X 3, 8, 4 -SAD_X 3, 4, 8 -SAD_X 3, 4, 4 -SAD_X 4, 16, 16 -SAD_X 4, 16, 8 -SAD_X 4, 8, 16 -SAD_X 4, 8, 8 -SAD_X 4, 8, 4 -SAD_X 4, 4, 8 -SAD_X 4, 4, 4 - - -%macro SSD_START 0 - push ebx - - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 - - pxor mm7, mm7 ; zero - pxor mm0, mm0 ; mm0 holds the sum -%endmacro - -%macro SSD_END 0 - movq mm1, mm0 - psrlq mm1, 32 - paddd mm0, mm1 - movd eax, mm0 - - pop ebx - ret -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SSD 2 -cglobal x264_pixel_ssd_%1x%2_mmx - SSD_START -%rep %2 - SSD_INC_1x%1P -%endrep - SSD_END -%endmacro - -SSD 16, 16 -SSD 16, 8 -SSD 8, 16 -SSD 8, 8 -SSD 8, 4 -SSD 4, 8 -SSD 4, 4 - - - -%macro SATD_START 0 - push ebx - - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 -%endmacro - -%macro SATD_END 0 - pshufw mm1, mm0, 01001110b - paddw mm0, mm1 - pshufw mm1, mm0, 10110001b - paddw mm0, mm1 - movd eax, mm0 - and eax, 0xffff - pop ebx - ret -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_4x4_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_4x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x4_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 0 - sub eax, ebx - sub ecx, edx - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x8_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - 
LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 8, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 12, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 0 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x16_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm2, 0, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - paddw mm0, mm1 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x16_mmxext - SATD_START - LOAD_DIFF_HADAMARD_SUM mm0, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 1 - LOAD_DIFF_HADAMARD_SUM mm2, 0, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 0, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 1 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 4, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 4, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 8, 1 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 8, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 8, 0 - paddw mm0, mm2 - - mov eax, [esp+ 8] ; pix1 - mov ecx, [esp+16] ; pix2 - LOAD_DIFF_HADAMARD_SUM mm2, 12, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 1 - paddw mm0, mm2 - LOAD_DIFF_HADAMARD_SUM mm2, 12, 1 - paddw mm0, mm1 - LOAD_DIFF_HADAMARD_SUM mm1, 12, 0 - paddw mm0, mm2 - paddw mm0, mm1 - - pxor mm3, mm3 - pshufw mm1, mm0, 01001110b - paddw mm0, mm1 - punpcklwd mm0, mm3 - pshufw mm1, mm0, 01001110b - paddd mm0, mm1 - movd eax, mm0 - pop ebx - ret - - -%macro LOAD_DIFF_4x8P 1 ; dx - LOAD_DIFF_4P mm0, mm7, %1, 0 - LOAD_DIFF_4P mm1, mm7, %1, 1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - LOAD_DIFF_4P mm2, mm7, %1, 0 - LOAD_DIFF_4P mm3, mm7, %1, 1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - LOAD_DIFF_4P mm4, mm7, %1, 0 - LOAD_DIFF_4P mm5, mm7, %1, 1 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - LOAD_DIFF_4P mm6, mm7, %1, 0 - movq [spill], mm6 - LOAD_DIFF_4P mm7, mm6, %1, 1 - movq mm6, [spill] -%endmacro - -%macro HADAMARD1x8 8 - SUMSUB_BADC %1, %5, %2, %6 - SUMSUB_BADC %3, %7, %4, %8 - SUMSUB_BADC %1, %3, %2, %4 - SUMSUB_BADC %5, %7, %6, %8 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %5, %6, %7, %8 -%endmacro - -%macro SUM4x8_MM 0 - movq [spill], mm6 - movq [spill+8], mm7 - MMX_ABS_TWO mm0, mm1, mm6, mm7 - MMX_ABS_TWO mm2, mm3, mm6, mm7 - 
paddw mm0, mm2 - paddw mm1, mm3 - movq mm6, [spill] - movq mm7, [spill+8] - MMX_ABS_TWO mm4, mm5, mm2, mm3 - MMX_ABS_TWO mm6, mm7, mm2, mm3 - paddw mm4, mm6 - paddw mm5, mm7 - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm0, mm1 -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sa8d_8x8_mmxext - SATD_START - sub esp, 0x70 -%define args esp+0x74 -%define spill esp+0x60 ; +16 -%define trans esp+0 ; +96 - LOAD_DIFF_4x8P 0 - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - - movq [spill], mm0 - TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq [trans+0x00], mm4 - movq [trans+0x08], mm7 - movq [trans+0x10], mm0 - movq [trans+0x18], mm6 - movq mm0, [spill] - TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 - movq [trans+0x20], mm0 - movq [trans+0x28], mm3 - movq [trans+0x30], mm4 - movq [trans+0x38], mm2 - - mov eax, [args+4] - mov ecx, [args+12] - LOAD_DIFF_4x8P 4 - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - - movq [spill], mm7 - TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7 - movq [trans+0x40], mm0 - movq [trans+0x48], mm3 - movq [trans+0x50], mm7 - movq [trans+0x58], mm2 - movq mm7, [spill] - TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq mm5, [trans+0x00] - movq mm1, [trans+0x08] - movq mm2, [trans+0x10] - movq mm3, [trans+0x18] - - HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 - SUM4x8_MM - movq [trans], mm0 - - movq mm0, [trans+0x20] - movq mm1, [trans+0x28] - movq mm2, [trans+0x30] - movq mm3, [trans+0x38] - movq mm4, [trans+0x40] - movq mm5, [trans+0x48] - movq mm6, [trans+0x50] - movq mm7, [trans+0x58] - - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - SUM4x8_MM - - pavgw mm0, [esp] - pshufw mm1, mm0, 01001110b - paddw mm0, mm1 - pshufw mm1, mm0, 10110001b - paddw mm0, mm1 - movd eax, mm0 - and eax, 0xffff - mov ecx, eax ; preserve rounding for 16x16 - add eax, 1 - shr eax, 1 - add esp, 0x70 - pop ebx - ret -%undef args -%undef spill -%undef trans - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -;; violates calling convention -cglobal x264_pixel_sa8d_16x16_mmxext - push esi - push edi - push ebp - mov esi, [esp+28] ; stride2 - mov edi, [esp+20] ; stride1 - push esi - push dword [esp+28] ; pix2 - push edi - push dword [esp+28] ; pix1 - call x264_pixel_sa8d_8x8_mmxext - mov ebp, ecx - shl edi, 3 - shl esi, 3 - add [esp+0], edi ; pix1+8*stride1 - add [esp+8], esi ; pix2+8*stride2 - call x264_pixel_sa8d_8x8_mmxext - add ebp, ecx - add dword [esp+0], 8 ; pix1+8*stride1+8 - add dword [esp+8], 8 ; pix2+8*stride2+8 - call x264_pixel_sa8d_8x8_mmxext - add ebp, ecx - sub [esp+0], edi ; pix1+8 - sub [esp+8], esi ; pix2+8 - call x264_pixel_sa8d_8x8_mmxext - lea eax, [ebp+ecx+1] - shr eax, 1 - add esp, 16 - pop ebp - pop edi - pop esi - ret - - -; in: fenc -; out: mm0..mm3 = hadamard coefs -%macro LOAD_HADAMARD 1 - pxor mm7, mm7 - movd mm0, [%1+0*FENC_STRIDE] - movd mm4, [%1+1*FENC_STRIDE] - movd mm3, [%1+2*FENC_STRIDE] - movd mm1, [%1+3*FENC_STRIDE] - punpcklbw mm0, mm7 - punpcklbw mm4, mm7 - punpcklbw mm3, mm7 - punpcklbw mm1, mm7 - HADAMARD4x4 mm0, mm4, mm3, mm1 - TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2 - HADAMARD4x4 mm0, mm1, mm2, mm3 -%endmacro - -%macro SCALAR_SUMSUB 4 - add %1, %2 - add %3, %4 - add 
%2, %2 - add %4, %4 - sub %2, %1 - sub %4, %3 -%endmacro - -%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op - pxor %7, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b - paddusw %1, %4 - paddusw %2, %5 - paddusw %3, %6 - punpcklwd %1, %7 - punpcklwd %2, %7 - punpcklwd %3, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b - %8 %1, %4 - %8 %2, %5 - %8 %3, %6 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_4x4_mmxext - push ebx - push edi - push esi - sub esp, 16 -%define args esp+32 -%define top_1d esp+8 ; +8 -%define left_1d esp+0 ; +8 - - mov eax, [args+0] ; fenc - LOAD_HADAMARD eax - - mov edi, [args+4] ; fdec - movzx eax, byte [edi-1+0*FDEC_STRIDE] - movzx ebx, byte [edi-1+1*FDEC_STRIDE] - movzx ecx, byte [edi-1+2*FDEC_STRIDE] - movzx edx, byte [edi-1+3*FDEC_STRIDE] - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx ; 1x4 hadamard - mov [left_1d+0], ax - mov [left_1d+2], bx - mov [left_1d+4], cx - mov [left_1d+6], dx - mov esi, eax ; dc - - movzx eax, byte [edi-FDEC_STRIDE+0] - movzx ebx, byte [edi-FDEC_STRIDE+1] - movzx ecx, byte [edi-FDEC_STRIDE+2] - movzx edx, byte [edi-FDEC_STRIDE+3] - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx ; 4x1 hadamard - mov [top_1d+0], ax - mov [top_1d+2], bx - mov [top_1d+4], cx - mov [top_1d+6], dx - lea esi, [esi + eax + 4] ; dc - and esi, -8 - shl esi, 1 - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d] - movd mm5, esi - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - paddw mm4, mm7 - paddw mm5, mm7 - movq mm1, mm5 - psrlq mm1, 16 ; 4x3 sum - paddw mm0, mm1 - - SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw - mov eax, [args+8] ; res - movd [eax+0], mm0 ; i4x4_v satd - movd [eax+4], mm4 ; i4x4_h satd - movd [eax+8], mm5 ; i4x4_dc satd - - add esp, 16 - pop esi - pop edi - pop ebx - ret - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_16x16_mmxext - push ebx - push ebp - push edi - push esi - sub esp, 88 -%define args esp+108 -%define sums esp+64 ; +24 -%define top_1d esp+32 ; +32 -%define left_1d esp+0 ; +32 - - pxor mm0, mm0 - movq [sums+0], mm0 - movq [sums+8], mm0 - movq [sums+16], mm0 - - ; 1D hadamards - mov edi, [args+4] ; fdec - xor ebp, ebp - mov esi, 12 -.loop_edge: - ; left - shl esi, 5 ; log(FDEC_STRIDE) - movzx eax, byte [edi+esi-1+0*FDEC_STRIDE] - movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE] - movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE] - movzx edx, byte [edi+esi-1+3*FDEC_STRIDE] - shr esi, 5 - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx - add ebp, eax - mov [left_1d+2*esi+0], ax - mov [left_1d+2*esi+2], bx - mov [left_1d+2*esi+4], cx - mov [left_1d+2*esi+6], dx - - ; top - movzx eax, byte [edi+esi-FDEC_STRIDE+0] - movzx ebx, byte 
[edi+esi-FDEC_STRIDE+1] - movzx ecx, byte [edi+esi-FDEC_STRIDE+2] - movzx edx, byte [edi+esi-FDEC_STRIDE+3] - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx - add ebp, eax - mov [top_1d+2*esi+0], ax - mov [top_1d+2*esi+2], bx - mov [top_1d+2*esi+4], cx - mov [top_1d+2*esi+6], dx - sub esi, 4 - jge .loop_edge - - ; dc - shr ebp, 1 - add ebp, 8 - and ebp, -16 - - ; 2D hadamards - mov eax, [args+0] ; fenc - xor edi, edi -.loop_y: - xor esi, esi -.loop_x: - LOAD_HADAMARD eax - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d+8*edi] - movd mm5, ebp - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d+8*esi] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - pavgw mm4, mm7 - pavgw mm5, mm7 - paddw mm0, [sums+0] ; i4x4_v satd - paddw mm4, [sums+8] ; i4x4_h satd - paddw mm5, [sums+16] ; i4x4_dc satd - movq [sums+0], mm0 - movq [sums+8], mm4 - movq [sums+16], mm5 - - add eax, 4 - inc esi - cmp esi, 4 - jl .loop_x - add eax, 4*FENC_STRIDE-16 - inc edi - cmp edi, 4 - jl .loop_y - -; horizontal sum - movq mm2, [sums+16] - movq mm0, [sums+0] - movq mm1, [sums+8] - movq mm7, mm2 - SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd - psrld mm0, 1 - pslld mm7, 16 - psrld mm7, 16 - paddd mm0, mm2 - psubd mm0, mm7 - mov eax, [args+8] ; res - movd [eax+0], mm0 ; i16x16_v satd - movd [eax+4], mm1 ; i16x16_h satd - movd [eax+8], mm2 ; i16x16_dc satd - - add esp, 88 - pop esi - pop edi - pop ebp - pop ebx - ret - -;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_8x8c_mmxext - push ebx - push ebp - push edi - push esi - sub esp, 72 -%define args esp+92 -%define sums esp+48 ; +24 -%define dc_1d esp+32 ; +16 -%define top_1d esp+16 ; +16 -%define left_1d esp+0 ; +16 - - pxor mm0, mm0 - movq [sums+0], mm0 - movq [sums+8], mm0 - movq [sums+16], mm0 - - ; 1D hadamards - mov edi, [args+4] ; fdec - xor ebp, ebp - mov esi, 12 -.loop_edge: - ; left - shl esi, 5 ; log(FDEC_STRIDE) - movzx eax, byte [edi+esi-1+0*FDEC_STRIDE] - movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE] - movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE] - movzx edx, byte [edi+esi-1+3*FDEC_STRIDE] - shr esi, 5 - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx - mov [left_1d+2*esi+0], ax - mov [left_1d+2*esi+2], bx - mov [left_1d+2*esi+4], cx - mov [left_1d+2*esi+6], dx - - ; top - movzx eax, byte [edi+esi-FDEC_STRIDE+0] - movzx ebx, byte [edi+esi-FDEC_STRIDE+1] - movzx ecx, byte [edi+esi-FDEC_STRIDE+2] - movzx edx, byte [edi+esi-FDEC_STRIDE+3] - SCALAR_SUMSUB eax, ebx, ecx, edx - SCALAR_SUMSUB eax, ecx, ebx, edx - mov [top_1d+2*esi+0], ax - mov [top_1d+2*esi+2], bx - mov [top_1d+2*esi+4], cx - mov [top_1d+2*esi+6], dx - sub esi, 4 - jge .loop_edge - - ; dc - movzx eax, word [left_1d+0] - movzx ebx, word [top_1d+0] - movzx ecx, word [left_1d+8] - movzx edx, word [top_1d+8] - add eax, ebx - lea ebx, [ecx + edx] - lea eax, [2*eax + 8] - lea ebx, [2*ebx + 8] - lea ecx, [4*ecx + 8] - lea edx, [4*edx + 8] - and eax, -16 - and ebx, -16 - and ecx, -16 - and edx, -16 - mov [dc_1d+ 0], eax ; tl - mov [dc_1d+ 4], edx ; tr - mov [dc_1d+ 8], ecx ; bl 
- mov [dc_1d+12], ebx ; br - lea ebp, [dc_1d] - - ; 2D hadamards - mov eax, [args+0] ; fenc - xor edi, edi -.loop_y: - xor esi, esi -.loop_x: - LOAD_HADAMARD eax - - movq mm4, mm1 - movq mm5, mm2 - MMX_ABS_TWO mm4, mm5, mm6, mm7 - movq mm7, mm3 - paddw mm4, mm5 - MMX_ABS mm7, mm6 - paddw mm7, mm4 ; 3x4 sum - - movq mm4, [left_1d+8*edi] - movd mm5, [ebp] - psllw mm4, 2 - psubw mm4, mm0 - psubw mm5, mm0 - punpcklwd mm0, mm1 - punpcklwd mm2, mm3 - punpckldq mm0, mm2 ; transpose - movq mm1, [top_1d+8*esi] - psllw mm1, 2 - psubw mm0, mm1 - MMX_ABS mm4, mm3 ; 1x4 sum - MMX_ABS mm5, mm2 ; 1x4 sum - MMX_ABS mm0, mm1 ; 4x1 sum - pavgw mm4, mm7 - pavgw mm5, mm7 - paddw mm0, [sums+16] ; i4x4_v satd - paddw mm4, [sums+8] ; i4x4_h satd - paddw mm5, [sums+0] ; i4x4_dc satd - movq [sums+16], mm0 - movq [sums+8], mm4 - movq [sums+0], mm5 - - add eax, 4 - add ebp, 4 - inc esi - cmp esi, 2 - jl .loop_x - add eax, 4*FENC_STRIDE-8 - inc edi - cmp edi, 2 - jl .loop_y - -; horizontal sum - movq mm0, [sums+0] - movq mm1, [sums+8] - movq mm2, [sums+16] - movq mm6, mm0 - psrlq mm6, 15 - paddw mm2, mm6 - SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm7, paddd - psrld mm2, 1 - mov eax, [args+8] ; res - movd [eax+0], mm0 ; i8x8c_dc satd - movd [eax+4], mm1 ; i8x8c_h satd - movd [eax+8], mm2 ; i8x8c_v satd - - add esp, 72 - pop esi - pop edi - pop ebp - pop ebx - ret - -%macro LOAD_4x8P 1 ; dx - pxor mm7, mm7 - movd mm6, [eax+%1+7*FENC_STRIDE] - movd mm0, [eax+%1+0*FENC_STRIDE] - movd mm1, [eax+%1+1*FENC_STRIDE] - movd mm2, [eax+%1+2*FENC_STRIDE] - movd mm3, [eax+%1+3*FENC_STRIDE] - movd mm4, [eax+%1+4*FENC_STRIDE] - movd mm5, [eax+%1+5*FENC_STRIDE] - punpcklbw mm6, mm7 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - movq [spill], mm6 - punpcklbw mm2, mm7 - punpcklbw mm3, mm7 - movd mm6, [eax+%1+6*FENC_STRIDE] - punpcklbw mm4, mm7 - punpcklbw mm5, mm7 - punpcklbw mm6, mm7 - movq mm7, [spill] -%endmacro - -;----------------------------------------------------------------------------- -; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res ) -;----------------------------------------------------------------------------- -cglobal x264_intra_sa8d_x3_8x8_core_mmxext - mov eax, [esp+4] - mov ecx, [esp+8] - sub esp, 0x70 -%define args esp+0x74 -%define spill esp+0x60 ; +16 -%define trans esp+0 ; +96 -%define sum esp+0 ; +32 - LOAD_4x8P 0 - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - - movq [spill], mm0 - TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq [trans+0x00], mm4 - movq [trans+0x08], mm7 - movq [trans+0x10], mm0 - movq [trans+0x18], mm6 - movq mm0, [spill] - TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 - movq [trans+0x20], mm0 - movq [trans+0x28], mm3 - movq [trans+0x30], mm4 - movq [trans+0x38], mm2 - - LOAD_4x8P 4 - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - - movq [spill], mm7 - TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7 - movq [trans+0x40], mm0 - movq [trans+0x48], mm3 - movq [trans+0x50], mm7 - movq [trans+0x58], mm2 - movq mm7, [spill] - TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq mm5, [trans+0x00] - movq mm1, [trans+0x08] - movq mm2, [trans+0x10] - movq mm3, [trans+0x18] - - HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 - - movq [spill+0], mm5 - movq [spill+8], mm7 - MMX_ABS_TWO mm0, mm1, mm5, mm7 - MMX_ABS_TWO mm2, mm3, mm5, mm7 - paddw mm0, mm2 - paddw mm1, mm3 - paddw mm0, mm1 - MMX_ABS_TWO mm4, mm6, mm2, mm3 - movq mm5, [spill+0] - movq mm7, [spill+8] - paddw mm0, mm4 - paddw mm0, mm6 - MMX_ABS mm7, mm1 - paddw mm0, mm7 ; 7x4 sum - movq mm6, mm5 - movq mm7, [ecx+8] ; left bottom 
- psllw mm7, 3 - psubw mm6, mm7 - MMX_ABS_TWO mm5, mm6, mm2, mm3 - paddw mm5, mm0 - paddw mm6, mm0 - movq [sum+0], mm5 ; dc - movq [sum+8], mm6 ; left - - movq mm0, [trans+0x20] - movq mm1, [trans+0x28] - movq mm2, [trans+0x30] - movq mm3, [trans+0x38] - movq mm4, [trans+0x40] - movq mm5, [trans+0x48] - movq mm6, [trans+0x50] - movq mm7, [trans+0x58] - - HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 - - movd [sum+0x10], mm0 - movd [sum+0x12], mm1 - movd [sum+0x14], mm2 - movd [sum+0x16], mm3 - movd [sum+0x18], mm4 - movd [sum+0x1a], mm5 - movd [sum+0x1c], mm6 - movd [sum+0x1e], mm7 - - movq [spill], mm0 - movq [spill+8], mm1 - MMX_ABS_TWO mm2, mm3, mm0, mm1 - MMX_ABS_TWO mm4, mm5, mm0, mm1 - paddw mm2, mm3 - paddw mm4, mm5 - paddw mm2, mm4 - movq mm0, [spill] - movq mm1, [spill+8] - MMX_ABS_TWO mm6, mm7, mm4, mm5 - MMX_ABS mm1, mm4 - paddw mm2, mm7 - paddw mm1, mm6 - paddw mm2, mm1 ; 7x4 sum - movq mm1, mm0 - - movq mm7, [ecx+0] - psllw mm7, 3 ; left top - - movzx edx, word [ecx+0] - add dx, [ecx+16] - lea edx, [4*edx+32] - and edx, -64 - movd mm6, edx ; dc - - psubw mm1, mm7 - psubw mm0, mm6 - MMX_ABS_TWO mm0, mm1, mm5, mm6 - movq mm3, [sum+0] ; dc - paddw mm0, mm2 - paddw mm1, mm2 - movq mm2, mm0 - paddw mm0, mm3 - paddw mm1, [sum+8] ; h - psrlq mm2, 16 - paddw mm2, mm3 - - movq mm3, [ecx+16] ; top left - movq mm4, [ecx+24] ; top right - psllw mm3, 3 - psllw mm4, 3 - psubw mm3, [sum+16] - psubw mm4, [sum+24] - MMX_ABS_TWO mm3, mm4, mm5, mm6 - paddw mm2, mm3 - paddw mm2, mm4 ; v - - SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd - mov eax, [args+8] - movd ecx, mm2 - movd edx, mm1 - add ecx, 2 - add edx, 2 - shr ecx, 2 - shr edx, 2 - mov [eax+0], ecx ; i8x8_v satd - mov [eax+4], edx ; i8x8_h satd - movd ecx, mm0 - add ecx, 2 - shr ecx, 2 - mov [eax+8], ecx ; i8x8_dc satd - - add esp, 0x70 - ret -%undef args -%undef spill -%undef trans -%undef sum - - - -;----------------------------------------------------------------------------- -; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_4x4x2_core_mmxext - push ebx - push edi - mov ebx, [esp+16] - mov edx, [esp+24] - mov edi, 4 - pxor mm0, mm0 -.loop - mov eax, [esp+12] - mov ecx, [esp+20] - add eax, edi - add ecx, edi - pxor mm1, mm1 - pxor mm2, mm2 - pxor mm3, mm3 - pxor mm4, mm4 -%rep 4 - movd mm5, [eax] - movd mm6, [ecx] - punpcklbw mm5, mm0 - punpcklbw mm6, mm0 - paddw mm1, mm5 - paddw mm2, mm6 - movq mm7, mm5 - pmaddwd mm5, mm5 - pmaddwd mm7, mm6 - pmaddwd mm6, mm6 - paddd mm3, mm5 - paddd mm4, mm7 - paddd mm3, mm6 - add eax, ebx - add ecx, edx -%endrep - mov eax, [esp+28] - lea eax, [eax+edi*4] - pshufw mm5, mm1, 0xE - pshufw mm6, mm2, 0xE - paddusw mm1, mm5 - paddusw mm2, mm6 - punpcklwd mm1, mm2 - pshufw mm2, mm1, 0xE - pshufw mm5, mm3, 0xE - pshufw mm6, mm4, 0xE - paddusw mm1, mm2 - paddd mm3, mm5 - paddd mm4, mm6 - punpcklwd mm1, mm0 - punpckldq mm3, mm4 - movq [eax+0], mm1 - movq [eax+8], mm3 - sub edi, 4 - jge .loop - pop edi - pop ebx - emms - ret - - - -; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) -cglobal x264_pixel_ads_mvs - mov ebx, [ebp+24] ; mvs - mov ecx, esp ; masks - mov edi, [ebp+28] ; width - mov dword [ecx+edi], 0 - push esi - push ebp - xor eax, eax - xor esi, esi -.loopi: - mov ebp, [ecx+esi] - mov edx, [ecx+esi+4] - or edx, ebp - jz .nexti - xor edx, edx -%macro TEST 1 - mov [ebx+eax*2], si - test ebp, 
0xff<<(%1*8) - setne dl - add eax, edx - inc esi -%endmacro - TEST 0 - TEST 1 - TEST 2 - TEST 3 - mov ebp, [ecx+esi] - TEST 0 - TEST 1 - TEST 2 - TEST 3 - cmp esi, edi - jl .loopi - jmp .end -.nexti: - add esi, 8 - cmp esi, edi - jl .loopi -.end: - pop ebp - pop esi - mov edi, [ebp-8] - mov ebx, [ebp-4] - leave - ret - -%macro ADS_START 0 - push ebp - mov ebp, esp - push ebx - push edi - mov eax, [ebp+12] ; sums - mov ebx, [ebp+16] ; delta - mov ecx, [ebp+20] ; cost_mvx - mov edx, [ebp+28] ; width - sub esp, edx - sub esp, 4 - and esp, ~15 - mov edi, esp - shl ebx, 1 -%endmacro - -%macro ADS_END 1 - add eax, 8*%1 - add ecx, 8*%1 - add edi, 4*%1 - sub edx, 4*%1 - jg .loop - jmp x264_pixel_ads_mvs -%endmacro - -;----------------------------------------------------------------------------- -; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta, -; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ads4_mmxext - mov eax, [esp+4] - movq mm6, [eax] - movq mm4, [eax+8] - pshufw mm7, mm6, 0 - pshufw mm6, mm6, 0xAA - pshufw mm5, mm4, 0 - pshufw mm4, mm4, 0xAA - ADS_START -.loop: - movq mm0, [eax] - movq mm1, [eax+16] - psubw mm0, mm7 - psubw mm1, mm6 - MMX_ABS mm0, mm2 - MMX_ABS mm1, mm3 - movq mm2, [eax+ebx] - movq mm3, [eax+ebx+16] - psubw mm2, mm5 - psubw mm3, mm4 - paddw mm0, mm1 - MMX_ABS mm2, mm1 - MMX_ABS mm3, mm1 - paddw mm0, mm2 - paddw mm0, mm3 - pshufw mm1, [ebp+32], 0 - paddusw mm0, [ecx] - psubusw mm1, mm0 - packsswb mm1, mm1 - movd [edi], mm1 - ADS_END 1 - -cglobal x264_pixel_ads2_mmxext - mov eax, [esp+4] - movq mm6, [eax] - pshufw mm5, [esp+28], 0 - pshufw mm7, mm6, 0 - pshufw mm6, mm6, 0xAA - ADS_START -.loop: - movq mm0, [eax] - movq mm1, [eax+ebx] - psubw mm0, mm7 - psubw mm1, mm6 - MMX_ABS mm0, mm2 - MMX_ABS mm1, mm3 - paddw mm0, mm1 - paddusw mm0, [ecx] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [edi], mm4 - ADS_END 1 - -cglobal x264_pixel_ads1_mmxext - mov eax, [esp+4] - pshufw mm7, [eax], 0 - pshufw mm6, [esp+28], 0 - ADS_START -.loop: - movq mm0, [eax] - movq mm1, [eax+8] - psubw mm0, mm7 - psubw mm1, mm7 - MMX_ABS mm0, mm2 - MMX_ABS mm1, mm3 - paddusw mm0, [ecx] - paddusw mm1, [ecx+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [edi], mm4 - ADS_END 2 - -%macro ADS_SSE2 1 -cglobal x264_pixel_ads4_%1 - mov eax, [esp+4] ; enc_dc - movdqa xmm4, [eax] - pshuflw xmm7, xmm4, 0 - pshuflw xmm6, xmm4, 0xAA - pshufhw xmm5, xmm4, 0 - pshufhw xmm4, xmm4, 0xAA - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpckhqdq xmm5, xmm5 - punpckhqdq xmm4, xmm4 - ADS_START -.loop: - movdqu xmm0, [eax] - movdqu xmm1, [eax+16] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - MMX_ABS xmm0, xmm2 - MMX_ABS xmm1, xmm3 - movdqu xmm2, [eax+ebx] - movdqu xmm3, [eax+ebx+16] - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - MMX_ABS xmm2, xmm1 - MMX_ABS xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - movd xmm1, [ebp+32] ; thresh - movdqu xmm2, [ecx] - pshuflw xmm1, xmm1, 0 - punpcklqdq xmm1, xmm1 - paddusw xmm0, xmm2 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [edi], xmm1 - ADS_END 2 - -cglobal x264_pixel_ads2_%1 - mov eax, [esp+4] ; enc_dc - movq xmm6, [eax] - movd xmm5, [esp+28] ; thresh - pshuflw xmm7, xmm6, 0 - pshuflw xmm6, xmm6, 0xAA - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm5, xmm5 - ADS_START -.loop: - movdqu xmm0, [eax] - movdqu xmm1, [eax+ebx] - 
psubw xmm0, xmm7 - psubw xmm1, xmm6 - movdqu xmm4, [ecx] - MMX_ABS xmm0, xmm2 - MMX_ABS xmm1, xmm3 - paddw xmm0, xmm1 - paddusw xmm0, xmm4 - movdqa xmm1, xmm5 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [edi], xmm1 - ADS_END 2 - -cglobal x264_pixel_ads1_%1 - mov eax, [esp+4] ; enc_dc - movd xmm7, [eax] - movd xmm6, [esp+28] ; thresh - pshuflw xmm7, xmm7, 0 - pshuflw xmm6, xmm6, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - ADS_START -.loop: - movdqu xmm0, [eax] - movdqu xmm1, [eax+16] - psubw xmm0, xmm7 - psubw xmm1, xmm7 - movdqu xmm2, [ecx] - movdqu xmm3, [ecx+16] - MMX_ABS xmm0, xmm4 - MMX_ABS xmm1, xmm5 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - movdqa xmm4, xmm6 - movdqa xmm5, xmm6 - psubusw xmm4, xmm0 - psubusw xmm5, xmm1 - packsswb xmm4, xmm5 - movdqa [edi], xmm4 - ADS_END 4 -%endmacro - -ADS_SSE2 sse2 -%ifdef HAVE_SSE3 -%macro MMX_ABS 2 - pabsw %1, %1 -%endmacro -ADS_SSE2 ssse3 -%endif diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm deleted file mode 100644 index bb2e9c57..00000000 --- a/common/i386/pixel-sse2.asm +++ /dev/null @@ -1,1052 +0,0 @@ -;***************************************************************************** -;* pixel-sse2.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* Authors: Alex Izvorski -;* Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -SECTION_RODATA - -pw_1: times 8 dw 1 -ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 -ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 -mask_ff: times 16 db 0xff - times 16 db 0 - - -SECTION .text - -%macro HADDW 2 ; sum junk - ; ebx is no longer used at this point, so no push needed - picgetgot ebx - pmaddwd %1, [pw_1 GLOBAL] - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%endmacro - -%macro SAD_START_SSE2 0 - push ebx - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 -%endmacro - -%macro SAD_END_SSE2 0 - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - movd eax, xmm0 - pop ebx - ret -%endmacro - -%macro SAD_W16 1 -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_%1 - SAD_START_SSE2 - movdqu xmm0, [ecx] - movdqu xmm1, [ecx+edx] - lea ecx, [ecx+2*edx] - movdqu xmm2, [ecx] - movdqu xmm3, [ecx+edx] - lea ecx, [ecx+2*edx] - psadbw xmm0, [eax] - psadbw xmm1, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm4, [ecx] - paddw xmm0, xmm1 - psadbw xmm2, [eax] - psadbw xmm3, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm5, [ecx+edx] - lea ecx, [ecx+2*edx] - paddw xmm2, xmm3 - movdqu xmm6, [ecx] - movdqu xmm7, [ecx+edx] - lea ecx, [ecx+2*edx] - paddw xmm0, xmm2 - psadbw xmm4, [eax] - psadbw xmm5, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm1, [ecx] - paddw xmm4, xmm5 - psadbw xmm6, [eax] - psadbw xmm7, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm2, [ecx+edx] - lea ecx, [ecx+2*edx] - paddw xmm6, xmm7 - movdqu xmm3, [ecx] - paddw xmm0, xmm4 - movdqu xmm4, [ecx+edx] - lea ecx, [ecx+2*edx] - paddw xmm0, xmm6 - psadbw xmm1, [eax] - psadbw xmm2, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm5, [ecx] - paddw xmm1, xmm2 - psadbw xmm3, [eax] - psadbw xmm4, [eax+ebx] - lea eax, [eax+2*ebx] - movdqu xmm6, [ecx+edx] - lea ecx, [ecx+2*edx] - paddw xmm3, xmm4 - movdqu xmm7, [ecx] - paddw xmm0, xmm1 - movdqu xmm1, [ecx+edx] - paddw xmm0, xmm3 - psadbw xmm5, [eax] - psadbw xmm6, [eax+ebx] - lea eax, [eax+2*ebx] - paddw xmm5, xmm6 - psadbw xmm7, [eax] - psadbw xmm1, [eax+ebx] - paddw xmm7, xmm1 - paddw xmm0, xmm5 - paddw xmm0, xmm7 - SAD_END_SSE2 - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x8_%1 - SAD_START_SSE2 - movdqu xmm0, [ecx] - movdqu xmm2, [ecx+edx] - lea ecx, [ecx+2*edx] - movdqu xmm3, [ecx] - movdqu xmm4, [ecx+edx] - psadbw xmm0, [eax] - psadbw xmm2, [eax+ebx] - lea eax, [eax+2*ebx] - psadbw xmm3, [eax] - psadbw xmm4, [eax+ebx] - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - paddw xmm0, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm3 - movdqu xmm1, [ecx] - movdqu xmm2, [ecx+edx] - lea ecx, [ecx+2*edx] - movdqu xmm3, [ecx] - movdqu xmm4, [ecx+edx] - psadbw xmm1, [eax] - psadbw xmm2, [eax+ebx] - lea eax, [eax+2*ebx] - psadbw xmm3, [eax] - psadbw xmm4, [eax+ebx] - lea eax, [eax+2*ebx] - lea ecx, 
[ecx+2*edx] - paddw xmm1, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm1 - paddw xmm0, xmm3 - SAD_END_SSE2 -%endmacro - -SAD_W16 sse2 -%ifdef HAVE_SSE3 -%define movdqu lddqu -SAD_W16 sse3 -%undef movdqu -%endif - - -; sad x3 / x4 - -%macro SAD_X3_START_1x16P 0 - push edi - push esi - mov edi, [esp+12] - mov eax, [esp+16] - mov ecx, [esp+20] - mov edx, [esp+24] - mov esi, [esp+28] - movdqa xmm3, [edi] - movdqu xmm0, [eax] - movdqu xmm1, [ecx] - movdqu xmm2, [edx] - psadbw xmm0, xmm3 - psadbw xmm1, xmm3 - psadbw xmm2, xmm3 -%endmacro - -%macro SAD_X3_1x16P 2 - movdqa xmm3, [edi+%1] - movdqu xmm4, [eax+%2] - movdqu xmm5, [ecx+%2] - movdqu xmm6, [edx+%2] - psadbw xmm4, xmm3 - psadbw xmm5, xmm3 - psadbw xmm6, xmm3 - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm6 -%endmacro - -%macro SAD_X3_2x16P 1 -%if %1 - SAD_X3_START_1x16P -%else - SAD_X3_1x16P 0, 0 -%endif - SAD_X3_1x16P FENC_STRIDE, esi - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X4_START_1x16P 0 - push edi - push esi - push ebx - mov edi, [esp+16] - mov eax, [esp+20] - mov ebx, [esp+24] - mov ecx, [esp+28] - mov edx, [esp+32] - mov esi, [esp+36] - movdqa xmm7, [edi] - movdqu xmm0, [eax] - movdqu xmm1, [ebx] - movdqu xmm2, [ecx] - movdqu xmm3, [edx] - psadbw xmm0, xmm7 - psadbw xmm1, xmm7 - psadbw xmm2, xmm7 - psadbw xmm3, xmm7 -%endmacro - -%macro SAD_X4_1x16P 2 - movdqa xmm7, [edi+%1] - movdqu xmm4, [eax+%2] - movdqu xmm5, [ebx+%2] - movdqu xmm6, [ecx+%2] - psadbw xmm4, xmm7 - psadbw xmm5, xmm7 - paddw xmm0, xmm4 - psadbw xmm6, xmm7 - movdqu xmm4, [edx+%2] - paddw xmm1, xmm5 - psadbw xmm4, xmm7 - paddw xmm2, xmm6 - paddw xmm3, xmm4 -%endmacro - -%macro SAD_X4_2x16P 1 -%if %1 - SAD_X4_START_1x16P -%else - SAD_X4_1x16P 0, 0 -%endif - SAD_X4_1x16P FENC_STRIDE, esi - add edi, 2*FENC_STRIDE - lea eax, [eax+2*esi] - lea ebx, [ebx+2*esi] - lea ecx, [ecx+2*esi] - lea edx, [edx+2*esi] -%endmacro - -%macro SAD_X3_END 0 - mov eax, [esp+32] - pshufd xmm4, xmm0, 2 - pshufd xmm5, xmm1, 2 - pshufd xmm6, xmm2, 2 - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm6 - movd [eax+0], xmm0 - movd [eax+4], xmm1 - movd [eax+8], xmm2 - pop esi - pop edi - ret -%endmacro - -%macro SAD_X4_END 0 - mov eax, [esp+40] - psllq xmm1, 32 - psllq xmm3, 32 - paddw xmm0, xmm1 - paddw xmm2, xmm3 - pshufd xmm1, xmm0, 14 - pshufd xmm3, xmm2, 14 - paddw xmm0, xmm1 - paddw xmm2, xmm3 - movq [eax+0], xmm0 - movq [eax+8], xmm2 - pop ebx - pop esi - pop edi - ret -%endmacro - -;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) -;----------------------------------------------------------------------------- -%macro SAD_X 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4 - SAD_X%1_2x%2P 1 -%rep %3/2-1 - SAD_X%1_2x%2P 0 -%endrep - SAD_X%1_END -%endmacro - -SAD_X 3, 16, 16, sse2 -SAD_X 3, 16, 8, sse2 -SAD_X 4, 16, 16, sse2 -SAD_X 4, 16, 8, sse2 - -%ifdef HAVE_SSE3 -%define movdqu lddqu -SAD_X 3, 16, 16, sse3 -SAD_X 3, 16, 8, sse3 -SAD_X 4, 16, 16, sse3 -SAD_X 4, 16, 8, sse3 -%undef movdqu -%endif - - -; Core2 (Conroe) can load unaligned data just as quickly as aligned data... -; unless the unaligned data spans the border between 2 cachelines, in which -; case it's really slow. The exact numbers may differ, but all Intel cpus -; have a large penalty for cacheline splits. -; (8-byte alignment exactly half way between two cachelines is ok though.) 
-; LDDQU was supposed to fix this, but it only works on Pentium 4. -; So in the split case we load aligned data and explicitly perform the -; alignment between registers. Like on archs that have only aligned loads, -; except complicated by the fact that PALIGNR takes only an immediate, not -; a variable alignment. - -; computed jump assumes this loop is exactly 80 bytes -%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment -ALIGN 16 -sad_w16_align%1_sse2: - movdqa xmm1, [ecx+16] - movdqa xmm2, [ecx+edx+16] - movdqa xmm3, [ecx] - movdqa xmm4, [ecx+edx] - pslldq xmm1, 16-%1 - pslldq xmm2, 16-%1 - psrldq xmm3, %1 - psrldq xmm4, %1 - por xmm1, xmm3 - por xmm2, xmm4 - psadbw xmm1, [eax] - psadbw xmm2, [eax+ebx] - paddw xmm0, xmm1 - paddw xmm0, xmm2 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - dec esi - jg sad_w16_align%1_sse2 - ret -%endmacro - -; computed jump assumes this loop is exactly 64 bytes -%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment -ALIGN 16 -sad_w16_align%1_ssse3: - movdqa xmm1, [ecx+16] - movdqa xmm2, [ecx+edx+16] - palignr xmm1, [ecx], %1 - palignr xmm2, [ecx+edx], %1 - psadbw xmm1, [eax] - psadbw xmm2, [eax+ebx] - paddw xmm0, xmm1 - paddw xmm0, xmm2 - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - dec esi - jg sad_w16_align%1_ssse3 - ret -%endmacro - -%macro SAD16_CACHELINE_FUNC 2 ; cpu, height -cglobal x264_pixel_sad_16x%2_cache64_%1 - mov eax, [esp+12] - and eax, 0x37 - cmp eax, 0x30 - jle x264_pixel_sad_16x%2_sse2 - mov eax, [esp+12] - push ebx - push edi - push esi - and eax, 15 -%ifidn %1, ssse3 - shl eax, 6 -%else - lea eax, [eax*5] - shl eax, 4 -%endif - picgetgot ebx - lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GLOBAL] - mov eax, [esp+16] - mov ebx, [esp+20] - mov ecx, [esp+24] - mov edx, [esp+28] - and ecx, ~15 - mov esi, %2/2 - pxor xmm0, xmm0 - call edi - pop esi - pop edi - SAD_END_SSE2 -%endmacro - -%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline - mov eax, [esp+12] - and eax, 0x17|%2|(%4>>1) - cmp eax, 0x10|%2|(%4>>1) - jle x264_pixel_sad_%1x%2_mmxext - push ebx - push esi - and eax, 7 - shl eax, 3 - mov ecx, 64 - sub ecx, eax - movd mm7, eax - movd mm6, ecx - mov eax, [esp+12] - mov ebx, [esp+16] - mov ecx, [esp+20] - mov edx, [esp+24] - and ecx, ~7 - mov esi, %3 - pxor mm0, mm0 -%endmacro - -%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_16x%1_cache%2_mmxext - SAD_CACHELINE_START_MMX2 16, %1, %1, %2 -.loop: - movq mm1, [ecx] - movq mm2, [ecx+8] - movq mm3, [ecx+16] - movq mm4, mm2 - psrlq mm1, mm7 - psllq mm2, mm6 - psllq mm3, mm6 - psrlq mm4, mm7 - por mm1, mm2 - por mm3, mm4 - psadbw mm1, [eax] - psadbw mm3, [eax+8] - paddw mm0, mm1 - paddw mm0, mm3 - add ecx, edx - add eax, ebx - dec esi - jg .loop - pop esi - pop ebx - movd eax, mm0 - ret -%endmacro - -%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_8x%1_cache%2_mmxext - SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 -.loop: - movq mm1, [ecx+8] - movq mm2, [ecx+edx+8] - movq mm3, [ecx] - movq mm4, [ecx+edx] - psllq mm1, mm6 - psllq mm2, mm6 - psrlq mm3, mm7 - psrlq mm4, mm7 - por mm1, mm3 - por mm2, mm4 - psadbw mm1, [eax] - psadbw mm2, [eax+ebx] - paddw mm0, mm1 - paddw mm0, mm2 - lea ecx, [ecx+2*edx] - lea eax, [eax+2*ebx] - dec esi - jg .loop - pop esi - pop ebx - movd eax, mm0 - ret -%endmacro - - -; sad_x3/x4_cache64: check each mv. -; if they're all within a cacheline, use normal sad_x3/x4. -; otherwise, send them individually to sad_cache64. 
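The cache64 dispatch described in the comment above boils down to a pointer-alignment test: a w-byte unaligned load only needs the slow split path when it starts close enough to the end of a cacheline that it spills into the next one. A minimal C sketch of that condition, for illustration only (it is not part of this patch; the function name is made up, and it deliberately ignores the 8-byte-aligned half-split exception that the asm's and/cmp trick also accounts for):

#include <stdint.h>

/* Returns nonzero if a `width`-byte unaligned load at `pix` would span two
 * cachelines of size `cacheline` (a power of two). The real asm folds this
 * into a single and/cmp on the low address bits, and additionally lets a
 * split exactly half way between two cachelines go down the fast path, since
 * that case is 8-byte aligned and cheap. */
static int crosses_cacheline(const uint8_t *pix, int width, int cacheline)
{
    int offset = (int)((uintptr_t)pix & (cacheline - 1)); /* position within the line */
    return offset > cacheline - width;                    /* spills into the next line */
}

When none of the three or four candidate pointers trips this test, the wrapper jumps straight to the ordinary sad_x3/x4; otherwise it falls through to the split path and runs the single-block cache64 SAD once per candidate, as the comment says.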
-%macro CHECK_SPLIT 3 ; pix, width, cacheline - mov eax, %1 - and eax, 0x17|%2|(%3>>1) - cmp eax, 0x10|%2|(%3>>1) - jg .split -%endmacro - -%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver -cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5 - CHECK_SPLIT [esp+8], %1, %3 - CHECK_SPLIT [esp+12], %1, %3 - CHECK_SPLIT [esp+16], %1, %3 - jmp x264_pixel_sad_x3_%1x%2_%4 -.split: - push edi - mov edi, [esp+28] - push dword [esp+24] - push dword [esp+16] - push dword 16 - push dword [esp+20] - call x264_pixel_sad_%1x%2_cache%3_%5 - mov ecx, [esp+32] - mov [edi], eax - mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 - mov ecx, [esp+36] - mov [edi+4], eax - mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 - mov [edi+8], eax - add esp, 16 - pop edi - ret -%endmacro - -%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver -cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5 - CHECK_SPLIT [esp+8], %1, %3 - CHECK_SPLIT [esp+12], %1, %3 - CHECK_SPLIT [esp+16], %1, %3 - CHECK_SPLIT [esp+20], %1, %3 - jmp x264_pixel_sad_x4_%1x%2_%4 -.split: - push edi - mov edi, [esp+32] - push dword [esp+28] - push dword [esp+16] - push dword 16 - push dword [esp+20] - call x264_pixel_sad_%1x%2_cache%3_%5 - mov ecx, [esp+32] - mov [edi], eax - mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 - mov ecx, [esp+36] - mov [edi+4], eax - mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 - mov ecx, [esp+40] - mov [edi+8], eax - mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 - mov [edi+12], eax - add esp, 16 - pop edi - ret -%endmacro - -%macro SADX34_CACHELINE_FUNC 5 - SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5 - SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5 -%endmacro - -cextern x264_pixel_sad_16x16_mmxext -cextern x264_pixel_sad_16x8_mmxext -cextern x264_pixel_sad_8x16_mmxext -cextern x264_pixel_sad_8x8_mmxext -cextern x264_pixel_sad_8x4_mmxext -cextern x264_pixel_sad_x3_16x16_mmxext -cextern x264_pixel_sad_x3_16x8_mmxext -cextern x264_pixel_sad_x3_8x16_mmxext -cextern x264_pixel_sad_x3_8x8_mmxext -cextern x264_pixel_sad_x4_16x16_mmxext -cextern x264_pixel_sad_x4_16x8_mmxext -cextern x264_pixel_sad_x4_8x16_mmxext -cextern x264_pixel_sad_x4_8x8_mmxext - -; instantiate the aligned sads - -SAD16_CACHELINE_FUNC sse2, 8 -SAD16_CACHELINE_FUNC sse2, 16 -%assign i 1 -%rep 15 -SAD16_CACHELINE_LOOP_SSE2 i -%assign i i+1 -%endrep - -SAD16_CACHELINE_FUNC_MMX2 16, 32 -SAD8_CACHELINE_FUNC_MMX2 4, 32 -SAD8_CACHELINE_FUNC_MMX2 8, 32 -SAD8_CACHELINE_FUNC_MMX2 16, 32 -SAD16_CACHELINE_FUNC_MMX2 8, 64 -SAD16_CACHELINE_FUNC_MMX2 16, 64 -SAD8_CACHELINE_FUNC_MMX2 4, 64 -SAD8_CACHELINE_FUNC_MMX2 8, 64 -SAD8_CACHELINE_FUNC_MMX2 16, 64 -SAD16_CACHELINE_FUNC_MMX2 8, 32 - -SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext -SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext -SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext -SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext -SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext -SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext -SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext -SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext -SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2 -SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2 - -%ifdef HAVE_SSE3 - -SAD16_CACHELINE_FUNC ssse3, 8 -SAD16_CACHELINE_FUNC ssse3, 16 -%assign i 1 -%rep 15 -SAD16_CACHELINE_LOOP_SSSE3 i -%assign i i+1 -%endrep - -SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3 -SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3 - -%endif ; HAVE_SSE3 - - -%macro SSD_INC_2x16P_SSE2 0 - movdqu xmm1, [eax] - movdqu xmm2, 
[ecx] - movdqu xmm3, [eax+ebx] - movdqu xmm4, [ecx+edx] - - movdqa xmm5, xmm1 - movdqa xmm6, xmm3 - psubusb xmm1, xmm2 - psubusb xmm3, xmm4 - psubusb xmm2, xmm5 - psubusb xmm4, xmm6 - por xmm1, xmm2 - por xmm3, xmm4 - - movdqa xmm2, xmm1 - movdqa xmm4, xmm3 - punpcklbw xmm1, xmm7 - punpckhbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - punpckhbw xmm4, xmm7 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - pmaddwd xmm4, xmm4 - - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm0, xmm1 - paddd xmm0, xmm3 -%endmacro - -%macro SSD_START_SSE2 0 - push ebx - - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 - - pxor xmm7, xmm7 ; zero - pxor xmm0, xmm0 ; mm0 holds the sum -%endmacro - -%macro SSD_END_SSE2 0 - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddd xmm0, xmm1 - - movdqa xmm1, xmm0 - psrldq xmm1, 4 - paddd xmm0, xmm1 - - movd eax, xmm0 - - pop ebx - ret -%endmacro - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssd_16x16_sse2 - SSD_START_SSE2 -%rep 8 - SSD_INC_2x16P_SSE2 -%endrep - SSD_END_SSE2 - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssd_16x8_sse2 - SSD_START_SSE2 -%rep 4 - SSD_INC_2x16P_SSE2 -%endrep - SSD_END_SSE2 - - - -%macro SUMSUB_BADC 4 - paddw %1, %2 - paddw %3, %4 - paddw %2, %2 - paddw %4, %4 - psubw %2, %1 - psubw %4, %3 -%endmacro - -%macro HADAMARD1x4 4 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %1, %3, %2, %4 -%endmacro - -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 -%endmacro - -%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers - mov%1 %5, %3 - punpckh%2 %3, %4 - punpckl%2 %5, %4 -%endmacro - -%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC - SBUTTERFLY dqa, dq, %1, %2, %5 - SBUTTERFLY dqa, dq, %3, %4, %2 - SBUTTERFLY dqa, qdq, %1, %3, %4 - SBUTTERFLY dqa, qdq, %5, %2, %3 -%endmacro - -%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD - SBUTTERFLY dqa, wd, %1, %2, %5 - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, dq, %1, %3, %4 - SBUTTERFLY2 dqa, dq, %5, %2, %3 - SBUTTERFLY dqa, qdq, %1, %3, %2 - SBUTTERFLY2 dqa, qdq, %4, %5, %3 -%endmacro - -%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2] - movq %1, %3 - movq %2, %4 - punpcklbw %1, %2 - punpcklbw %2, %2 - psubw %1, %2 -%endmacro - -%macro SUM4x4_SSE2 4 ; 02 13 junk sum - pxor %3, %3 - psubw %3, %1 - pmaxsw %1, %3 - - pxor %3, %3 - psubw %3, %2 - pmaxsw %2, %3 - - paddusw %4, %1 - paddusw %4, %2 -%endmacro - -%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum - pxor %3, %3 - pxor %6, %6 - psubw %3, %1 - psubw %6, %4 - pmaxsw %1, %3 - pmaxsw %4, %6 - pxor %3, %3 - pxor %6, %6 - psubw %3, %2 - psubw %6, %5 - pmaxsw %2, %3 - pmaxsw %5, %6 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 -%endmacro - -%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . 
sum - pabsw %1, %1 - pabsw %2, %2 - pabsw %4, %4 - pabsw %5, %5 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 -%endmacro - -%macro SATD_TWO_SSE2 0 - LOAD_DIFF_8P xmm0, xmm4, [eax], [ecx] - LOAD_DIFF_8P xmm1, xmm5, [eax+ebx], [ecx+edx] - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - LOAD_DIFF_8P xmm2, xmm4, [eax], [ecx] - LOAD_DIFF_8P xmm3, xmm5, [eax+ebx], [ecx+edx] - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - - HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 - TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4 - HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 - SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6 -%endmacro - -%macro SATD_START 0 - push ebx - - mov eax, [esp+ 8] ; pix1 - mov ebx, [esp+12] ; stride1 - mov ecx, [esp+16] ; pix2 - mov edx, [esp+20] ; stride2 - - pxor xmm6, xmm6 -%endmacro - -%macro SATD_END 0 - ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. - psrlw xmm6, 1 - HADDW xmm6, xmm7 - movd eax, xmm6 - pop ebx - ret -%endmacro - -%macro SATDS 1 -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x16_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - mov eax, [esp+ 8] - mov ecx, [esp+16] - add eax, 8 - add ecx, 8 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x16_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_16x8_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - mov eax, [esp+ 8] - mov ecx, [esp+16] - add eax, 8 - add ecx, 8 - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x8_%1 - SATD_START - SATD_TWO_SSE2 - SATD_TWO_SSE2 - SATD_END - -;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_satd_8x4_%1 - SATD_START - SATD_TWO_SSE2 - SATD_END -%endmacro ; SATDS - -%define SUM8x4 SUM8x4_SSE2 -SATDS sse2 -%ifdef HAVE_SSE3 -%define SUM8x4 SUM8x4_SSSE3 -SATDS ssse3 -%endif - - - -;----------------------------------------------------------------------------- -; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_4x4x2_core_sse2 - push ebx - mov eax, [esp+ 8] - mov ebx, [esp+12] - mov ecx, [esp+16] - mov edx, [esp+20] - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm2, xmm2 - pxor xmm3, xmm3 - 
pxor xmm4, xmm4 -%rep 4 - movq xmm5, [eax] - movq xmm6, [ecx] - punpcklbw xmm5, xmm0 - punpcklbw xmm6, xmm0 - paddw xmm1, xmm5 - paddw xmm2, xmm6 - movdqa xmm7, xmm5 - pmaddwd xmm5, xmm5 - pmaddwd xmm7, xmm6 - pmaddwd xmm6, xmm6 - paddd xmm3, xmm5 - paddd xmm4, xmm7 - paddd xmm3, xmm6 - add eax, ebx - add ecx, edx -%endrep - ; PHADDW xmm1, xmm2 - ; PHADDD xmm3, xmm4 - mov eax, [esp+24] - picgetgot ebx - movdqa xmm7, [pw_1 GLOBAL] - pshufd xmm5, xmm3, 0xB1 - pmaddwd xmm1, xmm7 - pmaddwd xmm2, xmm7 - pshufd xmm6, xmm4, 0xB1 - packssdw xmm1, xmm2 - paddd xmm3, xmm5 - pshufd xmm1, xmm1, 0xD8 - paddd xmm4, xmm6 - pmaddwd xmm1, xmm7 - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - punpckhdq xmm5, xmm4 - movq [eax+ 0], xmm1 - movq [eax+ 8], xmm3 - psrldq xmm1, 8 - movq [eax+16], xmm1 - movq [eax+24], xmm5 - pop ebx - ret - -;----------------------------------------------------------------------------- -; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_end4_sse2 - mov eax, [esp+ 4] - mov ecx, [esp+ 8] - mov edx, [esp+12] - picpush ebx - picgetgot ebx - movdqa xmm0, [eax+ 0] - movdqa xmm1, [eax+16] - movdqa xmm2, [eax+32] - movdqa xmm3, [eax+48] - movdqa xmm4, [eax+64] - paddd xmm0, [ecx+ 0] - paddd xmm1, [ecx+16] - paddd xmm2, [ecx+32] - paddd xmm3, [ecx+48] - paddd xmm4, [ecx+64] - paddd xmm0, xmm1 - paddd xmm1, xmm2 - paddd xmm2, xmm3 - paddd xmm3, xmm4 - movdqa xmm5, [ssim_c1 GLOBAL] - movdqa xmm6, [ssim_c2 GLOBAL] - TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 - -; s1=mm0, s2=mm3, ss=mm4, s12=mm2 - movdqa xmm1, xmm3 - pslld xmm3, 16 - pmaddwd xmm1, xmm0 ; s1*s2 - por xmm0, xmm3 - pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2 - pslld xmm1, 1 - pslld xmm2, 7 - pslld xmm4, 6 - psubd xmm2, xmm1 ; covar*2 - psubd xmm4, xmm0 ; vars - paddd xmm0, xmm5 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm4, xmm6 - cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1) - cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1) - cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2) - cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2) - mulps xmm1, xmm2 - mulps xmm0, xmm4 - divps xmm1, xmm0 ; ssim - - neg edx - movdqu xmm3, [mask_ff + edx*4 + 16 GLOBAL] - pand xmm1, xmm3 - movhlps xmm0, xmm1 - addps xmm0, xmm1 - pshuflw xmm1, xmm0, 0xE - addss xmm0, xmm1 - - movd [picesp+4], xmm0 - fld dword [picesp+4] - picpop ebx - ret - diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm deleted file mode 100644 index 4c58357d..00000000 --- a/common/i386/predict-a.asm +++ /dev/null @@ -1,629 +0,0 @@ -;***************************************************************************** -;* predict-a.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* Authors: Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. 
-;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. -;***************************************************************************** - -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -%macro STORE8x8 2 - movq [edx + 0*FDEC_STRIDE], %1 - movq [edx + 1*FDEC_STRIDE], %1 - movq [edx + 2*FDEC_STRIDE], %1 - movq [edx + 3*FDEC_STRIDE], %1 - movq [edx + 4*FDEC_STRIDE], %2 - movq [edx + 5*FDEC_STRIDE], %2 - movq [edx + 6*FDEC_STRIDE], %2 - movq [edx + 7*FDEC_STRIDE], %2 -%endmacro - -%macro STORE16x16 2 - mov eax, 4 -.loop: - movq [edx + 0*FDEC_STRIDE], %1 - movq [edx + 1*FDEC_STRIDE], %1 - movq [edx + 2*FDEC_STRIDE], %1 - movq [edx + 3*FDEC_STRIDE], %1 - movq [edx + 0*FDEC_STRIDE + 8], %2 - movq [edx + 1*FDEC_STRIDE + 8], %2 - movq [edx + 2*FDEC_STRIDE + 8], %2 - movq [edx + 3*FDEC_STRIDE + 8], %2 - add edx, 4*FDEC_STRIDE - dec eax - jg .loop - nop -%endmacro - -%macro STORE16x16_SSE2 1 - mov eax, 4 -.loop: - movdqa [edx + 0*FDEC_STRIDE], %1 - movdqa [edx + 1*FDEC_STRIDE], %1 - movdqa [edx + 2*FDEC_STRIDE], %1 - movdqa [edx + 3*FDEC_STRIDE], %1 - add edx, 4*FDEC_STRIDE - dec eax - jg .loop - nop -%endmacro - -SECTION_RODATA - -ALIGN 16 -pb_1: times 16 db 1 -pw_2: times 4 dw 2 -pw_4: times 4 dw 4 -pw_8: times 8 dw 8 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pb_00s_ff: times 8 db 0 -pb_0s_ff: times 7 db 0 - db 0xff - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -; dest, left, right, src, tmp -; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -; dest, left, right, src, tmp -; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -%macro PRED8x8_LOWPASS0 6 - mov%6 %5, %2 - pavgb %2, %3 - pxor %3, %5 - mov%6 %1, %4 - pand %3, [pb_1 GLOBAL] - psubusb %2, %3 - pavgb %1, %2 -%endmacro -%macro PRED8x8_LOWPASS 5 - PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q -%endmacro -%macro PRED8x8_LOWPASS_XMM 5 - PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa -%endmacro - - -;----------------------------------------------------------------------------- -; void predict_4x4_ddl_mmxext( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_4x4_ddl_mmxext - mov eax, [esp + 4] - picgetgot ecx - movq mm3, [eax - FDEC_STRIDE ] - movq mm1, [eax - FDEC_STRIDE - 1] - movq mm2, mm3 - movq mm4, [pb_0s_ff GLOBAL] - psrlq mm2, 8 - pand mm4, mm3 - por mm2, mm4 - PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 -%assign Y 0 -%rep 4 - psrlq mm0, 8 - movd [eax + Y * FDEC_STRIDE], mm0 -%assign Y (Y+1) -%endrep - ret - -;----------------------------------------------------------------------------- -; void predict_4x4_vl_mmxext( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_4x4_vl_mmxext - mov eax, [esp + 4] - picgetgot ecx - movq mm1, [eax - FDEC_STRIDE] - movq mm3, mm1 - movq mm2, mm1 - psrlq mm3, 8 - psrlq mm2, 16 - movq mm4, mm3 - pavgb mm4, mm1 - PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 - movd [eax + 0*FDEC_STRIDE], mm4 - movd [eax + 1*FDEC_STRIDE], mm0 - psrlq mm4, 8 - psrlq mm0, 8 - movd [eax + 2*FDEC_STRIDE], mm4 - movd [eax + 3*FDEC_STRIDE], 
mm0 - - ret - - -;----------------------------------------------------------------------------- -; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_v_mmxext - mov eax, [esp+8] - mov edx, [esp+4] - movq mm0, [eax+16] - STORE8x8 mm0, mm0 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_dc_mmxext - picgetgot ecx - mov eax, [esp + 8] - mov edx, [esp + 4] - pxor mm0, mm0 - pxor mm1, mm1 - psadbw mm0, [eax+7] - psadbw mm1, [eax+16] - paddw mm0, [pw_8 GLOBAL] - paddw mm0, mm1 - psrlw mm0, 4 - pshufw mm0, mm0, 0 - packuswb mm0, mm0 - STORE8x8 mm0, mm0 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_top_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -%macro PRED8x8_DC 2 -cglobal %1 - picgetgot ecx - mov eax, [esp + 8] - mov edx, [esp + 4] - pxor mm0, mm0 - psadbw mm0, [eax+%2] - paddw mm0, [pw_4 GLOBAL] - psrlw mm0, 3 - pshufw mm0, mm0, 0 - packuswb mm0, mm0 - STORE8x8 mm0, mm0 - ret -%endmacro - -PRED8x8_DC predict_8x8_dc_top_mmxext, 16 -PRED8x8_DC predict_8x8_dc_left_mmxext, 7 - -;----------------------------------------------------------------------------- -; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl_mmxext - picgetgot ecx - mov eax, [esp + 8] - mov edx, [esp + 4] - movq mm1, [eax + 15] - movq mm2, [eax + 17] - movq mm3, [eax + 23] - movq mm4, [eax + 25] - PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 16], mm7 - PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 24], mm6 - -%assign Y 7 -%rep 6 - movq [edx + Y*FDEC_STRIDE], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 -%assign Y (Y-1) -%endrep - movq [edx + Y*FDEC_STRIDE], mm1 - psllq mm1, 8 - psrlq mm0, 56 - por mm1, mm0 -%assign Y (Y-1) - movq [edx + Y*FDEC_STRIDE], mm1 - - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_ddr_mmxext - picgetgot ecx - mov eax, [esp + 8] - mov edx, [esp + 4] - movq mm1, [eax + 7] - movq mm2, [eax + 9] - movq mm3, [eax + 15] - movq mm4, [eax + 17] - PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 8], mm7 - PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 16], mm6 - -%assign Y 7 -%rep 6 - movq [edx + Y*FDEC_STRIDE], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 -%assign Y (Y-1) -%endrep - movq [edx + Y*FDEC_STRIDE], mm0 - psrlq mm0, 8 - psllq mm1, 56 - por mm0, mm1 -%assign Y (Y-1) - movq [edx + Y*FDEC_STRIDE], mm0 - - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- - -; fills only some pixels: -; f01234567 -; 0........ -; 1,,,,,,,, -; 2 ....... -; 3 ,,,,,,, -; 4 ...... -; 5 ,,,,,, -; 6 ..... 
-; 7 ,,,,, - -cglobal predict_8x8_vr_core_mmxext - picgetgot ecx - mov eax, [esp + 8] - mov edx, [esp + 4] - movq mm2, [eax + 16] - movq mm3, [eax + 15] - movq mm1, [eax + 14] - movq mm4, mm3 - pavgb mm3, mm2 - PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 - -%assign Y 0 -%rep 3 - movq [edx + Y *FDEC_STRIDE], mm3 - movq [edx + (Y+1)*FDEC_STRIDE], mm0 - psllq mm3, 8 - psllq mm0, 8 -%assign Y (Y+2) -%endrep - movq [edx + Y *FDEC_STRIDE], mm3 - movq [edx + (Y+1)*FDEC_STRIDE], mm0 - - ret - -;----------------------------------------------------------------------------- -; void predict_8x8c_v_mmx( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_v_mmx - mov edx, [esp + 4] - movq mm0, [edx - FDEC_STRIDE] - STORE8x8 mm0, mm0 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_dc_core_mmxext - picgetgot ecx - - mov edx, [esp + 4] - - movq mm0, [edx - FDEC_STRIDE] - pxor mm1, mm1 - pxor mm2, mm2 - punpckhbw mm1, mm0 - punpcklbw mm0, mm2 - psadbw mm1, mm2 ; s1 - psadbw mm0, mm2 ; s0 - - paddw mm0, [esp + 8] - pshufw mm2, [esp + 12], 0 - psrlw mm0, 3 - paddw mm1, [pw_2 GLOBAL] - movq mm3, mm2 - pshufw mm1, mm1, 0 - pshufw mm0, mm0, 0 ; dc0 (w) - paddw mm3, mm1 - psrlw mm3, 3 ; dc3 (w) - psrlw mm2, 2 ; dc2 (w) - psrlw mm1, 2 ; dc1 (w) - - packuswb mm0, mm1 ; dc0,dc1 (b) - packuswb mm2, mm3 ; dc2,dc3 (b) - - STORE8x8 mm0, mm2 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_p_core_mmxext - picgetgot ecx - - mov edx, [esp + 4] - pshufw mm0, [esp + 8], 0 - pshufw mm2, [esp +12], 0 - pshufw mm4, [esp +16], 0 - movq mm1, mm2 - pmullw mm2, [pw_3210 GLOBAL] - psllw mm1, 2 - paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} - paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - - mov eax, 8 -ALIGN 4 -.loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [edx], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - add edx, FDEC_STRIDE - dec eax - jg .loop - - nop - ret - -;----------------------------------------------------------------------------- -; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_mmxext - picgetgot ecx - - mov edx, [esp + 4] - pshufw mm0, [esp + 8], 0 - pshufw mm2, [esp +12], 0 - pshufw mm4, [esp +16], 0 - movq mm5, mm2 - movq mm1, mm2 - pmullw mm5, [pw_3210 GLOBAL] - psllw mm2, 3 - psllw mm1, 2 - movq mm3, mm2 - paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} - paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} - paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} - paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} - - mov eax, 16 -ALIGN 4 -.loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [edx], mm5 - - movq mm5, mm2 - movq mm6, mm3 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [edx+8], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - paddsw mm2, mm4 - paddsw mm3, mm4 - add edx, FDEC_STRIDE - dec eax - jg .loop - - nop - ret - 
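For reference, the plane-prediction cores being removed here all evaluate the same per-pixel expression; the mmxext and sse2 variants only differ in how many pixels they hold per register. A small C sketch of that arithmetic, as an illustration under the assumption (suggested by the _core argument lists and the register comments above) that i00, b and c are the plane parameters precomputed by the C wrapper; the names below are made up:

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* The asm keeps a row of (i00 + b*x) in signed words, shifts right by 5 with
 * psraw, clips to [0,255] with packuswb, and adds c once per row -- which is
 * exactly pred[y][x] = clip((i00 + b*x + c*y) >> 5). */
static void predict_plane_ref(uint8_t *dst, int stride, int size,
                              int i00, int b, int c)
{
    for (int y = 0; y < size; y++, dst += stride)
        for (int x = 0; x < size; x++)
            dst[x] = clip_uint8((i00 + b * x + c * y) >> 5);
}

The same form covers both the 16x16 and 8x8 chroma planar cases; only `size` and the parameter setup differ.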
-;----------------------------------------------------------------------------- -; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_sse2 - picgetgot ecx - - mov edx, [esp + 4 ] - movd xmm0, [esp + 8 ] - movd xmm1, [esp + 12] - movd xmm2, [esp + 16] - pshuflw xmm0, xmm0, 0 - pshuflw xmm1, xmm1, 0 - pshuflw xmm2, xmm2, 0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - movdqa xmm3, xmm1 - pmullw xmm3, [pw_76543210 GLOBAL] - psllw xmm1, 3 - paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} - paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} - - mov eax, 16 -ALIGN 4 -.loop: - movdqa xmm3, xmm0 - movdqa xmm4, xmm1 - psraw xmm3, 5 - psraw xmm4, 5 - packuswb xmm3, xmm4 - movdqa [edx], xmm3 - - paddsw xmm0, xmm2 - paddsw xmm1, xmm2 - add edx, FDEC_STRIDE - dec eax - jg .loop - - nop - ret - -;----------------------------------------------------------------------------- -; void predict_16x16_v_mmx( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_v_mmx - mov edx, [esp + 4] - movq mm0, [edx - FDEC_STRIDE] - movq mm1, [edx + 8 - FDEC_STRIDE] - STORE16x16 mm0, mm1 - ret - -;----------------------------------------------------------------------------- -; void predict_16x16_v_sse2( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_v_sse2 - mov edx, [esp + 4] - movdqa xmm0, [edx - FDEC_STRIDE] - STORE16x16_SSE2 xmm0 - ret - -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 2 - mov edx, [esp+4] - pxor mm0, mm0 - pxor mm1, mm1 - psadbw mm0, [edx - FDEC_STRIDE] - psadbw mm1, [edx - FDEC_STRIDE + 8] - paddusw mm0, mm1 - paddusw mm0, %1 - psrlw mm0, %2 ; dc - pshufw mm0, mm0, 0 - packuswb mm0, mm0 ; dc in bytes - STORE16x16 mm0, mm0 -%endmacro - -cglobal predict_16x16_dc_core_mmxext - PRED16x16_DC [esp+8], 5 - ret - -cglobal predict_16x16_dc_top_mmxext - picgetgot ecx - PRED16x16_DC [pw_8 GLOBAL], 4 - ret - -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC_SSE2 2 - mov edx, [esp+4] - pxor xmm0, xmm0 - psadbw xmm0, [edx - FDEC_STRIDE] - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - paddusw xmm0, %1 - psrlw xmm0, %2 ; dc - pshuflw xmm0, xmm0, 0 - punpcklqdq xmm0, xmm0 - packuswb xmm0, xmm0 ; dc in bytes - STORE16x16_SSE2 xmm0 -%endmacro - -cglobal predict_16x16_dc_core_sse2 - movd xmm2, [esp+8] - PRED16x16_DC_SSE2 xmm2, 5 - ret - -cglobal predict_16x16_dc_top_sse2 - picgetgot ecx - PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_ddr_sse2 - mov edx, [esp + 8] - mov eax, [esp + 4] - picgetgot ecx - movdqu xmm3, [edx + 8] - movdqu xmm1, [edx + 7] - movdqa xmm2, xmm3 - psrldq xmm2, 1 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, 
xmm3, xmm4 - movdqa xmm1, xmm0 - psrldq xmm1, 1 -%assign Y 7 -%rep 3 - movq [eax + Y * FDEC_STRIDE], xmm0 - movq [eax + (Y-1) * FDEC_STRIDE], xmm1 - psrldq xmm0, 2 - psrldq xmm1, 2 -%assign Y (Y-2) -%endrep - movq [eax + 1 * FDEC_STRIDE], xmm0 - movq [eax + 0 * FDEC_STRIDE], xmm1 - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl_sse2 - mov edx, [esp + 8] - mov eax, [esp + 4] - picgetgot ecx - movdqa xmm3, [edx + 16] - movdqu xmm2, [edx + 17] - movdqa xmm1, xmm3 - pslldq xmm1, 1 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 -%assign Y 0 -%rep 8 - psrldq xmm0, 1 - movq [eax + Y * FDEC_STRIDE], xmm0 -%assign Y (Y+1) -%endrep - ret - -;----------------------------------------------------------------------------- -; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge ) -;----------------------------------------------------------------------------- -cglobal predict_8x8_vl_sse2 - mov edx, [esp + 8] - mov eax, [esp + 4] - picgetgot ecx - movdqa xmm4, [edx + 16] - movdqa xmm2, xmm4 - movdqa xmm1, xmm4 - movdqa xmm3, xmm4 - psrldq xmm2, 1 - pslldq xmm1, 1 - pavgb xmm3, xmm2 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5 -; xmm0: (t0 + 2*t1 + t2 + 2) >> 2 -; xmm3: (t0 + t1 + 1) >> 1 -%assign Y 0 -%rep 3 - psrldq xmm0, 1 - movq [eax + Y * FDEC_STRIDE], xmm3 - movq [eax + (Y+1) * FDEC_STRIDE], xmm0 - psrldq xmm3, 1 -%assign Y (Y+2) -%endrep - psrldq xmm0, 1 - movq [eax + Y * FDEC_STRIDE], xmm3 - movq [eax + (Y+1) * FDEC_STRIDE], xmm0 - ret diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm deleted file mode 100644 index 0c59361f..00000000 --- a/common/i386/quant-a.asm +++ /dev/null @@ -1,298 +0,0 @@ -;***************************************************************************** -;* quant-a.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005 x264 project -;* -;* Authors: Loren Merritt -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
-;***************************************************************************** - -BITS 32 - -%include "i386inc.asm" - -SECTION_RODATA -pd_1: times 2 dd 1 - -SECTION .text - -%macro QUANT_AC_START 0 - mov eax, [esp+ 4] ; dct - mov ecx, [esp+ 8] ; mf - mov edx, [esp+12] ; bias -%endmacro - -%macro MMX_QUANT_DC_START 0 - mov eax, [esp+ 4] ; dct - movd mm6, [esp+ 8] ; mf - movd mm7, [esp+12] ; bias - pshufw mm6, mm6, 0 - pshufw mm7, mm7, 0 -%endmacro - -%macro SSE2_QUANT_DC_START 0 - mov eax, [esp+ 4] ; dct - movd xmm6, [esp+ 8] ; mf - movd xmm7, [esp+12] ; bias - pshuflw xmm6, xmm6, 0 - pshuflw xmm7, xmm7, 0 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm7, xmm7 -%endmacro - -%macro QUANT_ONE 5 -;;; %1 (m64) dct[y][x] -;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) -;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) - - mov%1 %2m0, %3 ; load dct coeffs - pxor %2m1, %2m1 - pcmpgtw %2m1, %2m0 ; sign(coeff) - pxor %2m0, %2m1 - psubw %2m0, %2m1 ; abs(coeff) - paddusw %2m0, %5 ; round - pmulhuw %2m0, %4 ; divide - pxor %2m0, %2m1 ; restore sign - psubw %2m0, %2m1 - mov%1 %3, %2m0 ; store -%endmacro -%macro MMX_QUANT_1x4 3 - QUANT_ONE q, m, %1, %2, %3 -%endmacro -%macro SSE2_QUANT_1x8 3 - QUANT_ONE dqa, xm, %1, %2, %3 -%endmacro - -%macro SSSE3_QUANT_1x8 3 - movdqa xmm1, %1 ; load dct coeffs - pabsw xmm0, xmm1 - paddusw xmm0, %3 ; round - pmulhuw xmm0, %2 ; divide - psignw xmm0, xmm1 ; restore sign - movdqa %1, xmm0 ; store -%endmacro - -;----------------------------------------------------------------------------- -; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias ) -;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_mmxext - MMX_QUANT_DC_START - MMX_QUANT_1x4 [eax], mm6, mm7 - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias ) -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_mmxext - MMX_QUANT_DC_START -%assign x 0 -%rep 4 - MMX_QUANT_1x4 [eax+x], mm6, mm7 -%assign x (x+8) -%endrep - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_mmx - QUANT_AC_START -%assign x 0 -%rep 4 - MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x] -%assign x (x+8) -%endrep - ret - -;----------------------------------------------------------------------------- -; void x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) -;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_mmx - QUANT_AC_START -%assign x 0 -%rep 16 - MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x] -%assign x (x+8) -%endrep - ret - -%macro QUANT_SSE 1 -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias ) -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_%1 - SSE2_QUANT_DC_START -%assign x 0 -%rep 2 - QUANT_1x8 [eax+x], xmm6, xmm7 -%assign x (x+16) -%endrep - ret - -;----------------------------------------------------------------------------- -; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) -;----------------------------------------------------------------------------- 
-cglobal x264_quant_4x4_%1 - QUANT_AC_START -%assign x 0 -%rep 2 - QUANT_1x8 [eax+x], [ecx+x], [edx+x] -%assign x (x+16) -%endrep - ret - -;----------------------------------------------------------------------------- -; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) -;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_%1 - QUANT_AC_START -%assign x 0 -%rep 8 - QUANT_1x8 [eax+x], [ecx+x], [edx+x] -%assign x (x+16) -%endrep - ret -%endmacro - -%define QUANT_1x8 SSE2_QUANT_1x8 -QUANT_SSE sse2 -%ifdef HAVE_SSE3 -%define QUANT_1x8 SSSE3_QUANT_1x8 -QUANT_SSE ssse3 -%endif - - -;============================================================================= -; dequant -;============================================================================= - -%macro DEQUANT16_L_1x4 3 -;;; %1 dct[y][x] -;;; %2,%3 dequant_mf[i_mf][y][x] -;;; mm5 i_qbits - - movq mm1, %2 - movq mm2, %3 - movq mm0, %1 - packssdw mm1, mm2 - pmullw mm0, mm1 - psllw mm0, mm5 - movq %1, mm0 -%endmacro - -%macro DEQUANT16_R_1x4 3 -;;; %1 dct[y][x] -;;; %2,%3 dequant_mf[i_mf][y][x] -;;; mm5 -i_qbits -;;; mm6 f as words - - movq mm1, %2 - movq mm2, %3 - movq mm0, %1 - packssdw mm1, mm2 - pmullw mm0, mm1 - paddw mm0, mm6 - psraw mm0, mm5 - movq %1, mm0 -%endmacro - -%macro DEQUANT32_R_1x4 3 -;;; %1 dct[y][x] -;;; %2,%3 dequant_mf[i_mf][y][x] -;;; mm5 -i_qbits -;;; mm6 f as dwords -;;; mm7 0 - - movq mm0, %1 - movq mm1, mm0 - punpcklwd mm0, mm0 - punpckhwd mm1, mm1 - - movq mm2, mm0 - movq mm3, mm1 - pmulhw mm0, %2 - pmulhw mm1, %3 - pmullw mm2, %2 - pmullw mm3, %3 - pslld mm0, 16 - pslld mm1, 16 - paddd mm0, mm2 - paddd mm1, mm3 - - paddd mm0, mm6 - paddd mm1, mm6 - psrad mm0, mm5 - psrad mm1, mm5 - - packssdw mm0, mm1 - movq %1, mm0 -%endmacro - -;----------------------------------------------------------------------------- -; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) -;----------------------------------------------------------------------------- -%macro DEQUANT_WxH 3 -cglobal %1 - mov edx, [esp+12] ; i_qp - imul eax, edx, 0x2b - shr eax, 8 ; i_qbits = i_qp / 6 - lea ecx, [eax+eax*2] - sub edx, ecx - sub edx, ecx ; i_mf = i_qp % 6 - shl edx, %3+2 - add edx, [esp+8] ; dequant_mf[i_mf] - mov ecx, [esp+4] ; dct - - sub eax, %3 - jl .rshift32 ; negative qbits => rightshift - -.lshift: - movd mm5, eax - - mov eax, 8*(%2-1) -.loopl16 -%rep 2 - DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8] - sub eax, byte 8 -%endrep - jge .loopl16 - - nop - ret - -.rshift32: - neg eax - movd mm5, eax - picgetgot eax - movq mm6, [pd_1 GLOBAL] - pxor mm7, mm7 - pslld mm6, mm5 - psrld mm6, 1 - - mov eax, 8*(%2-1) -.loopr32 -%rep 2 - DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8] - sub eax, byte 8 -%endrep - jge .loopr32 - - nop - ret -%endmacro - -DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4 -DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6 diff --git a/common/mc.c b/common/mc.c index cc7032f0..1fd3aeca 100644 --- a/common/mc.c +++ b/common/mc.c @@ -25,7 +25,7 @@ #include "clip1.h" #ifdef HAVE_MMX -#include "i386/mc.h" +#include "x86/mc.h" #endif #ifdef ARCH_PPC #include "ppc/mc.h" @@ -376,8 +376,6 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) #ifdef HAVE_MMX x264_mc_init_mmx( cpu, pf ); - if( cpu&X264_CPU_MMXEXT ) - pf->mc_chroma = x264_mc_chroma_mmxext; #endif #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) @@ -390,9 +388,9 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) const int b_interlaced = 
h->sh.b_mbaff; const int stride = frame->i_stride[0] << b_interlaced; const int width = frame->i_width[0]; - int start = (mb_y*16 >> b_interlaced) - 8; + int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8 int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8; - int offs = start*stride - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8 + int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd int x, y; if( mb_y & b_interlaced ) diff --git a/common/pixel.c b/common/pixel.c index 046e01f3..91039411 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -25,7 +25,7 @@ #include "clip1.h" #ifdef HAVE_MMX -# include "i386/pixel.h" +# include "x86/pixel.h" #endif #ifdef ARCH_PPC # include "ppc/pixel.h" @@ -614,10 +614,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( ssd, _sse2 ); pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; - -#ifdef ARCH_X86_64 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; +#ifdef ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif } @@ -636,9 +635,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( satd_x3, _ssse3 ); INIT5( satd_x4, _ssse3 ); INIT_ADS( _ssse3 ); -#ifdef ARCH_X86_64 pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3; +#ifdef ARCH_X86_64 + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; #endif if( cpu&X264_CPU_CACHELINE_SPLIT ) { diff --git a/common/predict.c b/common/predict.c index 57f96215..1ffa5229 100644 --- a/common/predict.c +++ b/common/predict.c @@ -32,7 +32,7 @@ #undef HAVE_MMX /* not finished now */ #endif #ifdef HAVE_MMX -# include "i386/predict.h" +# include "x86/predict.h" #endif #ifdef ARCH_PPC # include "ppc/predict.h" diff --git a/common/quant.c b/common/quant.c index a4d853e5..ed1148cb 100644 --- a/common/quant.c +++ b/common/quant.c @@ -23,7 +23,7 @@ #include "common.h" #ifdef HAVE_MMX -#include "i386/quant.h" +#include "x86/quant.h" #endif #ifdef ARCH_PPC # include "ppc/quant.h" diff --git a/common/i386/cpu-a.asm b/common/x86/cpu-32.asm similarity index 68% rename from common/i386/cpu-a.asm rename to common/x86/cpu-32.asm index 235acd4a..9c5f7c8c 100644 --- a/common/i386/cpu-a.asm +++ b/common/x86/cpu-32.asm @@ -1,8 +1,7 @@ ;***************************************************************************** -;* cpu.asm: h264 encoder library +;* cpu-32.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* Copyright (C) 2003-2008 x264 project ;* ;* Authors: Laurent Aimar ;* @@ -21,22 +20,13 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -;============================================================================= -; Code -;============================================================================= +%include "x86inc.asm" SECTION .text ;----------------------------------------------------------------------------- -; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported +; int x264_cpu_cpuid_test( void ) +; return 0 if unsupported ;----------------------------------------------------------------------------- cglobal x264_cpu_cpuid_test pushfd @@ -44,7 +34,6 @@ cglobal x264_cpu_cpuid_test push ebp push esi push edi - pushfd pop eax mov ebx, eax @@ -54,7 +43,6 @@ cglobal x264_cpu_cpuid_test pushfd pop eax xor eax, ebx - pop edi pop esi pop ebp @@ -63,39 +51,23 @@ cglobal x264_cpu_cpuid_test ret ;----------------------------------------------------------------------------- -; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- -cglobal x264_cpu_cpuid - - push ebp - mov ebp, esp - push ebx - push esi - push edi - - mov eax, [ebp + 8] +cglobal x264_cpu_cpuid, 0,6 + mov eax, r0m cpuid - - mov esi, [ebp + 12] + mov esi, r1m mov [esi], eax - - mov esi, [ebp + 16] + mov esi, r2m mov [esi], ebx - - mov esi, [ebp + 20] + mov esi, r3m mov [esi], ecx - - mov esi, [ebp + 24] + mov esi, r4m mov [esi], edx - - pop edi - pop esi - pop ebx - pop ebp - ret + RET ;----------------------------------------------------------------------------- -; void __cdecl x264_emms( void ) +; void x264_emms( void ) ;----------------------------------------------------------------------------- cglobal x264_emms emms @@ -113,7 +85,6 @@ cglobal x264_stack_align mov edx, [ebp+12] mov [esp], edx call ecx - mov esp, ebp - pop ebp + leave ret diff --git a/common/amd64/cpu-a.asm b/common/x86/cpu-64.asm similarity index 58% rename from common/amd64/cpu-a.asm rename to common/x86/cpu-64.asm index 7137a4c6..b453abff 100644 --- a/common/amd64/cpu-a.asm +++ b/common/x86/cpu-64.asm @@ -1,8 +1,7 @@ ;***************************************************************************** -;* cpu.asm: h264 encoder library +;* cpu-64.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2003 x264 project -;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ +;* Copyright (C) 2003-2008 x264 project ;* ;* Authors: Laurent Aimar ;* @@ -21,64 +20,25 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" - -;============================================================================= -; Code -;============================================================================= +%include "x86inc.asm" SECTION .text ;----------------------------------------------------------------------------- -; int x264_cpu_cpuid_test( void ) return 0 if unsupported -;----------------------------------------------------------------------------- -cglobal x264_cpu_cpuid_test - firstpush rbx - pushreg rbx - push rbp - pushreg rbp - mov rbp, rsp - setframe rbp, 0 - endprolog - - pushfq - pop rax - mov ebx, eax - xor eax, 0x200000 - push rax - - popfq - pushfq - pop rax - xor eax, ebx - - lea rsp, [rbp] - pop rbp - pop rbx - ret - endfunc - -;----------------------------------------------------------------------------- -; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- cglobal x264_cpu_cpuid firstpush rbx pushreg rbx endprolog - + mov r10, parm4q mov r11, parm3q mov r9, parm2q %ifdef WIN64 mov r8, [rsp+40+8] -%endif - +%endif + mov eax, parm1d cpuid @@ -92,7 +52,7 @@ cglobal x264_cpu_cpuid endfunc ;----------------------------------------------------------------------------- -; void x264_emms( void ) +; void x264_emms( void ) ;----------------------------------------------------------------------------- cglobal x264_emms emms diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm new file mode 100644 index 00000000..b7de1780 --- /dev/null +++ b/common/x86/dct-32.asm @@ -0,0 +1,560 @@ +;***************************************************************************** +;* dct-32.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar (initial version) +;* Min Chen (converted to nasm) +;* Christian Heine (dct8/idct8 functions) +;* Loren Merritt (misc) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA + +pw_32: times 8 dw 32 + +SECTION .text + +%macro SUMSUB_BA 2 + paddw %1, %2 + paddw %2, %2 + psubw %2, %1 +%endmacro + +%macro SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +; input ABCD output ADTC +%macro TRANSPOSE4x4W 5 + SBUTTERFLY q, wd, %1, %2, %5 + SBUTTERFLY q, wd, %3, %4, %2 + SBUTTERFLY q, dq, %1, %3, %4 + SBUTTERFLY q, dq, %5, %2, %3 +%endmacro + +; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2) +%macro LOAD_DIFF_8P 7 + movq %1, %5 + movq %2, %1 + punpcklbw %1, %7 + punpckhbw %2, %7 + movq %3, %6 + movq %4, %3 + punpcklbw %3, %7 + punpckhbw %4, %7 + psubw %1, %3 + psubw %2, %4 +%endmacro + +%macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4 + movq %2, %3 + movq %1, %4 + SUMSUB_BA %1, %2 +%endmacro + +%macro STORE_DIFF_8P 4 + psraw %1, 6 + movq %3, %2 + punpcklbw %3, %4 + paddsw %1, %3 + packuswb %1, %1 + movq %2, %1 +%endmacro + + +;----------------------------------------------------------------------------- +; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 ); +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_sub_8x8_mmx: + pxor mm7, mm7 + %assign i 0 + %rep 8 + LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7 + movq [r0+i], mm0 + movq [r0+i+8], mm1 + add r1, FENC_STRIDE + add r2, FDEC_STRIDE + %assign i i+16 + %endrep + ret + +;----------------------------------------------------------------------------- +; void x264_ydct8_mmx( int16_t dest[8][8] ); +;----------------------------------------------------------------------------- +ALIGN 16 +x264_ydct8_mmx: + ;------------------------------------------------------------------------- + ; vertical dct ( compute 4 columns at a time -> 2 loops ) + ;------------------------------------------------------------------------- + %assign i 0 + %rep 2 + + LOADSUMSUB mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07 + LOADSUMSUB mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16 + LOADSUMSUB mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25 + LOADSUMSUB mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34 + + SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2 + SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3 + SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm1 = dst4 + + movq [r0+i+0*16], mm0 + movq [r0+i+4*16], mm4 + + movq mm0, mm1 ; a3 + psraw mm0, 1 ; a3>>1 + paddw mm0, mm2 ; a2 + (a3>>1) + psraw mm2, 1 ; a2>>1 + psubw mm2, mm1 ; (a2>>1) - a3 + + movq [r0+i+2*16], mm0 + movq [r0+i+6*16], mm2 + + movq mm0, mm6 + psraw mm0, 1 + paddw mm0, mm6 ; d25+(d25>>1) + movq mm1, mm3 + psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1)) + psubw mm1, mm0 + + movq mm0, mm5 + psraw mm0, 1 + paddw mm0, mm5 ; d16+(d16>>1) + movq mm2, mm3 + paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1)) + psubw mm2, mm0 + + movq mm0, mm3 + psraw mm0, 1 + paddw mm0, mm3 ; d07+(d07>>1) + paddw mm0, mm5 + paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1)) + + movq mm3, mm7 + psraw mm3, 1 + paddw mm3, mm7 ; d34+(d34>>1) + paddw mm3, mm5 + psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1)) + + movq mm7, mm3 + psraw mm7, 2 + paddw mm7, mm0 ; a4 + (a7>>2) + + movq mm6, mm2 + psraw mm6, 2 + paddw mm6, mm1 ; a5 + (a6>>2) + + psraw mm0, 2 + psraw mm1, 2 + psubw mm0, mm3 ; (a4>>2) - a7 + psubw mm2, mm1 ; a6 - (a5>>2) + + movq 
[r0+i+1*16], mm7 + movq [r0+i+3*16], mm6 + movq [r0+i+5*16], mm2 + movq [r0+i+7*16], mm0 + + %assign i i+8 + %endrep + ret + +;----------------------------------------------------------------------------- +; void x264_yidct8_mmx( int16_t dest[8][8] ); +;----------------------------------------------------------------------------- +ALIGN 16 +x264_yidct8_mmx: + ;------------------------------------------------------------------------- + ; vertical idct ( compute 4 columns at a time -> 2 loops ) + ;------------------------------------------------------------------------- + %assign i 0 + %rep 2 + + movq mm1, [r0+i+1*16] ; mm1 = d1 + movq mm3, [r0+i+3*16] ; mm3 = d3 + movq mm5, [r0+i+5*16] ; mm5 = d5 + movq mm7, [r0+i+7*16] ; mm7 = d7 + + movq mm4, mm7 + psraw mm4, 1 + movq mm0, mm5 + psubw mm0, mm7 + psubw mm0, mm4 + psubw mm0, mm3 ; mm0 = e1 + + movq mm6, mm3 + psraw mm6, 1 + movq mm2, mm7 + psubw mm2, mm6 + psubw mm2, mm3 + paddw mm2, mm1 ; mm2 = e3 + + movq mm4, mm5 + psraw mm4, 1 + paddw mm4, mm5 + paddw mm4, mm7 + psubw mm4, mm1 ; mm4 = e5 + + movq mm6, mm1 + psraw mm6, 1 + paddw mm6, mm1 + paddw mm6, mm5 + paddw mm6, mm3 ; mm6 = e7 + + movq mm1, mm0 + movq mm3, mm4 + movq mm5, mm2 + movq mm7, mm6 + psraw mm6, 2 + psraw mm3, 2 + psraw mm5, 2 + psraw mm0, 2 + paddw mm1, mm6 ; mm1 = f1 + paddw mm3, mm2 ; mm3 = f3 + psubw mm5, mm4 ; mm5 = f5 + psubw mm7, mm0 ; mm7 = f7 + + movq mm2, [r0+i+2*16] ; mm2 = d2 + movq mm6, [r0+i+6*16] ; mm6 = d6 + movq mm4, mm2 + movq mm0, mm6 + psraw mm4, 1 + psraw mm6, 1 + psubw mm4, mm0 ; mm4 = a4 + paddw mm6, mm2 ; mm6 = a6 + + movq mm2, [r0+i+0*16] ; mm2 = d0 + movq mm0, [r0+i+4*16] ; mm0 = d4 + SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2 + + SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6 + ; mm4 = f2, mm2 = f4 + + SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7 + ; mm5 = g1, mm4 = g6 + SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5 + ; mm1 = g3, mm0 = g4 + + movq [r0+i+0*16], mm7 + movq [r0+i+1*16], mm5 + movq [r0+i+2*16], mm3 + movq [r0+i+3*16], mm1 + movq [r0+i+4*16], mm0 + movq [r0+i+5*16], mm2 + movq [r0+i+6*16], mm4 + movq [r0+i+7*16], mm6 + + %assign i i+8 + %endrep + ret + +;----------------------------------------------------------------------------- +; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] ); +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_add_8x8_mmx: + pxor mm7, mm7 + %assign i 0 + %rep 8 + movq mm0, [r0] + movq mm2, [r1+i] + movq mm3, [r1+i+8] + movq mm1, mm0 + psraw mm2, 6 + psraw mm3, 6 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + packuswb mm0, mm1 + movq [r0], mm0 + add r0, FDEC_STRIDE + %assign i i+16 + %endrep + ret + +;----------------------------------------------------------------------------- +; void x264_transpose_8x8_mmx( int16_t src[8][8] ); +;----------------------------------------------------------------------------- +ALIGN 16 +x264_transpose_8x8_mmx: + movq mm0, [r0 ] + movq mm1, [r0+ 16] + movq mm2, [r0+ 32] + movq mm3, [r0+ 48] + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 + movq [r0 ], mm0 + movq [r0+ 16], mm3 + movq [r0+ 32], mm4 + movq [r0+ 48], mm2 + + movq mm0, [r0+ 72] + movq mm1, [r0+ 88] + movq mm2, [r0+104] + movq mm3, [r0+120] + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 + movq [r0+ 72], mm0 + movq [r0+ 88], mm3 + movq [r0+104], mm4 + movq [r0+120], mm2 + + movq mm0, [r0+ 8] + movq mm1, [r0+ 24] + movq mm2, [r0+ 40] + movq mm3, [r0+ 56] + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 + movq mm1, [r0+ 64] + movq mm5, 
[r0+ 80] + movq mm6, [r0+ 96] + movq mm7, [r0+112] + + movq [r0+ 64], mm0 + movq [r0+ 80], mm3 + movq [r0+ 96], mm4 + movq [r0+112], mm2 + TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4 + movq [r0+ 8], mm1 + movq [r0+ 24], mm7 + movq [r0+ 40], mm4 + movq [r0+ 56], mm6 + ret + +;----------------------------------------------------------------------------- +; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +cglobal x264_sub8x8_dct8_mmx, 3,3 + call x264_pixel_sub_8x8_mmx + call x264_ydct8_mmx + call x264_transpose_8x8_mmx + jmp x264_ydct8_mmx + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] ) +;----------------------------------------------------------------------------- +cglobal x264_add8x8_idct8_mmx, 0,1 + mov r0, r1m + add word [r0], 32 + call x264_yidct8_mmx + call x264_transpose_8x8_mmx + call x264_yidct8_mmx + mov r1, r0 + mov r0, r0m + jmp x264_pixel_add_8x8_mmx + +%macro IDCT8_1D 8 + movdqa %1, %3 + movdqa %5, %7 + psraw %3, 1 + psraw %7, 1 + psubw %3, %5 + paddw %7, %1 + movdqa %5, %2 + psraw %5, 1 + paddw %5, %2 + paddw %5, %4 + paddw %5, %6 + movdqa %1, %6 + psraw %1, 1 + paddw %1, %6 + paddw %1, %8 + psubw %1, %2 + psubw %2, %4 + psubw %6, %4 + paddw %2, %8 + psubw %6, %8 + psraw %4, 1 + psraw %8, 1 + psubw %2, %4 + psubw %6, %8 + movdqa %4, %5 + movdqa %8, %1 + psraw %4, 2 + psraw %8, 2 + paddw %4, %6 + paddw %8, %2 + psraw %6, 2 + psraw %2, 2 + psubw %5, %6 + psubw %2, %1 + movdqa %1, [eax+0x00] + movdqa %6, [eax+0x40] + SUMSUB_BA %6, %1 + SUMSUB_BA %7, %6 + SUMSUB_BA %3, %1 + SUMSUB_BA %5, %7 + SUMSUB_BA %2, %3 + SUMSUB_BA %8, %1 + SUMSUB_BA %4, %6 +%endmacro + +%macro TRANSPOSE8 9 + movdqa [%9], %8 + SBUTTERFLY dqa, wd, %1, %2, %8 + movdqa [%9+16], %8 + movdqa %8, [%9] + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, wd, %5, %6, %4 + SBUTTERFLY dqa, wd, %7, %8, %6 + SBUTTERFLY dqa, dq, %1, %3, %8 + movdqa [%9], %8 + movdqa %8, [16+%9] + SBUTTERFLY dqa, dq, %8, %2, %3 + SBUTTERFLY dqa, dq, %5, %7, %2 + SBUTTERFLY dqa, dq, %4, %6, %7 + SBUTTERFLY dqa, qdq, %1, %5, %6 + SBUTTERFLY dqa, qdq, %8, %4, %5 + movdqa [%9+16], %8 + movdqa %8, [%9] + SBUTTERFLY dqa, qdq, %8, %2, %4 + SBUTTERFLY dqa, qdq, %3, %7, %2 + movdqa %7, [%9+16] +%endmacro + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) +;----------------------------------------------------------------------------- +cglobal x264_add8x8_idct8_sse2 + mov ecx, [esp+4] + mov eax, [esp+8] + movdqa xmm1, [eax+0x10] + movdqa xmm2, [eax+0x20] + movdqa xmm3, [eax+0x30] + movdqa xmm5, [eax+0x50] + movdqa xmm6, [eax+0x60] + movdqa xmm7, [eax+0x70] + IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax + picgetgot edx + paddw xmm4, [pw_32 GLOBAL] + movdqa [eax+0x00], xmm4 + movdqa [eax+0x40], xmm2 + IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1 + movdqa [eax+0x60], xmm6 + movdqa [eax+0x70], xmm7 + pxor xmm7, xmm7 + STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7 + STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7 + STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7 + STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7 + STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7 + STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7 + movdqa xmm0, [eax+0x60] + movdqa xmm1, 
[eax+0x70] + STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7 + STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7 + ret + +;----------------------------------------------------------------------------- +; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +%macro SUB_NxN_DCT 4 +cglobal %1 + mov edx, [esp+12] + mov ecx, [esp+ 8] + mov eax, [esp+ 4] + add edx, %4 + add ecx, %4 + add eax, %3 + push edx + push ecx + push eax + call %2 + add dword [esp+0], %3 + add dword [esp+4], %4*FENC_STRIDE-%4 + add dword [esp+8], %4*FDEC_STRIDE-%4 + call %2 + add dword [esp+0], %3 + add dword [esp+4], %4 + add dword [esp+8], %4 + call %2 + add esp, 12 + jmp %2 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) +;----------------------------------------------------------------------------- +%macro ADD_NxN_IDCT 4 +cglobal %1 + mov ecx, [esp+8] + mov eax, [esp+4] + add ecx, %3 + add eax, %4 + push ecx + push eax + call %2 + add dword [esp+0], %4*FDEC_STRIDE-%4 + add dword [esp+4], %3 + call %2 + add dword [esp+0], %4 + add dword [esp+4], %3 + call %2 + add esp, 8 + jmp %2 +%endmacro + +SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8 +ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8 + +ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8 + +;----------------------------------------------------------------------------- +; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_zigzag_scan_4x4_field_mmx + mov edx, [esp+8] + mov ecx, [esp+4] + punpcklwd mm0, [edx] + punpckhwd mm1, [edx] + punpcklwd mm2, [edx+8] + punpckhwd mm3, [edx+8] + punpcklwd mm4, [edx+16] + punpckhwd mm5, [edx+16] + punpcklwd mm6, [edx+24] + punpckhwd mm7, [edx+24] + psrad mm0, 16 + psrad mm1, 16 + psrad mm2, 16 + psrad mm3, 16 + psrad mm4, 16 + psrad mm5, 16 + psrad mm6, 16 + psrad mm7, 16 + movq [ecx ], mm0 + movq [ecx+16], mm2 + movq [ecx+24], mm3 + movq [ecx+32], mm4 + movq [ecx+40], mm5 + movq [ecx+48], mm6 + movq [ecx+56], mm7 + movq [ecx+12], mm1 + movd [ecx+ 8], mm2 + ret diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm new file mode 100644 index 00000000..e3825204 --- /dev/null +++ b/common/x86/dct-64.asm @@ -0,0 +1,243 @@ +;***************************************************************************** +;* dct-64.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar (initial version) +;* Min Chen (converted to nasm) +;* Loren Merritt (dct8) +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pw_32: times 8 dw 32 + +SECTION .text + +%macro LOAD_DIFF_8P 5 + movq %1, %4 + punpcklbw %1, %3 + movq %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro SUMSUB_BA 2 + paddw %1, %2 + paddw %2, %2 + psubw %2, %1 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +;----------------------------------------------------------------------------- +; input ABCDEFGH output AFHDTECB +;----------------------------------------------------------------------------- +%macro TRANSPOSE8x8W 9 + SBUTTERFLY dqa, wd, %1, %2, %9 + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, wd, %5, %6, %4 + SBUTTERFLY dqa, wd, %7, %8, %6 + SBUTTERFLY dqa, dq, %1, %3, %8 + SBUTTERFLY dqa, dq, %9, %2, %3 + SBUTTERFLY dqa, dq, %5, %7, %2 + SBUTTERFLY dqa, dq, %4, %6, %7 + SBUTTERFLY dqa, qdq, %1, %5, %6 + SBUTTERFLY dqa, qdq, %9, %4, %5 + SBUTTERFLY dqa, qdq, %8, %2, %4 + SBUTTERFLY dqa, qdq, %3, %7, %2 +%endmacro + +%macro STORE_DIFF_8P 4 + psraw %1, 6 + movq %2, %4 + punpcklbw %2, %3 + paddsw %1, %2 + packuswb %1, %1 + movq %4, %1 +%endmacro + +SECTION .text + +; in: ABCDEFGH +; out: FBCGEDHI +%macro DCT8_1D 10 + SUMSUB_BA %8, %1 ; %8=s07, %1=d07 + SUMSUB_BA %7, %2 ; %7=s16, %2=d16 + SUMSUB_BA %6, %3 ; %6=s25, %3=d25 + SUMSUB_BA %5, %4 ; %5=s34, %4=d34 + + SUMSUB_BA %5, %8 ; %5=a0, %8=a2 + SUMSUB_BA %6, %7 ; %6=a1, %7=a3 + + movdqa %9, %1 + psraw %9, 1 + paddw %9, %1 + paddw %9, %2 + paddw %9, %3 ; %9=a4 + + movdqa %10, %4 + psraw %10, 1 + paddw %10, %4 + paddw %10, %2 + psubw %10, %3 ; %10=a7 + + SUMSUB_BA %4, %1 + psubw %1, %3 + psubw %4, %2 + psraw %3, 1 + psraw %2, 1 + psubw %1, %3 ; %1=a5 + psubw %4, %2 ; %4=a6 + + SUMSUB_BA %6, %5 ; %6=b0, %5=b4 + + movdqa %2, %10 + psraw %2, 2 + paddw %2, %9 ; %2=b1 + psraw %9, 2 + psubw %9, %10 ; %9=b7 + + movdqa %3, %7 + psraw %3, 1 + paddw %3, %8 ; %3=b2 + psraw %8, 1 + psubw %8, %7 ; %8=b6 + + movdqa %7, %4 + psraw %7, 2 + paddw %7, %1 ; %7=b3 + psraw %1, 2 + psubw %4, %1 ; %4=b5 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +cglobal x264_sub8x8_dct8_sse2 + pxor xmm9, xmm9 + LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE] + LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE] + LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE] + LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE] + LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE] + LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE] + LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE] + LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE] + + DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9 + TRANSPOSE8x8W xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0 + DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9 + + movdqa [parm1q+0x00], xmm4 + movdqa [parm1q+0x10], xmm3 + movdqa [parm1q+0x20], 
xmm8 + movdqa [parm1q+0x30], xmm2 + movdqa [parm1q+0x40], xmm0 + movdqa [parm1q+0x50], xmm6 + movdqa [parm1q+0x60], xmm1 + movdqa [parm1q+0x70], xmm7 + ret + + +; in: ABCDEFGH +; out: IBHDEACG +%macro IDCT8_1D 10 + SUMSUB_BA %5, %1 ; %5=a0, %1=a2 + movdqa %10, %3 + psraw %3, 1 + psubw %3, %7 ; %3=a4 + psraw %7, 1 + paddw %7, %10 ; %7=a6 + + movdqa %9, %2 + psraw %9, 1 + paddw %9, %2 + paddw %9, %4 + paddw %9, %6 ; %9=a7 + + movdqa %10, %6 + psraw %10, 1 + paddw %10, %6 + paddw %10, %8 + psubw %10, %2 ; %10=a5 + + psubw %2, %4 + psubw %6, %4 + paddw %2, %8 + psubw %6, %8 + psraw %4, 1 + psraw %8, 1 + psubw %2, %4 ; %2=a3 + psubw %6, %8 ; %6=a1 + + SUMSUB_BA %7, %5 ; %7=b0, %5=b6 + SUMSUB_BA %3, %1 ; %3=b2, %1=b4 + + movdqa %4, %9 + psraw %4, 2 + paddw %4, %6 ; %4=b1 + psraw %6, 2 + psubw %9, %6 ; %9=b7 + + movdqa %8, %10 + psraw %8, 2 + paddw %8, %2 ; %8=b3 + psraw %2, 2 + psubw %2, %10 ; %2=b5 + + SUMSUB_BA %9, %7 ; %9=c0, %7=c7 + SUMSUB_BA %2, %3 ; %2=c1, %3=c6 + SUMSUB_BA %8, %1 ; %8=c2, %1=c5 + SUMSUB_BA %4, %5 ; %4=c3, %5=c4 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) +;----------------------------------------------------------------------------- +cglobal x264_add8x8_idct8_sse2 + movdqa xmm0, [parm2q+0x00] + movdqa xmm1, [parm2q+0x10] + movdqa xmm2, [parm2q+0x20] + movdqa xmm3, [parm2q+0x30] + movdqa xmm4, [parm2q+0x40] + movdqa xmm5, [parm2q+0x50] + movdqa xmm6, [parm2q+0x60] + movdqa xmm7, [parm2q+0x70] + + IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8 + TRANSPOSE8x8W xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5 + paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end + IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2 + + pxor xmm15, xmm15 + STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE] + STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE] + STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE] + STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE] + STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE] + STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE] + STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE] + STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE] + ret + + diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm new file mode 100644 index 00000000..4d3efcb5 --- /dev/null +++ b/common/x86/dct-a.asm @@ -0,0 +1,295 @@ +;***************************************************************************** +;* dct-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar +;* Min Chen +;* Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 +pw_32: times 8 dw 32 + +SECTION .text + +%macro LOAD_DIFF_4P 5 + movd %1, %4 + punpcklbw %1, %3 + movd %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endmacro + +%macro SUMSUB_BA 2 + paddw %1, %2 + paddw %2, %2 + psubw %2, %1 +%endmacro + +%macro SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro SUMSUB2_AB 3 + movq %3, %1 + paddw %1, %1 + paddw %1, %2 + psubw %3, %2 + psubw %3, %2 +%endmacro + +%macro SUMSUBD2_AB 4 + movq %4, %1 + movq %3, %2 + psraw %2, 1 + psraw %4, 1 + paddw %1, %2 + psubw %4, %3 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +;----------------------------------------------------------------------------- +; input ABCD output ADTC +;----------------------------------------------------------------------------- +%macro TRANSPOSE4x4W 5 + SBUTTERFLY q, wd, %1, %2, %5 + SBUTTERFLY q, wd, %3, %4, %2 + SBUTTERFLY q, dq, %1, %3, %4 + SBUTTERFLY q, dq, %5, %2, %3 +%endmacro + +%macro STORE_DIFF_4P 5 + paddw %1, %3 + psraw %1, 6 + movd %2, %5 + punpcklbw %2, %4 + paddsw %1, %2 + packuswb %1, %1 + movd %5, %1 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_dct4x4dc_mmx( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_dct4x4dc_mmx, 1,1,1 + movq mm0, [r0+ 0] + movq mm1, [r0+ 8] + movq mm2, [r0+16] + movq mm3, [r0+24] + + SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + movq mm6, [pw_1 GLOBAL] + paddw mm0, mm6 + paddw mm2, mm6 + psraw mm0, 1 + movq [r0+ 0], mm0 + psraw mm2, 1 + movq [r0+ 8], mm2 + paddw mm3, mm6 + paddw mm4, mm6 + psraw mm3, 1 + movq [r0+16], mm3 + psraw mm4, 1 + movq [r0+24], mm4 + RET + +;----------------------------------------------------------------------------- +; void x264_idct4x4dc_mmx( int16_t d[4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_idct4x4dc_mmx, 1,1 + movq mm0, [r0+ 0] + movq mm1, [r0+ 8] + movq mm2, [r0+16] + movq mm3, [r0+24] + + SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 + SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 + + TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + + SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 + SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 + + movq [r0+ 0], mm0 + movq [r0+ 8], mm2 + movq [r0+16], mm3 + movq [r0+24], mm4 + RET + +;----------------------------------------------------------------------------- +; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +cglobal x264_sub4x4_dct_mmx, 3,3 +.skip_prologue: + pxor mm7, mm7 + + ; Load 4 lines + LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] + LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] + LOAD_DIFF_4P 
mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] + LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] + + SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 + + SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 + SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 + + ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 + TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1 + + SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 + + SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 + SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 + + movq [r0+ 0], mm1 + movq [r0+ 8], mm2 + movq [r0+16], mm3 + movq [r0+24], mm0 + RET + +;----------------------------------------------------------------------------- +; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_add4x4_idct_mmx, 2,2,1 +.skip_prologue: + ; Load dct coeffs + movq mm0, [r1+ 0] ; dct + movq mm1, [r1+ 8] + movq mm2, [r1+16] + movq mm3, [r1+24] + + SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 + SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 + + ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 + TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3 + + SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 + SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) + + SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 + + pxor mm7, mm7 + movq mm6, [pw_32 GLOBAL] + + STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE] + STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE] + STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE] + STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE] + + RET + + + +;----------------------------------------------------------------------------- +; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- +%macro SUB_NxN_DCT 6 +cglobal %1, 3,3 +.skip_prologue: + call %2 + add r0, %3 + add r1, %4-%5*FENC_STRIDE + add r2, %4-%5*FDEC_STRIDE + call %2 + add r0, %3 + add r1, %4*FENC_STRIDE-%6 + add r2, %4*FDEC_STRIDE-%6 + call %2 + add r0, %3 + add r1, %4-%5*FENC_STRIDE + add r2, %4-%5*FDEC_STRIDE + jmp %2 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) +;----------------------------------------------------------------------------- +%macro ADD_NxN_IDCT 6 +cglobal %1, 2,2,1 +.skip_prologue: + call %2 + add r0, %4-%5*FDEC_STRIDE + add r1, %3 + call %2 + add r0, %4*FDEC_STRIDE-%6 + add r1, %3 + call %2 + add r0, %4-%5*FDEC_STRIDE + add r1, %3 + jmp %2 +%endmacro + +SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 4 +ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 4 + +SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 4, 4, 12 +ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 4, 4, 12 + +%ifdef ARCH_X86_64 +cextern x264_sub8x8_dct8_sse2 +cextern x264_add8x8_idct8_sse2 +SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8 +ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8 +%endif + + +;----------------------------------------------------------------------------- +; void 
x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] ) +;----------------------------------------------------------------------------- +cglobal x264_zigzag_scan_4x4_field_sse2, 2,2 + punpcklwd xmm0, [r1] + punpckhwd xmm1, [r1] + punpcklwd xmm2, [r1+16] + punpckhwd xmm3, [r1+16] + psrad xmm0, 16 + psrad xmm1, 16 + psrad xmm2, 16 + psrad xmm3, 16 + movq [r0 ], xmm0 + movdqa [r0+16], xmm1 + movdqa [r0+32], xmm2 + movhlps xmm0, xmm0 + movdqa [r0+48], xmm3 + movq [r0+12], xmm0 + movd [r0+ 8], xmm1 + RET + diff --git a/common/i386/dct.h b/common/x86/dct.h similarity index 97% rename from common/i386/dct.h rename to common/x86/dct.h index 3ca9b82c..5e93a99f 100644 --- a/common/i386/dct.h +++ b/common/x86/dct.h @@ -2,7 +2,6 @@ * dct.h: h264 encoder library ***************************************************************************** * Copyright (C) 2003 Laurent Aimar - * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ * * Authors: Laurent Aimar * diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm new file mode 100644 index 00000000..8071678b --- /dev/null +++ b/common/x86/deblock-a.asm @@ -0,0 +1,620 @@ +;***************************************************************************** +;* deblock-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005-2008 x264 project +;* +;* Authors: Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pb_01: times 16 db 0x01 +pb_03: times 16 db 0x03 +pb_a1: times 16 db 0xa1 + +SECTION .text + +%macro INIT_MMX 0 + %undef movq + %define m0 mm0 + %define m1 mm1 + %define m2 mm2 + %define m3 mm3 + %define m4 mm4 + %define m5 mm5 + %define m6 mm6 + %define m7 mm7 + %undef m8 + %undef m9 +%endmacro + +%macro INIT_XMM 0 + %define movq movdqa + %define m0 xmm0 + %define m1 xmm1 + %define m2 xmm2 + %define m3 xmm3 + %define m4 xmm4 + %define m5 xmm5 + %define m6 xmm6 + %define m7 xmm7 + %define m8 xmm8 + %define m9 xmm9 +%endmacro + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base+stride], [base+stride*2], [base3], \ + [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 bytes in %1..%8 +; out: 4 rows of 8 bytes in m0..m3 +%macro TRANSPOSE4x8_LOAD 8 + movd m0, %1 + movd m2, %2 + movd m1, %3 + movd m3, %4 + punpcklbw m0, m2 + punpcklbw m1, m3 + movq m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + + movd m4, %5 + movd m6, %6 + movd m5, %7 + movd m7, %8 + punpcklbw m4, m6 + punpcklbw m5, m7 + movq m6, m4 + punpcklwd m4, m5 + punpckhwd m6, m5 + + movq m1, m0 + movq m3, m2 + punpckldq m0, m4 + punpckhdq m1, m4 + punpckldq m2, m6 + punpckhdq m3, m6 +%endmacro + +; in: 4 rows of 8 bytes in m0..m3 +; out: 8 rows of 4 bytes in %1..%8 +%macro TRANSPOSE8x4_STORE 8 + movq m4, m0 + movq m5, m1 + movq m6, m2 + punpckhdq m4, m4 + punpckhdq m5, m5 + punpckhdq m6, m6 + + punpcklbw m0, m1 + punpcklbw m2, m3 + movq m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + movd %1, m0 + punpckhdq m0, m0 + movd %2, m0 + movd %3, m1 + punpckhdq m1, m1 + movd %4, m1 + + punpckhdq m3, m3 + punpcklbw m4, m5 + punpcklbw m6, m3 + movq m5, m4 + punpcklwd m4, m6 + punpckhwd m5, m6 + movd %5, m4 + punpckhdq m4, m4 + movd %6, m4 + movd %7, m5 + punpckhdq m5, m5 + movd %8, m5 +%endmacro + +%macro SBUTTERFLY 4 + movq %4, %2 + punpckl%1 %2, %3 + punpckh%1 %4, %3 +%endmacro + +; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 +; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] +%macro TRANSPOSE6x8_MEM 9 + movq m0, %1 + movq m1, %3 + movq m2, %5 + movq m3, %7 + SBUTTERFLY bw, m0, %2, m4 + SBUTTERFLY bw, m1, %4, m5 + SBUTTERFLY bw, m2, %6, m6 + movq [%9+0x10], m5 + SBUTTERFLY bw, m3, %8, m7 + SBUTTERFLY wd, m0, m1, m5 + SBUTTERFLY wd, m2, m3, m1 + punpckhdq m0, m2 + movq [%9+0x00], m0 + SBUTTERFLY wd, m4, [%9+0x10], m3 + SBUTTERFLY wd, m6, m7, m2 + SBUTTERFLY dq, m4, m6, m0 + SBUTTERFLY dq, m5, m1, m7 + punpckldq m3, m2 + movq [%9+0x10], m5 + movq [%9+0x20], m7 + movq [%9+0x30], m4 + movq [%9+0x40], m0 + movq [%9+0x50], m3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT 5 + movq %5, %2 + movq %4, %1 + psubusb %5, %1 + psubusb %4, %2 + por %4, %5 + psubusb %4, %3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT2 5 + movq %5, %2 + movq %4, %1 + psubusb %5, %1 + psubusb %4, %2 + psubusb %5, %3 + psubusb %4, %3 + pcmpeqb %4, %5 +%endmacro + +%macro SPLATW 1 +%ifidn m0, xmm0 + pshuflw %1, %1, 0 + punpcklqdq %1, %1 +%else + pshufw %1, %1, 0 +%endif +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 +; out: m5=beta-1, m7=mask +; clobbers: m4,m6 +%macro LOAD_MASK 2 + movd m4, %1 + movd m5, %2 + SPLATW m4 + SPLATW m5 + packuswb m4, m4 ; 16x alpha-1 + packuswb m5, m5 ; 16x beta-1 + DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 + DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 + por m7, m4 + DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 + por m7, m4 + pxor m6, m6 + pcmpeqb m7, m6 +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) +; out: m1=p0' m2=q0' +; clobbers: m0,3-6 +%macro DEBLOCK_P0_Q0 0 + movq m5, m1 + pxor m5, m2 ; p0^q0 + pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 + pcmpeqb m4, m4 + pxor m3, m4 + pavgb m3, m0 ; (p1 - q1 + 256)>>1 + pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pxor m4, m1 + pavgb m4, m2 ; (q0 - p0 + 256)>>1 + pavgb m3, m5 + paddusb m3, m4 ; d+128+33 + movq m6, [pb_a1 GLOBAL] + psubusb m6, m3 + psubusb m3, [pb_a1 GLOBAL] + pminub m6, m7 + pminub m3, m7 + psubusb m1, m6 + psubusb m2, m3 + paddusb m1, m3 + paddusb m2, m6 +%endmacro + +; in: m1=p0 m2=q0 +; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp +; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +; clobbers: q2, tmp, tc0 +%macro LUMA_Q1 6 + movq %6, m1 + pavgb %6, m2 + pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pxor %6, %3 + pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + movq %6, %1 + psubusb %6, %5 + paddusb %5, %1 + pmaxub %2, %6 + pminub %2, %5 + movq %4, %2 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_64 +INIT_XMM +cglobal x264_deblock_v_luma_sse2 + movd m8, [r4] ; tc0 + lea r4, [r1*3] + dec r2d ; alpha-1 + neg r4 + dec r3d ; beta-1 + add r4, r0 ; pix-3*stride + + movdqa m0, [r4+r1] ; p1 + movdqa m1, [r4+2*r1] ; p0 + movdqa m2, [r0] ; q0 + movdqa m3, [r0+r1] ; q1 + LOAD_MASK r2d, r3d + + punpcklbw m8, m8 + punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + pcmpeqb m9, m9 + pcmpeqb m9, m8 + pandn m9, m7 + pand m8, m9 + + movdqa m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m9 + movdqa m7, m8 + psubb m7, m6 + pand m6, m8 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + movdqa m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + pand m6, m9 + pand m8, m6 + psubb m7, m6 + 
movdqa m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 + + DEBLOCK_P0_Q0 + movdqa [r4+2*r1], m1 + movdqa [r0], m2 + ret + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_sse2 + movsxd r10, esi + lea r11, [r10+r10*2] + lea rax, [r0-4] + lea r9, [r0-4+r11] + sub rsp, 0x68 + %define pix_tmp rsp + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp + lea rax, [rax+r10*8] + lea r9, [r9 +r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 + + ; vertical filter + ; alpha, beta, tc0 are still in r2d, r3d, r4 + ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + lea r0, [pix_tmp+0x30] + mov esi, 0x10 + call x264_deblock_v_luma_sse2 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + add rax, 2 + add r9, 2 + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + shl r10, 3 + sub rax, r10 + sub r9, r10 + shr r10, 3 + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + add rsp, 0x68 + ret + +%else + +%macro DEBLOCK_LUMA 3 +;----------------------------------------------------------------------------- +; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_%2_luma_%1, 5,5,1 + lea r4, [r1*3] + dec r2 ; alpha-1 + neg r4 + dec r3 ; beta-1 + add r4, r0 ; pix-3*stride + + movq m0, [r4+r1] ; p1 + movq m1, [r4+2*r1] ; p0 + movq m2, [r0] ; q0 + movq m3, [r0+r1] ; q1 + LOAD_MASK r2, r3 + + mov r3, r4m +%if %3 == 16 + mov r2, esp + and esp, -16 + sub esp, 32 +%else + sub esp, 16 +%endif + + movd m4, [r3] ; tc0 + punpcklbw m4, m4 + punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + movq [esp+%3], m4 ; tc + pcmpeqb m3, m3 + pcmpgtb m4, m3 + pand m4, m7 + movq [esp], m4 ; mask + + movq m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m4 + pand m4, [esp+%3] ; tc + movq m7, m4 + psubb m7, m6 + pand m6, m4 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + movq m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + movq m5, [esp] ; mask + pand m6, m5 + movq m5, [esp+%3] ; tc + pand m5, m6 + psubb m7, m6 + movq m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 + + DEBLOCK_P0_Q0 + movq [r4+2*r1], m1 + movq [r0], m2 + +%if %3 == 16 + mov esp, r2 +%else + add esp, 16 +%endif + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_%1, 0,6 + mov r0, r0m + mov r3, r1m + lea r4, [r3*3] + sub r0, 4 + lea r1, [r0+r4] + SUB esp, 0x6c + lea r5, [esp+12] + and r5, -16 +%define pix_tmp r5 + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 + + ; vertical filter + lea r0, [pix_tmp+0x30] + 
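+    ; the vertical filter is reused on the transposed rows: its arguments
+    ; are pushed cdecl-style (pix = pix_tmp+0x30, stride = 16, and
+    ; alpha/beta/tc0 forwarded from our own stack args).  The 8-pixel-wide
+    ; v8 variant is simply called twice, stepping pix and tc0 between calls.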
PUSH dword r4m + PUSH dword r3m + PUSH dword r2m + PUSH dword 16 + PUSH dword r0 + call x264_deblock_%2_luma_%1 +%ifidn %2, v8 + add dword [esp ], 8 ; pix_tmp+0x38 + add dword [esp+16], 2 ; tc0+2 + call x264_deblock_%2_luma_%1 +%endif + ADD esp, 20 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + mov r0, r0m + sub r0, 2 + lea r1, [r0+r4] + + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + ADD esp, 0x6c + RET +%endmacro ; DEBLOCK_LUMA + +INIT_MMX +DEBLOCK_LUMA mmxext, v8, 8 +INIT_XMM +DEBLOCK_LUMA sse2, v, 16 + +%endif ; ARCH + + + +INIT_MMX + +%macro CHROMA_V_START 0 + dec r2d ; alpha-1 + dec r3d ; beta-1 + mov t5, r0 + sub t5, r1 + sub t5, r1 +%endmacro + +%macro CHROMA_H_START 0 + dec r2d + dec r3d + sub r0, 2 + lea t6, [r1*3] + mov t5, r0 + add r0, t6 +%endmacro + +%define t5 r5 +%define t6 r6 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_mmxext, 5,6 + CHROMA_V_START + + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + picgetgot r4 + DEBLOCK_P0_Q0 + + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_mmxext, 5,7 +%ifdef ARCH_X86_64 + %define buf0 [rsp-16] + %define buf1 [rsp-8] +%else + %define buf0 r0m + %define buf1 r2m +%endif + CHROMA_H_START + + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + movq buf0, m0 + movq buf1, m3 + + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + picgetgot r4 + DEBLOCK_P0_Q0 + + movq m0, buf0 + movq m3, buf1 + TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + RET + + + +; in: %1=p0 %2=p1 %3=q1 +; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 +%macro CHROMA_INTRA_P0 3 + movq m4, %1 + pxor m4, %3 + pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1 + pavgb %1, %3 + psubusb %1, m4 + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) +%endmacro + +%macro CHROMA_INTRA_BODY 0 + LOAD_MASK r2d, r3d + movq m5, m1 + movq m6, m2 + CHROMA_INTRA_P0 m1, m0, m3 + CHROMA_INTRA_P0 m2, m3, m0 + psubb m1, m5 + psubb m2, m6 + pand m1, m7 + pand m2, m7 + paddb m1, m5 + paddb m2, m6 +%endmacro + +%define t5 r4 +%define t6 r5 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_intra_mmxext, 4,5,1 + CHROMA_V_START + + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + + CHROMA_INTRA_BODY + + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int 
stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_intra_mmxext, 4,6,1 + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + CHROMA_INTRA_BODY + TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + RET + diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm new file mode 100644 index 00000000..02959222 --- /dev/null +++ b/common/x86/mc-a.asm @@ -0,0 +1,637 @@ +;***************************************************************************** +;* mc-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Loren Merritt +;* Laurent Aimar +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA + +pw_4: times 4 dw 4 +pw_8: times 4 dw 8 +pw_32: times 4 dw 32 +pw_64: times 4 dw 64 + +SECTION .text + +;============================================================================= +; pixel avg +;============================================================================= + +;----------------------------------------------------------------------------- +; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride, +; uint8_t *src, int src_stride ); +;----------------------------------------------------------------------------- +%macro AVGH 2 +%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space +cglobal x264_pixel_avg_%1x%2_mmxext + mov eax, %2 + jmp x264_pixel_avg_w%1_mmxext +%assign function_align 16 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride, +; uint8_t *src, int src_stride, +; int height ); +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_64 + %define t0 r0 + %define t1 r1 + %define t2 r2 + %define t3 r3 + %macro AVG_START 1 + cglobal %1, 4,5 + .height_loop: + %endmacro +%else + %define t0 r1 + %define t1 r2 + %define t2 r3 + %define t3 r4 + %macro AVG_START 1 + cglobal %1, 0,5 + mov t0, r0m + mov t1, r1m + mov t2, r2m + mov t3, r3m + .height_loop: + %endmacro +%endif + +%macro AVG_END 0 + sub eax, 2 + lea t2, [t2+t3*2] + lea t0, [t0+t1*2] + jg .height_loop + REP_RET +%endmacro + +AVG_START x264_pixel_avg_w4_mmxext + movd mm0, [t2] + movd mm1, [t2+t3] + pavgb mm0, [t0] + pavgb mm1, [t0+t1] + movd [t0], mm0 + movd [t0+t1], mm1 +AVG_END + +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_START x264_pixel_avg_w8_mmxext + movq mm0, [t2] + movq mm1, [t2+t3] + pavgb mm0, [t0] + pavgb mm1, [t0+t1] + movq [t0], mm0 + movq [t0+t1], mm1 +AVG_END + +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_START x264_pixel_avg_w16_mmxext + 
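+    ; AVG_START/AVG_END wrap a two-row loop body: AVG_START declares the
+    ; function (loading the pointer/stride args into t0..t3 on x86_32) and
+    ; opens .height_loop, AVG_END advances both pointers by 2*stride and
+    ; keeps looping while the height preloaded into eax by the AVGH stubs
+    ; is still positive.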
movq mm0, [t2 ] + movq mm1, [t2+8] + movq mm2, [t2+t3 ] + movq mm3, [t2+t3+8] + pavgb mm0, [t0 ] + pavgb mm1, [t0+8] + pavgb mm2, [t0+t1 ] + pavgb mm3, [t0+t1+8] + movq [t0 ], mm0 + movq [t0+8], mm1 + movq [t0+t1 ], mm2 + movq [t0+t1+8], mm3 +AVG_END + +AVGH 16, 16 +AVGH 16, 8 + +AVG_START x264_pixel_avg_w16_sse2 + movdqu xmm0, [t2] + movdqu xmm1, [t2+t3] + pavgb xmm0, [t0] + pavgb xmm1, [t0+t1] + movdqa [t0], xmm0 + movdqa [t0+t1], xmm1 +AVG_END + + + +;============================================================================= +; pixel avg2 +;============================================================================= + +;----------------------------------------------------------------------------- +; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride, +; uint8_t *src1, int src_stride, +; uint8_t *src2, int height ); +;----------------------------------------------------------------------------- +%macro AVG2_W8 2 +cglobal x264_pixel_avg2_w%1_mmxext, 6,7 + sub r4, r2 + lea r6, [r4+r3] +.height_loop: + %2 mm0, [r2] + %2 mm1, [r2+r3] + pavgb mm0, [r2+r4] + pavgb mm1, [r2+r6] + %2 [r0], mm0 + %2 [r0+r1], mm1 + sub r5d, 2 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + jg .height_loop + REP_RET +%endmacro + +AVG2_W8 4, movd +AVG2_W8 8, movq + +%macro AVG2_W16 2 +cglobal x264_pixel_avg2_w%1_mmxext, 6,7 + sub r4, r2 + lea r6, [r4+r3] +.height_loop: + movq mm0, [r2] + %2 mm1, [r2+8] + movq mm2, [r2+r3] + %2 mm3, [r2+r3+8] + pavgb mm0, [r2+r4] + pavgb mm1, [r2+r4+8] + pavgb mm2, [r2+r6] + pavgb mm3, [r2+r6+8] + movq [r0], mm0 + %2 [r0+8], mm1 + movq [r0+r1], mm2 + %2 [r0+r1+8], mm3 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + REP_RET +%endmacro + +AVG2_W16 12, movd +AVG2_W16 16, movq + +cglobal x264_pixel_avg2_w20_mmxext, 6,7 + sub r4, r2 + lea r6, [r4+r3] +.height_loop: + movq mm0, [r2] + movq mm1, [r2+8] + movd mm2, [r2+16] + movq mm3, [r2+r3] + movq mm4, [r2+r3+8] + movd mm5, [r2+r3+16] + pavgb mm0, [r2+r4] + pavgb mm1, [r2+r4+8] + pavgb mm2, [r2+r4+16] + pavgb mm3, [r2+r6] + pavgb mm4, [r2+r6+8] + pavgb mm5, [r2+r6+16] + movq [r0], mm0 + movq [r0+8], mm1 + movd [r0+16], mm2 + movq [r0+r1], mm3 + movq [r0+r1+8], mm4 + movd [r0+r1+16], mm5 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + REP_RET + + + +;============================================================================= +; pixel copy +;============================================================================= + +%macro COPY4 3 + %1 mm0, [r2] + %1 mm1, [r2+r3] + %1 mm2, [r2+r3*2] + %1 mm3, [r2+%3] + %1 [r0], mm0 + %1 [r0+r1], mm1 + %1 [r0+r1*2], mm2 + %1 [r0+%2], mm3 +%endmacro + +;----------------------------------------------------------------------------- +; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, int i_height ) +;----------------------------------------------------------------------------- +cglobal x264_mc_copy_w4_mmx, 4,6 + cmp r4m, dword 4 + lea r5, [r3*3] + lea r4, [r1*3] + je .end + COPY4 movd, r4, r5 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] +.end: + COPY4 movd, r4, r5 + RET + +cglobal x264_mc_copy_w8_mmx, 5,7 + lea r6, [r3*3] + lea r5, [r1*3] +.height_loop: + COPY4 movq, r5, r6 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r4d, 4 + jg .height_loop + REP_RET + +cglobal x264_mc_copy_w16_mmx, 5,7 + lea r6, [r3*3] + lea r5, [r1*3] +.height_loop: + movq mm0, [r2] + movq mm1, [r2+8] + movq mm2, [r2+r3] + movq mm3, [r2+r3+8] + movq mm4, [r2+r3*2] + movq mm5, [r2+r3*2+8] + movq mm6, [r2+r6] + movq mm7, [r2+r6+8] + movq 
[r0], mm0 + movq [r0+8], mm1 + movq [r0+r1], mm2 + movq [r0+r1+8], mm3 + movq [r0+r1*2], mm4 + movq [r0+r1*2+8], mm5 + movq [r0+r5], mm6 + movq [r0+r5+8], mm7 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r4d, 4 + jg .height_loop + REP_RET + + + +;============================================================================= +; weighted prediction +;============================================================================= +; implicit bipred only: +; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 + +%macro BIWEIGHT_4P_MMX 2 + movd mm0, %1 + movd mm1, %2 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + pmullw mm0, mm4 + pmullw mm1, mm5 + paddw mm0, mm1 + paddw mm0, mm6 + psraw mm0, 6 + pmaxsw mm0, mm7 + packuswb mm0, mm0 + movd %1, mm0 +%endmacro + +%macro BIWEIGHT_START_MMX 1 +%ifidn r4m, r4d + movd mm4, r4m + pshufw mm4, mm4, 0 ; weight_dst +%else + pshufw mm4, r4m, 0 +%endif + picgetgot r4 + movq mm5, [pw_64 GLOBAL] + psubw mm5, mm4 ; weight_src + movq mm6, [pw_32 GLOBAL] ; rounding + pxor mm7, mm7 +%if %1 +%ifidn r5m, r5d + %define t0 r5d +%else + %define t0 r4d + mov r4d, r5m +%endif +%endif +.height_loop: +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_avg_weight_w16_mmxext, 4,5 + BIWEIGHT_START_MMX 1 + BIWEIGHT_4P_MMX [r0 ], [r2 ] + BIWEIGHT_4P_MMX [r0+ 4], [r2+ 4] + BIWEIGHT_4P_MMX [r0+ 8], [r2+ 8] + BIWEIGHT_4P_MMX [r0+12], [r2+12] + add r0, r1 + add r2, r3 + dec t0 + jg .height_loop + REP_RET + +;----------------------------------------------------------------------------- +; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_avg_weight_w8_mmxext, 4,5 + BIWEIGHT_START_MMX 1 + BIWEIGHT_4P_MMX [r0 ], [r2 ] + BIWEIGHT_4P_MMX [r0+4], [r2+4] + add r0, r1 + add r2, r3 + dec t0 + jg .height_loop + REP_RET + +;----------------------------------------------------------------------------- +; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4 + BIWEIGHT_START_MMX 0 + BIWEIGHT_4P_MMX [r0 ], [r2 ] + BIWEIGHT_4P_MMX [r0+r1 ], [r2+r3 ] + BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2] + add r0, r1 + add r2, r3 + BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2] + RET + + + +;============================================================================= +; prefetch +;============================================================================= +; FIXME assumes 64 byte cachelines + +;----------------------------------------------------------------------------- +; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, +; uint8_t *pix_uv, int stride_uv, int mb_x ) +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_64 +cglobal x264_prefetch_fenc_mmxext, 5,5 + mov eax, r4d + and eax, 3 + imul eax, r1d + lea r0, [r0+rax*4+64] + prefetcht0 [r0] + prefetcht0 [r0+r1] + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] + + and r4d, 6 + imul r4d, r3d + lea r2, [r2+r4+64] + prefetcht0 [r2] + prefetcht0 [r2+r3] + ret + +%else +cglobal x264_prefetch_fenc_mmxext + mov r2, [esp+20] + mov r1, [esp+8] + mov r0, [esp+4] + and r2, 3 + imul r2, r1 + lea r0, 
[r0+r2*4+64] + prefetcht0 [r0] + prefetcht0 [r0+r1] + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] + + mov r2, [esp+20] + mov r1, [esp+16] + mov r0, [esp+12] + and r2, 6 + imul r2, r1 + lea r0, [r0+r2+64] + prefetcht0 [r0] + prefetcht0 [r0+r1] + ret +%endif ; ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) +;----------------------------------------------------------------------------- +cglobal x264_prefetch_ref_mmxext, 3,3 + dec r2d + and r2d, r1d + lea r0, [r0+r2*8+64] + lea r2, [r1*3] + prefetcht0 [r0] + prefetcht0 [r0+r1] + prefetcht0 [r0+r1*2] + prefetcht0 [r0+r2] + lea r0, [r0+r1*4] + prefetcht0 [r0] + prefetcht0 [r0+r1] + prefetcht0 [r0+r1*2] + prefetcht0 [r0+r2] + ret + + + +;============================================================================= +; chroma MC +;============================================================================= + +;----------------------------------------------------------------------------- +; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, +; int dx, int dy, +; int i_width, int i_height ) +;----------------------------------------------------------------------------- +cglobal x264_mc_chroma_mmxext, 0,6,1 +%ifdef ARCH_X86_64 + %define t0 r10d +%else + %define t0 r1d +%endif + movifnidn r2d, r2m + movifnidn r3d, r3m + movifnidn r4d, r4m + movifnidn r5d, r5m + mov eax, r5d + mov t0, r4d + sar eax, 3 + sar t0, 3 + imul eax, r3d + pxor mm3, mm3 + add eax, t0 + movsxdifnidn rax, eax + add r2, rax ; src += (dx>>3) + (dy>>3) * src_stride + and r4d, 7 ; dx &= 7 + je .mc1d + and r5d, 7 ; dy &= 7 + je .mc1d + + movd mm0, r4d + movd mm1, r5d + pshufw mm5, mm0, 0 ; mm5 = dx + pshufw mm6, mm1, 0 ; mm6 = dy + + movq mm4, [pw_8 GLOBAL] + movq mm0, mm4 + psubw mm4, mm5 ; mm4 = 8-dx + psubw mm0, mm6 ; mm0 = 8-dy + + movq mm7, mm5 + pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB + pmullw mm7, mm6 ; mm7 = dx*dy = cD + pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC + pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA + + mov r4d, r7m +%ifdef ARCH_X86_64 + mov r10, r0 + mov r11, r2 +%else + mov r0, r0m + mov r1, r1m + mov r5, r2 +%endif + +ALIGN 4 +.height_loop + + movd mm1, [r2+r3] + movd mm0, [r2] + punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 + punpcklbw mm0, mm3 + pmullw mm1, mm6 ; 2nd line * cC + pmullw mm0, mm4 ; 1st line * cA + paddw mm0, mm1 ; mm0 <- result + + movd mm2, [r2+1] + movd mm1, [r2+r3+1] + punpcklbw mm2, mm3 + punpcklbw mm1, mm3 + + paddw mm0, [pw_32 GLOBAL] + + pmullw mm2, mm5 ; line * cB + pmullw mm1, mm7 ; line * cD + paddw mm0, mm2 + paddw mm0, mm1 + psrlw mm0, 6 + + packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4 + movd [r0], mm0 + + add r2, r3 + add r0, r1 ; i_dst_stride + dec r4d + jnz .height_loop + + sub dword r6m, 8 + jnz .finish ; width != 8 so assume 4 + +%ifdef ARCH_X86_64 + lea r0, [r10+4] ; dst + lea r2, [r11+4] ; src +%else + mov r0, r0m + lea r2, [r5+4] + add r0, 4 +%endif + mov r4d, r7m ; i_height + jmp .height_loop + +.finish + REP_RET + +ALIGN 4 +.mc1d + mov eax, r4d + or eax, r5d + and eax, 7 + cmp r4d, 0 + mov r5d, 1 + cmove r5, r3 ; pel_offset = dx ? 
1 : src_stride + movd mm6, eax + movq mm5, [pw_8 GLOBAL] + pshufw mm6, mm6, 0 + movq mm7, [pw_4 GLOBAL] + psubw mm5, mm6 + + cmp dword r6m, 8 + movifnidn r0d, r0m + movifnidn r1d, r1m + mov r4d, r7m + je .height_loop1_w8 + +ALIGN 4 +.height_loop1_w4 + movd mm0, [r2+r5] + movd mm1, [r2] + punpcklbw mm0, mm3 + punpcklbw mm1, mm3 + pmullw mm0, mm6 + pmullw mm1, mm5 + paddw mm0, mm7 + paddw mm0, mm1 + psrlw mm0, 3 + packuswb mm0, mm3 + movd [r0], mm0 + add r2, r3 + add r0, r1 + dec r4d + jnz .height_loop1_w4 + REP_RET + +ALIGN 4 +.height_loop1_w8 + movq mm0, [r2+r5] + movq mm1, [r2] + movq mm2, mm0 + movq mm4, mm1 + punpcklbw mm0, mm3 + punpcklbw mm1, mm3 + punpckhbw mm2, mm3 + punpckhbw mm4, mm3 + pmullw mm0, mm6 + pmullw mm1, mm5 + pmullw mm2, mm6 + pmullw mm4, mm5 + paddw mm0, mm7 + paddw mm2, mm7 + paddw mm0, mm1 + paddw mm2, mm4 + psrlw mm0, 3 + psrlw mm2, 3 + packuswb mm0, mm2 + movq [r0], mm0 + add r2, r3 + add r0, r1 + dec r4d + jnz .height_loop1_w8 + REP_RET + diff --git a/common/i386/mc-a2.asm b/common/x86/mc-a2.asm similarity index 62% rename from common/i386/mc-a2.asm rename to common/x86/mc-a2.asm index fb003997..a55859b1 100644 --- a/common/i386/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1,7 +1,10 @@ ;***************************************************************************** ;* mc-a2.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2005 x264 project +;* Copyright (C) 2005-2008 x264 project +;* +;* Authors: Loren Merritt +;* Mathieu Monnier ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -18,28 +21,15 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -BITS 32 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "i386inc.asm" - -;============================================================================= -; Read only data -;============================================================================= +%include "x86inc.asm" SECTION_RODATA -ALIGN 16 pw_1: times 4 dw 1 pw_16: times 4 dw 16 pw_32: times 4 dw 32 -;============================================================================= -; Macros -;============================================================================= +SECTION .text %macro LOAD_ADD 3 movd %1, %2 @@ -87,68 +77,78 @@ pw_32: times 4 dw 32 packuswb mm1, mm4 %endmacro - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - ;----------------------------------------------------------------------------- ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, ; int i_stride, int i_width, int i_height ); ;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_mmxext - push ebp - mov ebp, esp - push ebx - push esi - push edi - picgetgot ebx - - %define tdsth ebp + 8 - %define tdstv ebp + 12 - %define tdstc ebp + 16 - %define tsrc ebp + 20 - %define tstride ebp + 24 - %define twidth ebp + 28 - %define theight ebp + 32 - %define tpw_1 ebp - 36 - %define tpw_16 ebp - 28 - %define tpw_32 ebp - 20 - %define tbuffer esp + 8 - - %define x eax - %define dsth ebx - %define dstv ebx - %define dstc ebx - %define src ecx - %define src3 edx - %define stride esi - %define width edi - - mov stride, [tstride] - mov width, [twidth] - lea eax, [stride*2 + 24 + 24] - sub esp, eax +cglobal x264_hpel_filter_mmxext, 0,7 + %define x r0 + %define xd r0d + %define dsth r1 + %define dstv r1 + %define dstc r1 + %define src r2 + %define src3 r3 + %define stride r4 + %define width r5d + %define tbuffer rsp+8 + +%ifdef ARCH_X86_64 + PUSH rbp + PUSH r12 + PUSH r13 + PUSH r14 + %define tdsth r10 ; FIXME r8,9 + %define tdstv r11 + %define tdstc r12 + %define tsrc r13 + %define theight r14d + mov tdsth, r0 + mov tdstv, r1 + mov tdstc, r2 + mov tsrc, r3 + mov theight, r6m +%else + %define tdsth [rbp + 20] + %define tdstv [rbp + 24] + %define tdstc [rbp + 28] + %define tsrc [rbp + 32] + %define theight [rbp + 44] +%endif + + movifnidn r4d, r4m + movifnidn r5d, r5m + mov rbp, rsp + lea rax, [stride*2 + 24] + sub rsp, rax pxor mm0, mm0 - ; mov globals onto the stack, to free up ebx + %define tpw_1 [pw_1 GLOBAL] + %define tpw_16 [pw_16 GLOBAL] + %define tpw_32 [pw_32 GLOBAL] +%ifdef PIC32 + ; mov globals onto the stack, to free up PIC pointer + %define tpw_1 [ebp - 24] + %define tpw_16 [ebp - 16] + %define tpw_32 [ebp - 8] + picgetgot ebx + sub esp, 24 movq mm1, [pw_1 GLOBAL] movq mm2, [pw_16 GLOBAL] movq mm3, [pw_32 GLOBAL] - movq [tpw_1], mm1 - movq [tpw_16], mm2 - movq [tpw_32], mm3 + movq tpw_1, mm1 + movq tpw_16, mm2 + movq tpw_32, mm3 +%endif .loopy: - mov src, [tsrc] - mov dstv, [tdstv] + mov src, tsrc + mov dstv, tdstv lea src3, [src + stride] sub src, stride sub src, stride - xor x, x + xor xd, xd ALIGN 16 .vertical_filter: @@ -163,7 +163,7 @@ ALIGN 16 FILT_V - movq mm7, [tpw_16] + movq mm7, tpw_16 
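+    ; (the full-precision 16-bit vertical-filter results are kept in tbuffer
+    ; for the centre filter; the rounded, packed copy below is what goes to dstv)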
movq [tbuffer + x*2], mm1 movq [tbuffer + x*2 + 8], mm4 paddw mm1, mm7 @@ -173,19 +173,19 @@ ALIGN 16 packuswb mm1, mm4 movntq [dstv + x], mm1 - add x, 8 + add xd, 8 add src, 8 add src3, 8 - cmp x, width + cmp xd, width jle .vertical_filter pshufw mm2, [tbuffer], 0 movq [tbuffer - 8], mm2 ; pad left ; no need to pad right, since vertical_filter already did 4 extra pixels - mov dstc, [tdstc] - xor x, x - movq mm7, [tpw_32] + mov dstc, tdstc + xor xd, xd + movq mm7, tpw_32 .center_filter: movq mm1, [tbuffer + x*2 - 4 ] @@ -205,13 +205,13 @@ ALIGN 16 FILT_PACK 6 movntq [dstc + x], mm1 - add x, 8 - cmp x, width + add xd, 8 + cmp xd, width jl .center_filter - mov dsth, [tdsth] - mov src, [tsrc] - xor x, x + mov dsth, tdsth + mov src, tsrc + xor xd, xd .horizontal_filter: movd mm1, [src + x - 2] @@ -241,29 +241,30 @@ ALIGN 16 punpcklbw mm6, mm0 paddw mm6, mm7 ; a1 - movq mm7, [tpw_1] + movq mm7, tpw_1 FILT_H FILT_PACK 1 movntq [dsth + x], mm1 - add x, 8 - cmp x, width + add xd, 8 + cmp xd, width jl .horizontal_filter - add [tsrc], stride - add [tdsth], stride - add [tdstv], stride - add [tdstc], stride - dec dword [theight] + add tsrc, stride + add tdsth, stride + add tdstv, stride + add tdstc, stride + dec dword theight jg .loopy - lea esp, [ebp-12] - pop edi - pop esi - pop ebx - pop ebp - ret - + mov rsp, rbp +%ifdef ARCH_X86_64 + pop r14 + pop r13 + pop r12 + pop rbp +%endif + RET @@ -271,57 +272,67 @@ ALIGN 16 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, ; uint8_t *src, int i_src, int w, int h) ;----------------------------------------------------------------------------- -cglobal x264_plane_copy_mmxext - push edi - push esi - push ebx - mov edi, [esp+16] ; dst - mov ebx, [esp+20] ; i_dst - mov esi, [esp+24] ; src - mov eax, [esp+28] ; i_src - mov edx, [esp+32] ; w - add edx, 3 - and edx, ~3 - sub ebx, edx - sub eax, edx +cglobal x264_plane_copy_mmxext, 6,7 + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + add r4d, 3 + and r4d, ~3 + mov r6d, r4d + and r6d, ~15 + sub r1, r6 + sub r3, r6 .loopy: - mov ecx, edx - sub ecx, 64 + mov r6d, r4d + sub r6d, 64 jl .endx .loopx: - prefetchnta [esi+256] - movq mm0, [esi ] - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - movntq [edi ], mm0 - movntq [edi+ 8], mm1 - movntq [edi+16], mm2 - movntq [edi+24], mm3 - movntq [edi+32], mm4 - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add esi, 64 - add edi, 64 - sub ecx, 64 + prefetchnta [r2+256] + movq mm0, [r2 ] + movq mm1, [r2+ 8] + movq mm2, [r2+16] + movq mm3, [r2+24] + movq mm4, [r2+32] + movq mm5, [r2+40] + movq mm6, [r2+48] + movq mm7, [r2+56] + movntq [r0 ], mm0 + movntq [r0+ 8], mm1 + movntq [r0+16], mm2 + movntq [r0+24], mm3 + movntq [r0+32], mm4 + movntq [r0+40], mm5 + movntq [r0+48], mm6 + movntq [r0+56], mm7 + add r2, 64 + add r0, 64 + sub r6d, 64 jge .loopx .endx: - prefetchnta [esi+256] - add ecx, 64 - shr ecx, 2 - rep movsd - add edi, ebx - add esi, eax - sub dword [esp+36], 1 + prefetchnta [r2+256] + add r6d, 48 + jl .end16 +.loop16: + movq mm0, [r2 ] + movq mm1, [r2+8] + movntq [r0 ], mm0 + movntq [r0+8], mm1 + add r2, 16 + add r0, 16 + sub r6d, 16 + jge .loop16 +.end16: + add r6d, 12 + jl .end4 +.loop4: + movd mm2, [r2+r6] + movd [r0+r6], mm2 + sub r6d, 4 + jge .loop4 +.end4: + add r2, r3 + add r0, r1 + dec r5d jg .loopy - pop ebx - pop esi - pop edi emms - ret + RET diff --git a/common/i386/mc-c.c b/common/x86/mc-c.c similarity index 93% rename from 
common/i386/mc-c.c rename to common/x86/mc-c.c index 212a8b22..5ab5909c 100644 --- a/common/i386/mc-c.c +++ b/common/x86/mc-c.c @@ -1,10 +1,10 @@ /***************************************************************************** - * mc.c: h264 encoder library (Motion Compensation) + * mc-c.c: h264 encoder library (Motion Compensation) ***************************************************************************** - * Copyright (C) 2003 Laurent Aimar - * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $ + * Copyright (C) 2003-2008 x264 project * * Authors: Laurent Aimar + * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,18 +38,22 @@ extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ); -extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int ); extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); -extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); +extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ); +extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int ); extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_ref_mmxext( uint8_t *, int, int ); +extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int dx, int dy, int i_width, int i_height ); +extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int i_stride, int i_width, int i_height ); @@ -69,7 +73,7 @@ static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, NULL, x264_pixel_avg2_w4_mmxext, x264_pixel_avg2_w8_mmxext, - x264_pixel_avg2_w16_mmxext, + x264_pixel_avg2_w12_mmxext, x264_pixel_avg2_w16_mmxext, x264_pixel_avg2_w20_mmxext, }; @@ -146,6 +150,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->mc_luma = mc_luma_mmxext; pf->get_ref = get_ref_mmxext; + pf->mc_chroma = x264_mc_chroma_mmxext; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext; diff --git a/common/i386/mc.h b/common/x86/mc.h similarity index 96% rename from common/i386/mc.h rename to common/x86/mc.h index 40f23596..7006aeae 100644 --- a/common/i386/mc.h +++ b/common/x86/mc.h @@ -2,7 +2,6 @@ * mc.h: 
h264 encoder library ***************************************************************************** * Copyright (C) 2003 Laurent Aimar - * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ * * Authors: Laurent Aimar * diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm new file mode 100644 index 00000000..98071787 --- /dev/null +++ b/common/x86/pixel-32.asm @@ -0,0 +1,460 @@ +;***************************************************************************** +;* pixel-32.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar +;* Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +%macro SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc + SBUTTERFLY q, wd, %1, %2, %5 + SBUTTERFLY q, wd, %3, %4, %2 + SBUTTERFLY q, dq, %1, %3, %4 + SBUTTERFLY q, dq, %5, %2, %3 +%endmacro + +%macro ABS1 2 ; mma, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + +%macro ABS2 4 ; mma, mmb, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + psubw %3, %1 + psubw %4, %2 + pmaxsw %1, %3 + pmaxsw %2, %4 +%endmacro + +%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy + movd %1, [eax+ebx*%4+%3] + movd %2, [ecx+edx*%4+%3] + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_4x8P 1 ; dx + LOAD_DIFF_4P mm0, mm7, %1, 0 + LOAD_DIFF_4P mm1, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm2, mm7, %1, 0 + LOAD_DIFF_4P mm3, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm4, mm7, %1, 0 + LOAD_DIFF_4P mm5, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm6, mm7, %1, 0 + movq [spill], mm6 + LOAD_DIFF_4P mm7, mm6, %1, 1 + movq mm6, [spill] +%endmacro + +%macro HADAMARD8_1D 8 + SUMSUB_BADC %1, %5, %2, %6 + SUMSUB_BADC %3, %7, %4, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %5, %6, %7, %8 +%endmacro + +%macro SUM4x8_MM 0 + movq [spill], mm6 + movq [spill+8], mm7 + ABS2 mm0, mm1, mm6, mm7 + ABS2 mm2, mm3, mm6, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm6, [spill] + movq mm7, [spill+8] + ABS2 mm4, mm5, mm2, mm3 + ABS2 mm6, mm7, mm2, mm3 + paddw mm4, mm6 + paddw mm5, mm7 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm0, mm1 +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- 
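+; (SA8D = sum of absolute 8x8 Hadamard-transformed differences, scaled by
+; roughly 1/4.  The transform is done in two 8x4 halves since only eight mmx
+; registers are available; the pre-rounding sum is also left in ecx so the
+; 16x16 version can accumulate four blocks before rounding once.)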
+cglobal x264_pixel_sa8d_8x8_mmxext + push ebx + mov eax, [esp+ 8] ; pix1 + mov ebx, [esp+12] ; stride1 + mov ecx, [esp+16] ; pix2 + mov edx, [esp+20] ; stride2 + sub esp, 0x70 +%define args esp+0x74 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 + LOAD_DIFF_4x8P 0 + HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movq [spill], mm0 + TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0 + movq [trans+0x00], mm4 + movq [trans+0x08], mm7 + movq [trans+0x10], mm0 + movq [trans+0x18], mm6 + movq mm0, [spill] + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 + movq [trans+0x20], mm0 + movq [trans+0x28], mm3 + movq [trans+0x30], mm4 + movq [trans+0x38], mm2 + + mov eax, [args+4] + mov ecx, [args+12] + LOAD_DIFF_4x8P 4 + HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movq [spill], mm7 + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7 + movq [trans+0x40], mm0 + movq [trans+0x48], mm3 + movq [trans+0x50], mm7 + movq [trans+0x58], mm2 + movq mm7, [spill] + TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0 + movq mm5, [trans+0x00] + movq mm1, [trans+0x08] + movq mm2, [trans+0x10] + movq mm3, [trans+0x18] + + HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 + SUM4x8_MM + movq [trans], mm0 + + movq mm0, [trans+0x20] + movq mm1, [trans+0x28] + movq mm2, [trans+0x30] + movq mm3, [trans+0x38] + movq mm4, [trans+0x40] + movq mm5, [trans+0x48] + movq mm6, [trans+0x50] + movq mm7, [trans+0x58] + + HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + SUM4x8_MM + + pavgw mm0, [esp] + pshufw mm1, mm0, 01001110b + paddw mm0, mm1 + pshufw mm1, mm0, 10110001b + paddw mm0, mm1 + movd eax, mm0 + and eax, 0xffff + mov ecx, eax ; preserve rounding for 16x16 + add eax, 1 + shr eax, 1 + add esp, 0x70 + pop ebx + ret +%undef args +%undef spill +%undef trans + +%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op + pxor %7, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + paddusw %1, %4 + paddusw %2, %5 + paddusw %3, %6 + punpcklwd %1, %7 + punpcklwd %2, %7 + punpcklwd %3, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + %8 %1, %4 + %8 %2, %5 + %8 %3, %6 +%endmacro + +%macro LOAD_4x8P 1 ; dx + pxor mm7, mm7 + movd mm6, [eax+%1+7*FENC_STRIDE] + movd mm0, [eax+%1+0*FENC_STRIDE] + movd mm1, [eax+%1+1*FENC_STRIDE] + movd mm2, [eax+%1+2*FENC_STRIDE] + movd mm3, [eax+%1+3*FENC_STRIDE] + movd mm4, [eax+%1+4*FENC_STRIDE] + movd mm5, [eax+%1+5*FENC_STRIDE] + punpcklbw mm6, mm7 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + movq [spill], mm6 + punpcklbw mm2, mm7 + punpcklbw mm3, mm7 + movd mm6, [eax+%1+6*FENC_STRIDE] + punpcklbw mm4, mm7 + punpcklbw mm5, mm7 + punpcklbw mm6, mm7 + movq mm7, [spill] +%endmacro + +;----------------------------------------------------------------------------- +; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res ) +;----------------------------------------------------------------------------- +cglobal x264_intra_sa8d_x3_8x8_core_mmxext + mov eax, [esp+4] + mov ecx, [esp+8] + sub esp, 0x70 +%define args esp+0x74 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 +%define sum esp+0 ; +32 + LOAD_4x8P 0 + HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movq [spill], mm0 + TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0 + movq [trans+0x00], mm4 + movq [trans+0x08], mm7 + movq [trans+0x10], mm0 + movq [trans+0x18], mm6 + movq mm0, [spill] + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 + movq [trans+0x20], mm0 + movq [trans+0x28], mm3 + movq [trans+0x30], mm4 + movq [trans+0x38], mm2 + + LOAD_4x8P 4 + HADAMARD8_1D mm0, mm1, mm2, mm3, 
mm4, mm5, mm6, mm7 + + movq [spill], mm7 + TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7 + movq [trans+0x40], mm0 + movq [trans+0x48], mm3 + movq [trans+0x50], mm7 + movq [trans+0x58], mm2 + movq mm7, [spill] + TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0 + movq mm5, [trans+0x00] + movq mm1, [trans+0x08] + movq mm2, [trans+0x10] + movq mm3, [trans+0x18] + + HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 + + movq [spill+0], mm5 + movq [spill+8], mm7 + ABS2 mm0, mm1, mm5, mm7 + ABS2 mm2, mm3, mm5, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm1 + ABS2 mm4, mm6, mm2, mm3 + movq mm5, [spill+0] + movq mm7, [spill+8] + paddw mm0, mm4 + paddw mm0, mm6 + ABS1 mm7, mm1 + paddw mm0, mm7 ; 7x4 sum + movq mm6, mm5 + movq mm7, [ecx+8] ; left bottom + psllw mm7, 3 + psubw mm6, mm7 + ABS2 mm5, mm6, mm2, mm3 + paddw mm5, mm0 + paddw mm6, mm0 + movq [sum+0], mm5 ; dc + movq [sum+8], mm6 ; left + + movq mm0, [trans+0x20] + movq mm1, [trans+0x28] + movq mm2, [trans+0x30] + movq mm3, [trans+0x38] + movq mm4, [trans+0x40] + movq mm5, [trans+0x48] + movq mm6, [trans+0x50] + movq mm7, [trans+0x58] + + HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movd [sum+0x10], mm0 + movd [sum+0x12], mm1 + movd [sum+0x14], mm2 + movd [sum+0x16], mm3 + movd [sum+0x18], mm4 + movd [sum+0x1a], mm5 + movd [sum+0x1c], mm6 + movd [sum+0x1e], mm7 + + movq [spill], mm0 + movq [spill+8], mm1 + ABS2 mm2, mm3, mm0, mm1 + ABS2 mm4, mm5, mm0, mm1 + paddw mm2, mm3 + paddw mm4, mm5 + paddw mm2, mm4 + movq mm0, [spill] + movq mm1, [spill+8] + ABS2 mm6, mm7, mm4, mm5 + ABS1 mm1, mm4 + paddw mm2, mm7 + paddw mm1, mm6 + paddw mm2, mm1 ; 7x4 sum + movq mm1, mm0 + + movq mm7, [ecx+0] + psllw mm7, 3 ; left top + + movzx edx, word [ecx+0] + add dx, [ecx+16] + lea edx, [4*edx+32] + and edx, -64 + movd mm6, edx ; dc + + psubw mm1, mm7 + psubw mm0, mm6 + ABS2 mm0, mm1, mm5, mm6 + movq mm3, [sum+0] ; dc + paddw mm0, mm2 + paddw mm1, mm2 + movq mm2, mm0 + paddw mm0, mm3 + paddw mm1, [sum+8] ; h + psrlq mm2, 16 + paddw mm2, mm3 + + movq mm3, [ecx+16] ; top left + movq mm4, [ecx+24] ; top right + psllw mm3, 3 + psllw mm4, 3 + psubw mm3, [sum+16] + psubw mm4, [sum+24] + ABS2 mm3, mm4, mm5, mm6 + paddw mm2, mm3 + paddw mm2, mm4 ; v + + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + mov eax, [args+8] + movd ecx, mm2 + movd edx, mm1 + add ecx, 2 + add edx, 2 + shr ecx, 2 + shr edx, 2 + mov [eax+0], ecx ; i8x8_v satd + mov [eax+4], edx ; i8x8_h satd + movd ecx, mm0 + add ecx, 2 + shr ecx, 2 + mov [eax+8], ecx ; i8x8_dc satd + + add esp, 0x70 + ret +%undef args +%undef spill +%undef trans +%undef sum + + + +;----------------------------------------------------------------------------- +; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, +; const uint8_t *pix2, int stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_ssim_4x4x2_core_mmxext + push ebx + push edi + mov ebx, [esp+16] + mov edx, [esp+24] + mov edi, 4 + pxor mm0, mm0 +.loop + mov eax, [esp+12] + mov ecx, [esp+20] + add eax, edi + add ecx, edi + pxor mm1, mm1 + pxor mm2, mm2 + pxor mm3, mm3 + pxor mm4, mm4 +%rep 4 + movd mm5, [eax] + movd mm6, [ecx] + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + paddw mm1, mm5 + paddw mm2, mm6 + movq mm7, mm5 + pmaddwd mm5, mm5 + pmaddwd mm7, mm6 + pmaddwd mm6, mm6 + paddd mm3, mm5 + paddd mm4, mm7 + paddd mm3, mm6 + add eax, ebx + add ecx, edx +%endrep + mov eax, [esp+28] + lea eax, [eax+edi*4] + pshufw mm5, mm1, 0xE + pshufw mm6, mm2, 0xE + paddusw mm1, mm5 + 
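+    ; horizontal reduction: mm1/mm2 hold the per-column sums of pix1/pix2,
+    ; mm3 the summed squares of both and mm4 the cross products; the
+    ; pshufw/padd folds collapse each so the stores below write
+    ; {s1, s2, ss, s12} for this 4x4 block.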
paddusw mm2, mm6 + punpcklwd mm1, mm2 + pshufw mm2, mm1, 0xE + pshufw mm5, mm3, 0xE + pshufw mm6, mm4, 0xE + paddusw mm1, mm2 + paddd mm3, mm5 + paddd mm4, mm6 + punpcklwd mm1, mm0 + punpckldq mm3, mm4 + movq [eax+0], mm1 + movq [eax+8], mm3 + sub edi, 4 + jge .loop + pop edi + pop ebx + emms + ret + diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm new file mode 100644 index 00000000..93d2b7da --- /dev/null +++ b/common/x86/pixel-a.asm @@ -0,0 +1,1711 @@ +;***************************************************************************** +;* pixel.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Loren Merritt +;* Laurent Aimar +;* Alex Izvorski +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +mask_ff: times 16 db 0xff + times 16 db 0 + +SECTION .text + +%macro HADDD 2 ; sum junk + movhlps %2, %1 + paddd %1, %2 + pshuflw %2, %1, 0xE + paddd %1, %2 +%endmacro + +%macro HADDW 2 + pmaddwd %1, [pw_1 GLOBAL] + HADDD %1, %2 +%endmacro + +;============================================================================= +; SSD +;============================================================================= + +%macro SSD_INC_1x16P 0 + movq mm1, [r0] + movq mm2, [r2] + movq mm3, [r0+8] + movq mm4, [r2+8] + + movq mm5, mm2 + movq mm6, mm4 + psubusb mm2, mm1 + psubusb mm4, mm3 + psubusb mm1, mm5 + psubusb mm3, mm6 + por mm1, mm2 + por mm3, mm4 + + movq mm2, mm1 + movq mm4, mm3 + punpcklbw mm1, mm7 + punpcklbw mm3, mm7 + punpckhbw mm2, mm7 + punpckhbw mm4, mm7 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + pmaddwd mm4, mm4 + + add r0, r1 + add r2, r3 + paddd mm0, mm1 + paddd mm0, mm2 + paddd mm0, mm3 + paddd mm0, mm4 +%endmacro + +%macro SSD_INC_1x8P 0 + movq mm1, [r0] + movq mm2, [r2] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, mm2 ; mm1 = 8bit abs diff + + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + + add r0, r1 + add r2, r3 + paddd mm0, mm1 + paddd mm0, mm2 +%endmacro + +%macro SSD_INC_1x4P 0 + movd mm1, [r0] + movd mm2, [r2] + + movq mm5, mm2 + psubusb mm2, mm1 + psubusb mm1, mm5 + por mm1, mm2 + punpcklbw mm1, mm7 + pmaddwd mm1, mm1 + + add r0, r1 + add r2, r3 + paddd mm0, mm1 +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +%macro SSD_MMX 2 +cglobal x264_pixel_ssd_%1x%2_mmx, 4,4 + 
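+    ; each SSD_INC_1x*P step computes |a-b| with two saturating subtractions
+    ; and a por, widens the bytes to words against the zero in mm7, then
+    ; squares and pair-sums with pmaddwd, accumulating dwords in mm0.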
pxor mm7, mm7 ; zero + pxor mm0, mm0 ; mm0 holds the sum +%rep %2 + SSD_INC_1x%1P +%endrep + movq mm1, mm0 + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + RET +%endmacro + +SSD_MMX 16, 16 +SSD_MMX 16, 8 +SSD_MMX 8, 16 +SSD_MMX 8, 8 +SSD_MMX 8, 4 +SSD_MMX 4, 8 +SSD_MMX 4, 4 + +%macro SSD_INC_2x16P_SSE2 0 + movdqu xmm1, [r0] + movdqu xmm2, [r2] + movdqu xmm3, [r0+r1] + movdqu xmm4, [r2+r3] + + movdqa xmm5, xmm1 + movdqa xmm6, xmm3 + psubusb xmm1, xmm2 + psubusb xmm3, xmm4 + psubusb xmm2, xmm5 + psubusb xmm4, xmm6 + por xmm1, xmm2 + por xmm3, xmm4 + + movdqa xmm2, xmm1 + movdqa xmm4, xmm3 + punpcklbw xmm1, xmm7 + punpckhbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + punpckhbw xmm4, xmm7 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + pmaddwd xmm4, xmm4 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + paddd xmm1, xmm2 + paddd xmm3, xmm4 + paddd xmm0, xmm1 + paddd xmm0, xmm3 +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +%macro SSD_SSE2 2 +cglobal x264_pixel_ssd_%1x%2_sse2, 4,4 + pxor xmm7, xmm7 + pxor xmm0, xmm0 +%rep %2/2 + SSD_INC_2x16P_SSE2 +%endrep + HADDD xmm0, xmm1 + movd eax, xmm0 + RET +%endmacro + +SSD_SSE2 16, 16 +SSD_SSE2 16, 8 + + + +;============================================================================= +; SATD +;============================================================================= + +%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2] + movd %1, %3 + movd %2, %4 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2] + movq %1, %3 + movq %2, %4 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp + LOAD_DIFF_8P %1, %5, [r0], [r2] + LOAD_DIFF_8P %2, %6, [r0+r1], [r2+r3] + LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3] + LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5] +%endmacro + +;;; row transform not used, because phaddw is much slower than paddw on a Conroe +;%macro PHSUMSUB 3 +; movdqa %3, %1 +; phaddw %1, %2 +; phsubw %3, %2 +;%endmacro + +;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc +; PHSUMSUB %1, %2, %5 +; PHSUMSUB %3, %4, %2 +; PHSUMSUB %1, %3, %4 +; PHSUMSUB %5, %2, %3 +;%endmacro + +%macro SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro HADAMARD4_1D 4 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %1, %3, %2, %4 +%endmacro + +%macro HADAMARD8_1D 8 + SUMSUB_BADC %1, %5, %2, %6 + SUMSUB_BADC %3, %7, %4, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %5, %6, %7, %8 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers + mov%1 %5, %3 + punpckh%2 %3, %4 + punpckl%2 %5, %4 +%endmacro + +%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc + SBUTTERFLY q, wd, %1, %2, %5 + SBUTTERFLY q, wd, %3, %4, %2 + SBUTTERFLY q, dq, %1, %3, %4 + SBUTTERFLY q, dq, %5, %2, %3 +%endmacro + +%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc + SBUTTERFLY dqa, dq, %1, %2, %5 + SBUTTERFLY dqa, dq, %3, %4, %2 + SBUTTERFLY dqa, qdq, %1, %3, %4 + SBUTTERFLY dqa, qdq, %5, %2, %3 +%endmacro + +%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd + SBUTTERFLY dqa, wd, %1, %2, %5 + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, dq, %1, %3, %4 + SBUTTERFLY2 
dqa, dq, %5, %2, %3 + SBUTTERFLY dqa, qdq, %1, %3, %2 + SBUTTERFLY2 dqa, qdq, %4, %5, %3 +%endmacro + +%ifdef ARCH_X86_64 +%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb + SBUTTERFLY dqa, wd, %1, %2, %9 + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, wd, %5, %6, %4 + SBUTTERFLY dqa, wd, %7, %8, %6 + SBUTTERFLY dqa, dq, %1, %3, %8 + SBUTTERFLY dqa, dq, %9, %2, %3 + SBUTTERFLY dqa, dq, %5, %7, %2 + SBUTTERFLY dqa, dq, %4, %6, %7 + SBUTTERFLY dqa, qdq, %1, %5, %6 + SBUTTERFLY dqa, qdq, %9, %4, %5 + SBUTTERFLY dqa, qdq, %8, %2, %4 + SBUTTERFLY dqa, qdq, %3, %7, %2 +%endmacro +%else +%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb + movdqa [%9], %8 + SBUTTERFLY dqa, wd, %1, %2, %8 + movdqa [%9+16], %8 + movdqa %8, [%9] + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, wd, %5, %6, %4 + SBUTTERFLY dqa, wd, %7, %8, %6 + SBUTTERFLY dqa, dq, %1, %3, %8 + movdqa [%9], %8 + movdqa %8, [16+%9] + SBUTTERFLY dqa, dq, %8, %2, %3 + SBUTTERFLY dqa, dq, %5, %7, %2 + SBUTTERFLY dqa, dq, %4, %6, %7 + SBUTTERFLY dqa, qdq, %1, %5, %6 + SBUTTERFLY dqa, qdq, %8, %4, %5 + movdqa [%9+16], %8 + movdqa %8, [%9] + SBUTTERFLY dqa, qdq, %8, %2, %4 + SBUTTERFLY dqa, qdq, %3, %7, %2 + movdqa %7, [%9+16] +%endmacro +%endif + +%macro ABS1_MMX 2 ; a, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + +%macro ABS2_MMX 4 ; a, b, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + psubw %3, %1 + psubw %4, %2 + pmaxsw %1, %3 + pmaxsw %2, %4 +%endmacro + +%macro ABS1_SSSE3 2 + pabsw %1, %1 +%endmacro + +%macro ABS2_SSSE3 4 + pabsw %1, %1 + pabsw %2, %2 +%endmacro + +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX + +%macro ABS4 6 + ABS2 %1, %2, %5, %6 + ABS2 %3, %4, %5, %6 +%endmacro + +%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block) + HADAMARD4_1D mm4, mm5, mm6, mm7 + TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1 + HADAMARD4_1D mm4, mm7, %1, mm6 + ABS2 mm4, mm7, mm3, mm5 + ABS2 %1, mm6, mm3, mm5 + paddw %1, mm4 + paddw mm6, mm7 + pavgw %1, mm6 +%endmacro + +; in: r4=3*stride1, r5=3*stride2 +; in: %2 = horizontal offset +; in: %3 = whether we need to increment pix1 and pix2 +; clobber: mm3..mm7 +; out: %1 = satd +%macro SATD_4x4_MMX 3 + LOAD_DIFF_4P mm4, mm3, [r0+%2], [r2+%2] + LOAD_DIFF_4P mm5, mm3, [r0+r1+%2], [r2+r3+%2] + LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2] + LOAD_DIFF_4P mm7, mm3, [r0+r4+%2], [r2+r5+%2] +%if %3 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + HADAMARD4x4_SUM %1 +%endmacro + +%macro SATD_8x4_START 1 + SATD_4x4_MMX mm0, 0, 0 + SATD_4x4_MMX mm1, 4, %1 +%endmacro + +%macro SATD_8x4_INC 1 + SATD_4x4_MMX mm2, 0, 0 + paddw mm0, mm1 + SATD_4x4_MMX mm1, 4, %1 + paddw mm0, mm2 +%endmacro + +%macro SATD_16x4_START 1 + SATD_4x4_MMX mm0, 0, 0 + SATD_4x4_MMX mm1, 4, 0 + SATD_4x4_MMX mm2, 8, 0 + paddw mm0, mm1 + SATD_4x4_MMX mm1, 12, %1 + paddw mm0, mm2 +%endmacro + +%macro SATD_16x4_INC 1 + SATD_4x4_MMX mm2, 0, 0 + paddw mm0, mm1 + SATD_4x4_MMX mm1, 4, 0 + paddw mm0, mm2 + SATD_4x4_MMX mm2, 8, 0 + paddw mm0, mm1 + SATD_4x4_MMX mm1, 12, %1 + paddw mm0, mm2 +%endmacro + +%macro SATD_8x4_SSE2 1 + LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 +%if %1 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + HADAMARD4_1D xmm0, xmm1, xmm2, xmm3 + TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4 + HADAMARD4_1D xmm0, xmm1, xmm2, xmm3 + ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + paddusw xmm0, xmm1 + paddusw xmm2, xmm3 + paddusw xmm6, xmm0 + paddusw xmm6, xmm2 +%endmacro + +%macro SATD_START_MMX 0 + lea r4, [3*r1] ; 3*stride1 + lea r5, [3*r3] ; 3*stride2 +%endmacro + +%macro SATD_END_MMX 0 + pshufw mm1, mm0, 01001110b + 
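+    ; horizontal sum of the four word partials: the 0x4e shuffle swaps the
+    ; dword halves and 0xb1 swaps adjacent words, so after the two paddw
+    ; steps every lane holds the total and the low word is extracted below.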
paddw mm0, mm1 + pshufw mm1, mm0, 10110001b + paddw mm0, mm1 + movd eax, mm0 + and eax, 0xffff + RET +%endmacro + +; FIXME avoid the spilling of regs to hold 3*stride. +; for small blocks on x86_32, modify pixel pointer instead. + +;----------------------------------------------------------------------------- +; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_satd_16x16_mmxext, 4,6 + SATD_START_MMX + SATD_16x4_START 1 + SATD_16x4_INC 1 + SATD_16x4_INC 1 + SATD_16x4_INC 0 + paddw mm0, mm1 + pxor mm3, mm3 + pshufw mm1, mm0, 01001110b + paddw mm0, mm1 + punpcklwd mm0, mm3 + pshufw mm1, mm0, 01001110b + paddd mm0, mm1 + movd eax, mm0 + RET + +cglobal x264_pixel_satd_16x8_mmxext, 4,6 + SATD_START_MMX + SATD_16x4_START 1 + SATD_16x4_INC 0 + paddw mm0, mm1 + SATD_END_MMX + +cglobal x264_pixel_satd_8x16_mmxext, 4,6 + SATD_START_MMX + SATD_8x4_START 1 + SATD_8x4_INC 1 + SATD_8x4_INC 1 + SATD_8x4_INC 0 + paddw mm0, mm1 + SATD_END_MMX + +cglobal x264_pixel_satd_8x8_mmxext, 4,6 + SATD_START_MMX + SATD_8x4_START 1 + SATD_8x4_INC 0 + paddw mm0, mm1 + SATD_END_MMX + +cglobal x264_pixel_satd_8x4_mmxext, 4,6 + SATD_START_MMX + SATD_8x4_START 0 + paddw mm0, mm1 + SATD_END_MMX + +cglobal x264_pixel_satd_4x8_mmxext, 4,6 + SATD_START_MMX + SATD_4x4_MMX mm0, 0, 1 + SATD_4x4_MMX mm1, 0, 0 + paddw mm0, mm1 + SATD_END_MMX + +cglobal x264_pixel_satd_4x4_mmxext, 4,6 + SATD_START_MMX + SATD_4x4_MMX mm0, 0, 0 + SATD_END_MMX + + + +%macro SATD_START_SSE2 0 + pxor xmm6, xmm6 + lea r4, [3*r1] + lea r5, [3*r3] +%endmacro + +%macro SATD_END_SSE2 0 + picgetgot ebx + psrlw xmm6, 1 + HADDW xmm6, xmm7 + movd eax, xmm6 + RET +%endmacro + +%macro BACKUP_POINTERS 0 +%ifdef ARCH_X86_64 + mov r10, r0 + mov r11, r2 +%endif +%endmacro + +%macro RESTORE_AND_INC_POINTERS 0 +%ifdef ARCH_X86_64 + lea r0, [r10+8] + lea r2, [r11+8] +%else + mov r0, r0m + mov r2, r2m + add r0, 8 + add r2, 8 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +%macro SATDS_SSE2 1 +cglobal x264_pixel_satd_16x16_%1, 4,6 + SATD_START_SSE2 + BACKUP_POINTERS + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + RESTORE_AND_INC_POINTERS + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + SATD_END_SSE2 + +cglobal x264_pixel_satd_16x8_%1, 4,6 + SATD_START_SSE2 + BACKUP_POINTERS + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + RESTORE_AND_INC_POINTERS + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + SATD_END_SSE2 + +cglobal x264_pixel_satd_8x16_%1, 4,6 + SATD_START_SSE2 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + SATD_END_SSE2 + +cglobal x264_pixel_satd_8x8_%1, 4,6 + SATD_START_SSE2 + SATD_8x4_SSE2 1 + SATD_8x4_SSE2 0 + SATD_END_SSE2 + +cglobal x264_pixel_satd_8x4_%1, 4,6 + SATD_START_SSE2 + SATD_8x4_SSE2 0 + SATD_END_SSE2 + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_sa8d_8x8_%1 + lea r4, [3*r1] + lea r5, [3*r3] +.skip_lea: + LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9 + + HADAMARD8_1D xmm0, 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 + + ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9 + ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9 + paddusw xmm0, xmm1 + paddusw xmm2, xmm3 + paddusw xmm4, xmm5 + paddusw xmm7, xmm8 + paddusw xmm0, xmm2 + paddusw xmm4, xmm7 + pavgw xmm0, xmm4 + HADDW xmm0, xmm1 + movd eax, xmm0 + add r10d, eax ; preserve rounding for 16x16 + add eax, 1 + shr eax, 1 + ret + +cglobal x264_pixel_sa8d_16x16_%1 + xor r10d, r10d + call x264_pixel_sa8d_8x8_%1 ; pix[0] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride] + neg r4 ; it's already r1*3 + neg r5 + lea r0, [r0+4*r4+8] + lea r2, [r2+4*r5+8] + call x264_pixel_sa8d_8x8_%1 ; pix[8] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8] + mov eax, r10d + add eax, 1 + shr eax, 1 + ret +%else ; ARCH_X86_32 +cglobal x264_pixel_sa8d_8x8_%1, 4,7 + mov r6, esp + and esp, ~15 + sub esp, 32 + lea r4, [3*r1] + lea r5, [3*r3] + LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7 + movdqa [esp], xmm2 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2 + movdqa xmm2, [esp] + + HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp + HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1 + +%ifidn %1, sse2 + movdqa [esp], xmm6 + movdqa [esp+16], xmm7 +%endif + ABS2 xmm2, xmm3, xmm6, xmm7 + ABS2 xmm0, xmm1, xmm6, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 +%ifidn %1, sse2 + movdqa xmm6, [esp] + movdqa xmm7, [esp+16] +%endif + ABS2 xmm4, xmm5, xmm2, xmm3 + ABS2 xmm6, xmm7, xmm2, xmm3 + paddusw xmm4, xmm5 + paddusw xmm6, xmm7 + paddusw xmm0, xmm1 + paddusw xmm4, xmm6 + pavgw xmm0, xmm4 + picgetgot ebx + HADDW xmm0, xmm1 + movd eax, xmm0 + mov ecx, eax ; preserve rounding for 16x16 + add eax, 1 + shr eax, 1 + mov esp, r6 + RET +%endif ; ARCH +%endmacro ; SATDS_SSE2 + +%macro SA8D_16x16_32 1 +%ifndef ARCH_X86_64 +cglobal x264_pixel_sa8d_16x16_%1 + push ebp + push dword [esp+20] ; stride2 + push dword [esp+20] ; pix2 + push dword [esp+20] ; stride1 + push dword [esp+20] ; pix1 + call x264_pixel_sa8d_8x8_%1 + mov ebp, ecx + add dword [esp+0], 8 ; pix1+8 + add dword [esp+8], 8 ; pix2+8 + call x264_pixel_sa8d_8x8_%1 + add ebp, ecx + mov eax, [esp+4] + mov edx, [esp+12] + shl eax, 3 + shl edx, 3 + add [esp+0], eax ; pix1+8*stride1+8 + add [esp+8], edx ; pix2+8*stride2+8 + call x264_pixel_sa8d_8x8_%1 + add ebp, ecx + sub dword [esp+0], 8 ; pix1+8*stride1 + sub dword [esp+8], 8 ; pix2+8*stride2 + call x264_pixel_sa8d_8x8_%1 + lea eax, [ebp+ecx+1] + shr eax, 1 + add esp, 16 + pop ebp + ret +%endif ; !ARCH_X86_64 +%endmacro ; SA8D_16x16_32 + + + +;============================================================================= +; INTRA SATD +;============================================================================= + +%macro INTRA_SA8D_SSE2 1 +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res ) +;----------------------------------------------------------------------------- +cglobal x264_intra_sa8d_x3_8x8_core_%1 + ; 8x8 hadamard + pxor xmm4, xmm4 + movq xmm0, [r0+0*FENC_STRIDE] + movq xmm7, [r0+1*FENC_STRIDE] + movq xmm6, [r0+2*FENC_STRIDE] + movq xmm3, [r0+3*FENC_STRIDE] + movq xmm5, 
[r0+4*FENC_STRIDE] + movq xmm1, [r0+5*FENC_STRIDE] + movq xmm8, [r0+6*FENC_STRIDE] + movq xmm2, [r0+7*FENC_STRIDE] + punpcklbw xmm0, xmm4 + punpcklbw xmm7, xmm4 + punpcklbw xmm6, xmm4 + punpcklbw xmm3, xmm4 + punpcklbw xmm5, xmm4 + punpcklbw xmm1, xmm4 + punpcklbw xmm8, xmm4 + punpcklbw xmm2, xmm4 + HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2 + TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4 + HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + + ; dc + movzx edi, word [r1+0] + add di, word [r1+16] + add edi, 8 + and edi, -16 + shl edi, 2 + + pxor xmm15, xmm15 + movdqa xmm8, xmm2 + movdqa xmm9, xmm3 + movdqa xmm10, xmm4 + movdqa xmm11, xmm5 + ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13 + paddusw xmm8, xmm10 + paddusw xmm9, xmm11 +%ifidn %1, ssse3 + pabsw xmm10, xmm6 + pabsw xmm11, xmm7 + pabsw xmm15, xmm1 +%else + movdqa xmm10, xmm6 + movdqa xmm11, xmm7 + movdqa xmm15, xmm1 + ABS2 xmm10, xmm11, xmm13, xmm14 + ABS1 xmm15, xmm13 +%endif + paddusw xmm10, xmm11 + paddusw xmm8, xmm9 + paddusw xmm15, xmm10 + paddusw xmm15, xmm8 + movdqa xmm14, xmm15 ; 7x8 sum + + movdqa xmm8, [r1+0] ; left edge + movd xmm9, edi + psllw xmm8, 3 + psubw xmm8, xmm0 + psubw xmm9, xmm0 + ABS1 xmm8, xmm10 + ABS1 xmm9, xmm11 ; 1x8 sum + paddusw xmm14, xmm8 + paddusw xmm15, xmm9 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + punpcklwd xmm4, xmm5 + punpcklwd xmm6, xmm7 + punpckldq xmm0, xmm2 + punpckldq xmm4, xmm6 + punpcklqdq xmm0, xmm4 ; transpose + movdqa xmm1, [r1+16] ; top edge + movdqa xmm2, xmm15 + psllw xmm1, 3 + psrldq xmm2, 2 ; 8x7 sum + psubw xmm0, xmm1 ; 8x1 sum + ABS1 xmm0, xmm1 + paddusw xmm2, xmm0 + + ; 3x HADDW + movdqa xmm7, [pw_1 GLOBAL] + pmaddwd xmm2, xmm7 + pmaddwd xmm14, xmm7 + pmaddwd xmm15, xmm7 + movdqa xmm3, xmm2 + punpckldq xmm2, xmm14 + punpckhdq xmm3, xmm14 + pshufd xmm5, xmm15, 0xf5 + paddd xmm2, xmm3 + paddd xmm5, xmm15 + movdqa xmm3, xmm2 + punpcklqdq xmm2, xmm5 + punpckhqdq xmm3, xmm5 + pavgw xmm3, xmm2 + pxor xmm0, xmm0 + pavgw xmm3, xmm0 + movq [r2], xmm3 ; i8x8_v, i8x8_h + psrldq xmm3, 8 + movd [r2+8], xmm3 ; i8x8_dc + ret +%endif ; ARCH_X86_64 +%endmacro ; INTRA_SATDS + +; in: r0 = fenc +; out: mm0..mm3 = hadamard coefs +ALIGN 16 +load_hadamard: + pxor mm7, mm7 + movd mm0, [r0+0*FENC_STRIDE] + movd mm4, [r0+1*FENC_STRIDE] + movd mm3, [r0+2*FENC_STRIDE] + movd mm1, [r0+3*FENC_STRIDE] + punpcklbw mm0, mm7 + punpcklbw mm4, mm7 + punpcklbw mm3, mm7 + punpcklbw mm1, mm7 + HADAMARD4_1D mm0, mm4, mm3, mm1 + TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2 + HADAMARD4_1D mm0, mm1, mm2, mm3 + ret + +%macro SCALAR_SUMSUB 4 + add %1, %2 + add %3, %4 + add %2, %2 + add %4, %4 + sub %2, %1 + sub %4, %3 +%endmacro + +%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp +%ifnidn %1, 0 + shl %1d, 5 ; log(FDEC_STRIDE) +%endif + movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE] + movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE] + movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE] + movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE] +%ifnidn %1, 0 + shr %1d, 5 +%endif + SCALAR_SUMSUB %2d, %3d, %4d, %5d + SCALAR_SUMSUB %2d, %4d, %3d, %5d + mov [left_1d+2*%1+0], %2w + mov [left_1d+2*%1+2], %3w + mov [left_1d+2*%1+4], %4w + mov [left_1d+2*%1+6], %5w +%endmacro + +%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp + movzx %2d, byte [r1+%1-FDEC_STRIDE+0] + movzx %3d, byte [r1+%1-FDEC_STRIDE+1] + movzx %4d, byte [r1+%1-FDEC_STRIDE+2] + movzx %5d, byte [r1+%1-FDEC_STRIDE+3] + SCALAR_SUMSUB %2d, %3d, %4d, %5d + SCALAR_SUMSUB %2d, %4d, %3d, %5d + mov [top_1d+2*%1+0], %2w + mov [top_1d+2*%1+2], %3w + mov [top_1d+2*%1+4], %4w + mov 
[top_1d+2*%1+6], %5w +%endmacro + +%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op + pxor %7, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + paddw %1, %4 + paddw %2, %5 + paddw %3, %6 + punpcklwd %1, %7 + punpcklwd %2, %7 + punpcklwd %3, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + %8 %1, %4 + %8 %2, %5 + %8 %3, %6 +%endmacro + +%macro CLEAR_SUMS 0 +%ifdef ARCH_X86_64 + mov qword [sums+0], 0 + mov qword [sums+8], 0 + mov qword [sums+16], 0 +%else + pxor mm7, mm7 + movq [sums+0], mm7 + movq [sums+8], mm7 + movq [sums+16], mm7 +%endif +%endmacro + +; in: mm1..mm3 +; out: mm7 +; clobber: mm4..mm6 +%macro SUM3x4 1 +%ifidn %1, ssse3 + pabsw mm4, mm1 + pabsw mm5, mm2 + pabsw mm7, mm3 + paddw mm4, mm5 +%else + movq mm4, mm1 + movq mm5, mm2 + ABS2 mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + ABS1 mm7, mm6 +%endif + paddw mm7, mm4 +%endmacro + +; in: mm0..mm3 (4x4), mm7 (3x4) +; out: mm0 v, mm4 h, mm5 dc +; clobber: mm6 +%macro SUM4x3 3 ; dc, left, top + movq mm4, %2 + movd mm5, %1 + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, %3 + psllw mm1, 2 + psubw mm0, mm1 + ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum + ABS1 mm0, mm1 ; 4x1 sum +%endmacro + +%macro INTRA_SATDS_MMX 1 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +cglobal x264_intra_satd_x3_4x4_%1, 2,6 +%ifdef ARCH_X86_64 + ; stack is 16 byte aligned because abi says so + %define top_1d rsp-8 ; size 8 + %define left_1d rsp-16 ; size 8 + %define t0 r10 + %define t0d r10d +%else + ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB esp, 16 + %define top_1d esp+8 + %define left_1d esp + %define t0 r2 + %define t0d r2d +%endif + + call load_hadamard + SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5 + mov t0d, r0d + SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5 + lea t0d, [t0d + r0d + 4] + and t0d, -8 + shl t0d, 1 ; dc + + SUM3x4 %1 + SUM4x3 t0d, [left_1d], [top_1d] + paddw mm4, mm7 + paddw mm5, mm7 + movq mm1, mm5 + psrlq mm1, 16 ; 4x3 sum + paddw mm0, mm1 + + SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw +%ifndef ARCH_X86_64 + mov r2, r2m +%endif + movd [r2+0], mm0 ; i4x4_v satd + movd [r2+4], mm4 ; i4x4_h satd + movd [r2+8], mm5 ; i4x4_dc satd +%ifndef ARCH_X86_64 + ADD esp, 16 +%endif + RET + +%ifdef ARCH_X86_64 + %define t0 r10 + %define t0d r10d + %define t2 r11 + %define t2w r11w + %define t2d r11d +%else + %define t0 r0 + %define t0d r0d + %define t2 r2 + %define t2w r2w + %define t2d r2d +%endif + +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +cglobal x264_intra_satd_x3_16x16_%1, 0,7 +%ifdef ARCH_X86_64 + %assign stack_pad 88 +%else + %assign stack_pad 88 + ((stack_offset+88+4)&15) +%endif + ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call + SUB rsp, stack_pad +%define sums rsp+64 ; size 24 +%define top_1d rsp+32 ; size 32 +%define left_1d rsp ; size 32 + movifnidn r1d, r1m + CLEAR_SUMS + + ; 1D hadamards + xor t2d, t2d + mov t0d, 12 +.loop_edge: + SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6 + 
add t2d, r3d + SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6 + add t2d, r3d + sub t0d, 4 + jge .loop_edge + shr t2d, 1 + add t2d, 8 + and t2d, -16 ; dc + + ; 2D hadamards + movifnidn r0d, r0m + xor r3d, r3d +.loop_y: + xor r4d, r4d +.loop_x: + call load_hadamard + + SUM3x4 %1 + SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4] + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+0] ; i16x16_v satd + paddw mm4, [sums+8] ; i16x16_h satd + paddw mm5, [sums+16] ; i16x16_dc satd + movq [sums+0], mm0 + movq [sums+8], mm4 + movq [sums+16], mm5 + + add r0, 4 + inc r4d + cmp r4d, 4 + jl .loop_x + add r0, 4*FENC_STRIDE-16 + inc r3d + cmp r3d, 4 + jl .loop_y + +; horizontal sum + movifnidn r2d, r2m + movq mm2, [sums+16] + movq mm1, [sums+8] + movq mm0, [sums+0] + movq mm7, mm2 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + psrld mm0, 1 + pslld mm7, 16 + psrld mm7, 16 + paddd mm0, mm2 + psubd mm0, mm7 + movd [r2+8], mm2 ; i16x16_dc satd + movd [r2+4], mm1 ; i16x16_h satd + movd [r2+0], mm0 ; i16x16_v satd + ADD rsp, stack_pad + RET + +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +cglobal x264_intra_satd_x3_8x8c_%1, 0,6 + ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call + SUB rsp, 72 +%define sums rsp+48 ; size 24 +%define dc_1d rsp+32 ; size 16 +%define top_1d rsp+16 ; size 16 +%define left_1d rsp ; size 16 + movifnidn r1d, r1m + CLEAR_SUMS + + ; 1D hadamards + mov t0d, 4 +.loop_edge: + SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5 + SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5 + sub t0d, 4 + jge .loop_edge + + ; dc + movzx t2d, word [left_1d+0] + movzx r3d, word [top_1d+0] + movzx r4d, word [left_1d+8] + movzx r5d, word [top_1d+8] + add t2d, r3d + lea r3, [r4 + r5] + lea t2, [2*t2 + 8] + lea r3, [2*r3 + 8] + lea r4, [4*r4 + 8] + lea r5, [4*r5 + 8] + and t2d, -16 ; tl + and r3d, -16 ; br + and r4d, -16 ; bl + and r5d, -16 ; tr + mov [dc_1d+ 0], t2d ; tl + mov [dc_1d+ 4], r5d ; tr + mov [dc_1d+ 8], r4d ; bl + mov [dc_1d+12], r3d ; br + lea r5, [dc_1d] + + ; 2D hadamards + movifnidn r0d, r0m + movifnidn r2d, r2m + xor r3d, r3d +.loop_y: + xor r4d, r4d +.loop_x: + call load_hadamard + + SUM3x4 %1 + SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4] + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+16] ; i4x4_v satd + paddw mm4, [sums+8] ; i4x4_h satd + paddw mm5, [sums+0] ; i4x4_dc satd + movq [sums+16], mm0 + movq [sums+8], mm4 + movq [sums+0], mm5 + + add r0, 4 + inc r4d + cmp r4d, 2 + jl .loop_x + add r0, 4*FENC_STRIDE-8 + add r5, 8 + inc r3d + cmp r3d, 2 + jl .loop_y + +; horizontal sum + movq mm0, [sums+0] + movq mm1, [sums+8] + movq mm2, [sums+16] + movq mm7, mm0 + psrlq mm7, 15 + paddw mm2, mm7 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + psrld mm2, 1 + movd [r2+0], mm0 ; i8x8c_dc satd + movd [r2+4], mm1 ; i8x8c_h satd + movd [r2+8], mm2 ; i8x8c_v satd + ADD rsp, 72 + RET +%endmacro + +; instantiate satds +; FIXME width4 can benefit from pabsw even if not sse2 + +cextern x264_pixel_sa8d_8x8_mmxext +SA8D_16x16_32 mmxext + +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX +SATDS_SSE2 sse2 +SA8D_16x16_32 sse2 +INTRA_SA8D_SSE2 sse2 +INTRA_SATDS_MMX mmxext +%ifdef HAVE_SSE3 +%define ABS1 ABS1_SSSE3 +%define ABS2 ABS2_SSSE3 +SATDS_SSE2 ssse3 +SA8D_16x16_32 ssse3 +INTRA_SA8D_SSE2 ssse3 +INTRA_SATDS_MMX ssse3 +%endif + + + 
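For reference, every SATD kernel above is a vectorization of the same scalar idea: Hadamard-transform the 4x4 difference block, sum the absolute coefficients, and halve the result; the intra x3 variants evaluate that cost for the V, H and DC predictions at once (the res[0]/res[1]/res[2] stores above). A minimal C sketch of the 4x4 kernel follows, assuming the usual strided uint8_t blocks -- this is an illustration, not the project's exact pixel.c code, and its final halving matches the asm's pavgw/shr steps only up to rounding.

#include <stdint.h>
#include <stdlib.h>

static int satd_4x4_ref( const uint8_t *pix1, int i_pix1,
                         const uint8_t *pix2, int i_pix2 )
{
    int tmp[4][4], sum = 0, x, y;
    /* horizontal 1D Hadamard of the difference block */
    for( y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
        int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
        int t0 = a0 + a1, t1 = a0 - a1, t2 = a2 + a3, t3 = a2 - a3;
        tmp[y][0] = t0 + t2;
        tmp[y][2] = t0 - t2;
        tmp[y][1] = t1 + t3;
        tmp[y][3] = t1 - t3;
    }
    /* vertical 1D Hadamard, then sum of absolute coefficients */
    for( x = 0; x < 4; x++ )
    {
        int t0 = tmp[0][x] + tmp[1][x], t1 = tmp[0][x] - tmp[1][x];
        int t2 = tmp[2][x] + tmp[3][x], t3 = tmp[2][x] - tmp[3][x];
        sum += abs(t0 + t2) + abs(t0 - t2) + abs(t1 + t3) + abs(t1 - t3);
    }
    return sum / 2;
}

The larger block sizes accumulate this per 4x4 (or 8x4 with SSE2) sub-block, which is why the macros above differ only in how many LOAD_DIFF/HADAMARD passes they chain before the horizontal sum.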
+;============================================================================= +; SSIM +;============================================================================= + +;----------------------------------------------------------------------------- +; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, +; const uint8_t *pix2, int stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 +%rep 4 + movq xmm5, [r0] + movq xmm6, [r2] + punpcklbw xmm5, xmm0 + punpcklbw xmm6, xmm0 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + movdqa xmm7, xmm5 + pmaddwd xmm5, xmm5 + pmaddwd xmm7, xmm6 + pmaddwd xmm6, xmm6 + paddd xmm3, xmm5 + paddd xmm4, xmm7 + paddd xmm3, xmm6 + add r0, r1 + add r2, r3 +%endrep + ; PHADDW xmm1, xmm2 + ; PHADDD xmm3, xmm4 + picgetgot eax + movdqa xmm7, [pw_1 GLOBAL] + pshufd xmm5, xmm3, 0xb1 + pmaddwd xmm1, xmm7 + pmaddwd xmm2, xmm7 + pshufd xmm6, xmm4, 0xb1 + packssdw xmm1, xmm2 + paddd xmm3, xmm5 + pshufd xmm1, xmm1, 0xd8 + paddd xmm4, xmm6 + pmaddwd xmm1, xmm7 + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + punpckhdq xmm5, xmm4 + +%ifdef ARCH_X86_64 + %define t0 r4 +%else + %define t0 eax + mov t0, r4m +%endif +%ifnidn r4d, r4m + mov t0, r4m +%endif + + movq [t0+ 0], xmm1 + movq [t0+ 8], xmm3 + psrldq xmm1, 8 + movq [t0+16], xmm1 + movq [t0+24], xmm5 + RET + +;----------------------------------------------------------------------------- +; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_ssim_end4_sse2, 3,3 + movdqa xmm0, [r0+ 0] + movdqa xmm1, [r0+16] + movdqa xmm2, [r0+32] + movdqa xmm3, [r0+48] + movdqa xmm4, [r0+64] + paddd xmm0, [r1+ 0] + paddd xmm1, [r1+16] + paddd xmm2, [r1+32] + paddd xmm3, [r1+48] + paddd xmm4, [r1+64] + paddd xmm0, xmm1 + paddd xmm1, xmm2 + paddd xmm2, xmm3 + paddd xmm3, xmm4 + picgetgot r1 + movdqa xmm5, [ssim_c1 GLOBAL] + movdqa xmm6, [ssim_c2 GLOBAL] + TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 + +; s1=mm0, s2=mm3, ss=mm4, s12=mm2 + movdqa xmm1, xmm3 + pslld xmm3, 16 + pmaddwd xmm1, xmm0 ; s1*s2 + por xmm0, xmm3 + pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2 + pslld xmm1, 1 + pslld xmm2, 7 + pslld xmm4, 6 + psubd xmm2, xmm1 ; covar*2 + psubd xmm4, xmm0 ; vars + paddd xmm0, xmm5 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm4, xmm6 + cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1) + cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1) + cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2) + cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2) + mulps xmm1, xmm2 + mulps xmm0, xmm4 + divps xmm1, xmm0 ; ssim + + cmp r2d, 4 + je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level + neg r2 +%ifdef PIC64 + lea r3, [mask_ff + 16 GLOBAL] + movdqu xmm3, [r3 + r2*4] +%else + movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL] +%endif + pand xmm1, xmm3 +.skip: + movhlps xmm0, xmm1 + addps xmm0, xmm1 + pshuflw xmm1, xmm0, 0xE + addss xmm0, xmm1 +%ifndef ARCH_X86_64 + movd r0m, xmm0 + fld dword r0m +%endif + RET + + + +;============================================================================= +; Successive Elimination ADS +;============================================================================= + +%macro ADS_START 1 ; unroll_size +%ifdef ARCH_X86_64 + %define t0 r6 + mov r10, rsp +%else + %define t0 r4 + PUSH 
rbp + mov rbp, rsp +%endif + mov r0d, r5m + sub rsp, r0 + sub rsp, %1*4-1 + and rsp, ~15 + mov t0, rsp + shl r2d, 1 +%endmacro + +%macro ADS_END 1 + add r1, 8*%1 + add r3, 8*%1 + add t0, 4*%1 + sub r0d, 4*%1 + jg .loop + jmp x264_pixel_ads_mvs +%endmacro + +%define ABS1 ABS1_MMX + +;----------------------------------------------------------------------------- +; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta, +; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_ads4_mmxext, 4,5 + movq mm6, [r0] + movq mm4, [r0+8] + pshufw mm7, mm6, 0 + pshufw mm6, mm6, 0xAA + pshufw mm5, mm4, 0 + pshufw mm4, mm4, 0xAA + ADS_START 1 +.loop: + movq mm0, [r1] + movq mm1, [r1+16] + psubw mm0, mm7 + psubw mm1, mm6 + ABS1 mm0, mm2 + ABS1 mm1, mm3 + movq mm2, [r1+r2] + movq mm3, [r1+r2+16] + psubw mm2, mm5 + psubw mm3, mm4 + paddw mm0, mm1 + ABS1 mm2, mm1 + ABS1 mm3, mm1 + paddw mm0, mm2 + paddw mm0, mm3 +%ifdef ARCH_X86_64 + pshufw mm1, [r10+8], 0 +%else + pshufw mm1, [ebp+stack_offset+28], 0 +%endif + paddusw mm0, [r3] + psubusw mm1, mm0 + packsswb mm1, mm1 + movd [t0], mm1 + ADS_END 1 + +cglobal x264_pixel_ads2_mmxext, 4,5 + movq mm6, [r0] + pshufw mm5, r6m, 0 + pshufw mm7, mm6, 0 + pshufw mm6, mm6, 0xAA + ADS_START 1 +.loop: + movq mm0, [r1] + movq mm1, [r1+r2] + psubw mm0, mm7 + psubw mm1, mm6 + ABS1 mm0, mm2 + ABS1 mm1, mm3 + paddw mm0, mm1 + paddusw mm0, [r3] + movq mm4, mm5 + psubusw mm4, mm0 + packsswb mm4, mm4 + movd [t0], mm4 + ADS_END 1 + +cglobal x264_pixel_ads1_mmxext, 4,5 + pshufw mm7, [r0], 0 + pshufw mm6, r6m, 0 + ADS_START 2 +.loop: + movq mm0, [r1] + movq mm1, [r1+8] + psubw mm0, mm7 + psubw mm1, mm7 + ABS1 mm0, mm2 + ABS1 mm1, mm3 + paddusw mm0, [r3] + paddusw mm1, [r3+8] + movq mm4, mm6 + movq mm5, mm6 + psubusw mm4, mm0 + psubusw mm5, mm1 + packsswb mm4, mm5 + movq [t0], mm4 + ADS_END 2 + +%macro ADS_SSE2 1 +cglobal x264_pixel_ads4_%1, 4,5 + movdqa xmm4, [r0] + pshuflw xmm7, xmm4, 0 + pshuflw xmm6, xmm4, 0xAA + pshufhw xmm5, xmm4, 0 + pshufhw xmm4, xmm4, 0xAA + punpcklqdq xmm7, xmm7 + punpcklqdq xmm6, xmm6 + punpckhqdq xmm5, xmm5 + punpckhqdq xmm4, xmm4 +%ifdef ARCH_X86_64 + pshuflw xmm8, r6m, 0 + punpcklqdq xmm8, xmm8 + ADS_START 2 + movdqu xmm10, [r1] + movdqu xmm11, [r1+r2] +.loop: + movdqa xmm0, xmm10 + movdqu xmm1, [r1+16] + movdqa xmm10, xmm1 + psubw xmm0, xmm7 + psubw xmm1, xmm6 + ABS1 xmm0, xmm2 + ABS1 xmm1, xmm3 + movdqa xmm2, xmm11 + movdqu xmm3, [r1+r2+16] + movdqa xmm11, xmm3 + psubw xmm2, xmm5 + psubw xmm3, xmm4 + paddw xmm0, xmm1 + movdqu xmm9, [r3] + ABS1 xmm2, xmm1 + ABS1 xmm3, xmm1 + paddw xmm0, xmm2 + paddw xmm0, xmm3 + paddusw xmm0, xmm9 + movdqa xmm1, xmm8 + psubusw xmm1, xmm0 + packsswb xmm1, xmm1 + movq [t0], xmm1 +%else + ADS_START 2 +.loop: + movdqu xmm0, [r1] + movdqu xmm1, [r1+16] + psubw xmm0, xmm7 + psubw xmm1, xmm6 + ABS1 xmm0, xmm2 + ABS1 xmm1, xmm3 + movdqu xmm2, [r1+r2] + movdqu xmm3, [r1+r2+16] + psubw xmm2, xmm5 + psubw xmm3, xmm4 + paddw xmm0, xmm1 + ABS1 xmm2, xmm1 + ABS1 xmm3, xmm1 + paddw xmm0, xmm2 + paddw xmm0, xmm3 + movd xmm1, [ebp+stack_offset+28] + movdqu xmm2, [r3] + pshuflw xmm1, xmm1, 0 + punpcklqdq xmm1, xmm1 + paddusw xmm0, xmm2 + psubusw xmm1, xmm0 + packsswb xmm1, xmm1 + movq [t0], xmm1 +%endif ; ARCH + ADS_END 2 + +cglobal x264_pixel_ads2_%1, 4,5 + movq xmm6, [r0] + movd xmm5, r6m + pshuflw xmm7, xmm6, 0 + pshuflw xmm6, xmm6, 0xAA + pshuflw xmm5, xmm5, 0 + punpcklqdq xmm7, xmm7 + punpcklqdq xmm6, xmm6 + 
punpcklqdq xmm5, xmm5 + ADS_START 2 +.loop: + movdqu xmm0, [r1] + movdqu xmm1, [r1+r2] + psubw xmm0, xmm7 + psubw xmm1, xmm6 + movdqu xmm4, [r3] + ABS1 xmm0, xmm2 + ABS1 xmm1, xmm3 + paddw xmm0, xmm1 + paddusw xmm0, xmm4 + movdqa xmm1, xmm5 + psubusw xmm1, xmm0 + packsswb xmm1, xmm1 + movq [t0], xmm1 + ADS_END 2 + +cglobal x264_pixel_ads1_%1, 4,5 + movd xmm7, [r0] + movd xmm6, r6m + pshuflw xmm7, xmm7, 0 + pshuflw xmm6, xmm6, 0 + punpcklqdq xmm7, xmm7 + punpcklqdq xmm6, xmm6 + ADS_START 4 +.loop: + movdqu xmm0, [r1] + movdqu xmm1, [r1+16] + psubw xmm0, xmm7 + psubw xmm1, xmm7 + movdqu xmm2, [r3] + movdqu xmm3, [r3+16] + ABS1 xmm0, xmm4 + ABS1 xmm1, xmm5 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm4, xmm6 + movdqa xmm5, xmm6 + psubusw xmm4, xmm0 + psubusw xmm5, xmm1 + packsswb xmm4, xmm5 + movdqa [t0], xmm4 + ADS_END 4 +%endmacro + +ADS_SSE2 sse2 +%ifdef HAVE_SSE3 +%define ABS1 ABS1_SSSE3 +ADS_SSE2 ssse3 +%endif + +; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) +; { +; int nmv=0, i, j; +; *(uint32_t*)(masks+width) = 0; +; for( i=0; i + * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -68,12 +68,17 @@ DECL_X4( sad, cache64_ssse3 ); #undef DECL_X4 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * ); +void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); +void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * ); +void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); diff --git a/common/amd64/predict-a.asm b/common/x86/predict-a.asm similarity index 64% rename from common/amd64/predict-a.asm rename to common/x86/predict-a.asm index bdbbedce..64a01558 100644 --- a/common/amd64/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2005 x264 project +;* Copyright (C) 2005-2008 x264 project ;* ;* Authors: Loren Merritt ;* @@ -20,53 +20,45 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;***************************************************************************** -BITS 64 - -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -%include "amd64inc.asm" +%include "x86inc.asm" %macro STORE8x8 2 - movq [parm1q + 0*FDEC_STRIDE], %1 - movq [parm1q + 1*FDEC_STRIDE], %1 - movq [parm1q + 2*FDEC_STRIDE], %1 - movq [parm1q + 3*FDEC_STRIDE], %1 - movq [parm1q + 4*FDEC_STRIDE], %2 - movq [parm1q + 5*FDEC_STRIDE], %2 - movq [parm1q + 6*FDEC_STRIDE], %2 - movq [parm1q + 7*FDEC_STRIDE], %2 + movq [r0 + 0*FDEC_STRIDE], %1 + movq [r0 + 1*FDEC_STRIDE], %1 + movq [r0 + 2*FDEC_STRIDE], %1 + movq [r0 + 3*FDEC_STRIDE], %1 + movq [r0 + 4*FDEC_STRIDE], %2 + movq [r0 + 5*FDEC_STRIDE], %2 + movq [r0 + 6*FDEC_STRIDE], %2 + movq [r0 + 7*FDEC_STRIDE], %2 %endmacro %macro STORE16x16 2 - mov eax, 4 + mov r1d, 4 .loop: - movq [parm1q + 0*FDEC_STRIDE], %1 - movq [parm1q + 1*FDEC_STRIDE], %1 - movq [parm1q + 2*FDEC_STRIDE], %1 - movq [parm1q + 3*FDEC_STRIDE], %1 - movq [parm1q + 0*FDEC_STRIDE + 8], %2 - movq [parm1q + 1*FDEC_STRIDE + 8], %2 - movq [parm1q + 2*FDEC_STRIDE + 8], %2 - movq [parm1q + 3*FDEC_STRIDE + 8], %2 - add parm1q, 4*FDEC_STRIDE - dec eax + movq [r0 + 0*FDEC_STRIDE], %1 + movq [r0 + 1*FDEC_STRIDE], %1 + movq [r0 + 2*FDEC_STRIDE], %1 + movq [r0 + 3*FDEC_STRIDE], %1 + movq [r0 + 0*FDEC_STRIDE + 8], %2 + movq [r0 + 1*FDEC_STRIDE + 8], %2 + movq [r0 + 2*FDEC_STRIDE + 8], %2 + movq [r0 + 3*FDEC_STRIDE + 8], %2 + add r0, 4*FDEC_STRIDE + dec r1d jg .loop - nop %endmacro %macro STORE16x16_SSE2 1 - mov eax, 4 + mov r1d, 4 .loop: - movdqa [parm1q + 0*FDEC_STRIDE], %1 - movdqa [parm1q + 1*FDEC_STRIDE], %1 - movdqa [parm1q + 2*FDEC_STRIDE], %1 - movdqa [parm1q + 3*FDEC_STRIDE], %1 - add parm1q, 4*FDEC_STRIDE - dec eax + movdqa [r0 + 0*FDEC_STRIDE], %1 + movdqa [r0 + 1*FDEC_STRIDE], %1 + movdqa [r0 + 2*FDEC_STRIDE], %1 + movdqa [r0 + 3*FDEC_STRIDE], %1 + add r0, 4*FDEC_STRIDE + dec r1d jg .loop - nop %endmacro SECTION_RODATA @@ -82,10 +74,6 @@ pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff -;============================================================================= -; Code -;============================================================================= - SECTION .text ; dest, left, right, src, tmp @@ -110,10 +98,10 @@ SECTION .text ;----------------------------------------------------------------------------- ; void predict_4x4_ddl_mmxext( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_4x4_ddl_mmxext - sub parm1q, FDEC_STRIDE - movq mm3, [parm1q] - movq mm1, [parm1q-1] +cglobal predict_4x4_ddl_mmxext, 1,1,1 + sub r0, FDEC_STRIDE + movq mm3, [r0] + movq mm1, [r0-1] movq mm2, mm3 movq mm4, [pb_0s_ff GLOBAL] psrlq mm2, 8 @@ -125,17 +113,17 @@ cglobal predict_4x4_ddl_mmxext %assign Y 1 %rep 4 psrlq mm0, 8 - movd [parm1q+Y*FDEC_STRIDE], mm0 + movd [r0+Y*FDEC_STRIDE], mm0 %assign Y (Y+1) %endrep - ret + RET ;----------------------------------------------------------------------------- ; void predict_4x4_vl_mmxext( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_4x4_vl_mmxext - movq mm1, [parm1q-FDEC_STRIDE] +cglobal predict_4x4_vl_mmxext, 1,1,1 + movq mm1, [r0-FDEC_STRIDE] movq mm3, mm1 movq mm2, mm1 psrlq mm3, 8 @@ -145,71 +133,129 @@ cglobal predict_4x4_vl_mmxext PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 - movd 
[parm1q+0*FDEC_STRIDE], mm4 - movd [parm1q+1*FDEC_STRIDE], mm0 + movd [r0+0*FDEC_STRIDE], mm4 + movd [r0+1*FDEC_STRIDE], mm0 psrlq mm4, 8 psrlq mm0, 8 - movd [parm1q+2*FDEC_STRIDE], mm4 - movd [parm1q+3*FDEC_STRIDE], mm0 + movd [r0+2*FDEC_STRIDE], mm4 + movd [r0+3*FDEC_STRIDE], mm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_v_mmxext - movq mm0, [parm2q+16] +cglobal predict_8x8_v_mmxext, 2,2 + movq mm0, [r1+16] STORE8x8 mm0, mm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge ); ;----------------------------------------------------------------------------- -cglobal predict_8x8_dc_mmxext +cglobal predict_8x8_dc_mmxext, 2,2,1 pxor mm0, mm0 pxor mm1, mm1 - psadbw mm0, [parm2q+7] - psadbw mm1, [parm2q+16] + psadbw mm0, [r1+7] + psadbw mm1, [r1+16] paddw mm0, [pw_8 GLOBAL] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge ); ;----------------------------------------------------------------------------- -cglobal predict_8x8_dc_top_mmxext +%macro PRED8x8_DC 2 +cglobal %1, 2,2,1 pxor mm0, mm0 - psadbw mm0, [parm2q+16] + psadbw mm0, [r1+%2] paddw mm0, [pw_4 GLOBAL] psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 - ret + RET +%endmacro + +PRED8x8_DC predict_8x8_dc_top_mmxext, 16 +PRED8x8_DC predict_8x8_dc_left_mmxext, 7 + +%ifndef ARCH_X86_64 +; sse2 is faster even on amd, so there's no sense in spending exe size on these +; functions if we know sse2 is available. 
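The diagonal predictions in this file are all built from one primitive, PRED8x8_LOWPASS, which is assumed here to implement the standard H.264 3-tap smoothing filter (left + 2*mid + right + 2) >> 2 over neighbouring edge samples. A scalar sketch, with a hypothetical predict_8x8_ddl_ref helper (not project code) reading 16 already-prepared top/top-right samples t[0..15], which the asm loads from edge+16, i.e. [r1+16]:

#include <stdint.h>

static inline int lowpass( int left, int mid, int right )
{
    return ( left + 2*mid + right + 2 ) >> 2;
}

static void predict_8x8_ddl_ref( uint8_t *src, int stride, const uint8_t t[16] )
{
    int x, y;
    for( y = 0; y < 8; y++ )
        for( x = 0; x < 8; x++ )
        {
            int i = x + y;
            /* the bottom-right corner only has t[14],t[15] available */
            src[y*stride + x] = ( x == 7 && y == 7 )
                              ? lowpass( t[14], t[15], t[15] )
                              : lowpass( t[i], t[i+1], t[i+2] );
        }
}

The mmxext version below gets roughly the same effect without per-pixel work: two PRED8x8_LOWPASS invocations produce 16 filtered bytes in mm0/mm1, and each output row is that 16-byte window shifted by one byte.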
;----------------------------------------------------------------------------- -; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge ); +; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_dc_left_mmxext - pxor mm0, mm0 - psadbw mm0, [parm2q+7] - paddw mm0, [pw_4 GLOBAL] - psrlw mm0, 3 - pshufw mm0, mm0, 0 - packuswb mm0, mm0 - STORE8x8 mm0, mm0 - ret +cglobal predict_8x8_ddl_mmxext, 2,2,1 + movq mm5, [r1+16] + movq mm2, [r1+17] + movq mm3, [r1+23] + movq mm4, [r1+25] + movq mm1, mm5 + psllq mm1, 8 + PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7 + PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6 + +%assign Y 7 +%rep 6 + movq [r0+Y*FDEC_STRIDE], mm1 + movq mm2, mm0 + psllq mm1, 8 + psrlq mm2, 56 + psllq mm0, 8 + por mm1, mm2 +%assign Y (Y-1) +%endrep + movq [r0+Y*FDEC_STRIDE], mm1 + psllq mm1, 8 + psrlq mm0, 56 + por mm1, mm0 +%assign Y (Y-1) + movq [r0+Y*FDEC_STRIDE], mm1 + RET + +;----------------------------------------------------------------------------- +; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_ddr_mmxext, 2,2,1 + movq mm1, [r1+7] + movq mm2, [r1+9] + movq mm3, [r1+15] + movq mm4, [r1+17] + PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7 + PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6 + +%assign Y 7 +%rep 6 + movq [r0+Y*FDEC_STRIDE], mm0 + movq mm2, mm1 + psrlq mm0, 8 + psllq mm2, 56 + psrlq mm1, 8 + por mm0, mm2 +%assign Y (Y-1) +%endrep + movq [r0+Y*FDEC_STRIDE], mm0 + psrlq mm0, 8 + psllq mm1, 56 + por mm0, mm1 +%assign Y (Y-1) + movq [r0+Y*FDEC_STRIDE], mm0 + RET + +%endif ; !ARCH_X86_64 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl_sse2 - movdqa xmm3, [parm2q+16] - movdqu xmm2, [parm2q+17] +cglobal predict_8x8_ddl_sse2, 2,2,1 + movdqa xmm3, [r1+16] + movdqu xmm2, [r1+17] movdqa xmm1, xmm3 pslldq xmm1, 1 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 @@ -217,17 +263,17 @@ cglobal predict_8x8_ddl_sse2 %assign Y 0 %rep 8 psrldq xmm0, 1 - movq [parm1q+Y*FDEC_STRIDE], xmm0 + movq [r0+Y*FDEC_STRIDE], xmm0 %assign Y (Y+1) %endrep - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddr_sse2 - movdqu xmm3, [parm2q+8] - movdqu xmm1, [parm2q+7] +cglobal predict_8x8_ddr_sse2, 2,2,1 + movdqu xmm3, [r1+8] + movdqu xmm1, [r1+7] movdqa xmm2, xmm3 psrldq xmm2, 1 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 @@ -236,22 +282,22 @@ cglobal predict_8x8_ddr_sse2 psrldq xmm1, 1 %assign Y 7 %rep 3 - movq [parm1q+Y*FDEC_STRIDE], xmm0 - movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1 + movq [r0+Y*FDEC_STRIDE], xmm0 + movq [r0+(Y-1)*FDEC_STRIDE], xmm1 psrldq xmm0, 2 psrldq xmm1, 2 %assign Y (Y-2) %endrep - movq [parm1q+1*FDEC_STRIDE], xmm0 - movq [parm1q+0*FDEC_STRIDE], xmm1 + movq [r0+1*FDEC_STRIDE], xmm0 + movq [r0+0*FDEC_STRIDE], xmm1 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_vl_sse2 - movdqa xmm4, 
[parm2q+16] +cglobal predict_8x8_vl_sse2, 2,2,1 + movdqa xmm4, [r1+16] movdqa xmm2, xmm4 movdqa xmm1, xmm4 movdqa xmm3, xmm4 @@ -265,16 +311,16 @@ cglobal predict_8x8_vl_sse2 %assign Y 0 %rep 3 psrldq xmm0, 1 - movq [parm1q+ Y *FDEC_STRIDE], xmm3 - movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0 + movq [r0+ Y *FDEC_STRIDE], xmm3 + movq [r0+(Y+1)*FDEC_STRIDE], xmm0 psrldq xmm3, 1 %assign Y (Y+2) %endrep psrldq xmm0, 1 - movq [parm1q+ Y *FDEC_STRIDE], xmm3 - movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0 + movq [r0+ Y *FDEC_STRIDE], xmm3 + movq [r0+(Y+1)*FDEC_STRIDE], xmm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge ) @@ -291,40 +337,40 @@ cglobal predict_8x8_vl_sse2 ; 6 ..... ; 7 ,,,,, -cglobal predict_8x8_vr_core_mmxext - movq mm2, [parm2q+16] - movq mm3, [parm2q+15] - movq mm1, [parm2q+14] +cglobal predict_8x8_vr_core_mmxext, 2,2,1 + movq mm2, [r1+16] + movq mm3, [r1+15] + movq mm1, [r1+14] movq mm4, mm3 pavgb mm3, mm2 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 %assign Y 0 %rep 3 - movq [parm1q+ Y *FDEC_STRIDE], mm3 - movq [parm1q+(Y+1)*FDEC_STRIDE], mm0 + movq [r0+ Y *FDEC_STRIDE], mm3 + movq [r0+(Y+1)*FDEC_STRIDE], mm0 psllq mm3, 8 psllq mm0, 8 %assign Y (Y+2) %endrep - movq [parm1q+ Y *FDEC_STRIDE], mm3 - movq [parm1q+(Y+1)*FDEC_STRIDE], mm0 + movq [r0+ Y *FDEC_STRIDE], mm3 + movq [r0+(Y+1)*FDEC_STRIDE], mm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8c_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_8x8c_v_mmx - movq mm0, [parm1q - FDEC_STRIDE] +cglobal predict_8x8c_v_mmx, 1,1 + movq mm0, [r0 - FDEC_STRIDE] STORE8x8 mm0, mm0 - ret + RET ;----------------------------------------------------------------------------- ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 ) ;----------------------------------------------------------------------------- -cglobal predict_8x8c_dc_core_mmxext - movq mm0, [parm1q - FDEC_STRIDE] +cglobal predict_8x8c_dc_core_mmxext, 1,1,1 + movq mm0, [r0 - FDEC_STRIDE] pxor mm1, mm1 pxor mm2, mm2 punpckhbw mm1, mm0 @@ -332,10 +378,15 @@ cglobal predict_8x8c_dc_core_mmxext psadbw mm1, mm2 ; s1 psadbw mm0, mm2 ; s0 - movd mm4, parm2d - movd mm5, parm3d +%ifdef ARCH_X86_64 + movd mm4, r1d + movd mm5, r2d paddw mm0, mm4 pshufw mm2, mm5, 0 +%else + paddw mm0, r1m + pshufw mm2, r2m, 0 +%endif psrlw mm0, 3 paddw mm1, [pw_2 GLOBAL] movq mm3, mm2 @@ -350,25 +401,35 @@ cglobal predict_8x8c_dc_core_mmxext packuswb mm2, mm3 ; dc2,dc3 (b) STORE8x8 mm0, mm2 - ret + RET -;----------------------------------------------------------------------------- -; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_p_core_mmxext - movd mm0, parm2d - movd mm2, parm3d - movd mm4, parm4d +%macro LOAD_PLANE_ARGS 0 +%ifdef ARCH_X86_64 + movd mm0, r1d + movd mm2, r2d + movd mm4, r3d pshufw mm0, mm0, 0 pshufw mm2, mm2, 0 pshufw mm4, mm4, 0 +%else + pshufw mm0, r1m, 0 + pshufw mm2, r2m, 0 + pshufw mm4, r3m, 0 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) +;----------------------------------------------------------------------------- +cglobal predict_8x8c_p_core_mmxext, 1,2,1 + LOAD_PLANE_ARGS movq mm1, mm2 pmullw mm2, 
[pw_3210 GLOBAL] psllw mm1, 2 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - mov eax, 8 + mov r1d, 8 ALIGN 4 .loop: movq mm5, mm0 @@ -376,27 +437,20 @@ ALIGN 4 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 - movq [parm1q], mm5 + movq [r0], mm5 paddsw mm0, mm4 paddsw mm1, mm4 - add parm1q, FDEC_STRIDE - dec eax + add r0, FDEC_STRIDE + dec r1d jg .loop - - nop - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_mmxext - movd mm0, parm2d - movd mm2, parm3d - movd mm4, parm4d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +cglobal predict_16x16_p_core_mmxext, 1,2,1 + LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 pmullw mm5, [pw_3210 GLOBAL] @@ -408,7 +462,7 @@ cglobal predict_16x16_p_core_mmxext paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} - mov eax, 16 + mov r1d, 16 ALIGN 4 .loop: movq mm5, mm0 @@ -416,33 +470,31 @@ ALIGN 4 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 - movq [parm1q], mm5 + movq [r0], mm5 movq mm5, mm2 movq mm6, mm3 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 - movq [parm1q+8], mm5 + movq [r0+8], mm5 paddsw mm0, mm4 paddsw mm1, mm4 paddsw mm2, mm4 paddsw mm3, mm4 - add parm1q, FDEC_STRIDE - dec eax + add r0, FDEC_STRIDE + dec r1d jg .loop - - nop - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_sse2 - movd xmm0, parm2d - movd xmm1, parm3d - movd xmm2, parm4d +cglobal predict_16x16_p_core_sse2, 1,2,1 + movd xmm0, r1m + movd xmm1, r2m + movd xmm2, r3m pshuflw xmm0, xmm0, 0 pshuflw xmm1, xmm1, 0 pshuflw xmm2, xmm2, 0 @@ -455,7 +507,7 @@ cglobal predict_16x16_p_core_sse2 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} - mov eax, 16 + mov r1d, 16 ALIGN 4 .loop: movdqa xmm3, xmm0 @@ -463,33 +515,31 @@ ALIGN 4 psraw xmm3, 5 psraw xmm4, 5 packuswb xmm3, xmm4 - movdqa [parm1q], xmm3 + movdqa [r0], xmm3 paddsw xmm0, xmm2 paddsw xmm1, xmm2 - add parm1q, FDEC_STRIDE - dec eax + add r0, FDEC_STRIDE + dec r1d jg .loop - - nop - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_v_mmx - movq mm0, [parm1q - FDEC_STRIDE] - movq mm1, [parm1q - FDEC_STRIDE + 8] +cglobal predict_16x16_v_mmx, 1,2 + movq mm0, [r0 - FDEC_STRIDE] + movq mm1, [r0 - FDEC_STRIDE + 8] STORE16x16 mm0, mm1 - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_v_sse2( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_v_sse2 - movdqa xmm0, [parm1q - FDEC_STRIDE] +cglobal predict_16x16_v_sse2, 1,2 + movdqa xmm0, [r0 - FDEC_STRIDE] STORE16x16_SSE2 xmm0 - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core_mmxext( uint8_t *src, int 
i_dc_left ) @@ -498,8 +548,8 @@ cglobal predict_16x16_v_sse2 %macro PRED16x16_DC 2 pxor mm0, mm0 pxor mm1, mm1 - psadbw mm0, [parm1q - FDEC_STRIDE] - psadbw mm1, [parm1q - FDEC_STRIDE + 8] + psadbw mm0, [r0 - FDEC_STRIDE] + psadbw mm1, [r0 - FDEC_STRIDE + 8] paddusw mm0, mm1 paddusw mm0, %1 psrlw mm0, %2 ; dc @@ -508,14 +558,18 @@ cglobal predict_16x16_v_sse2 STORE16x16 mm0, mm0 %endmacro -cglobal predict_16x16_dc_core_mmxext - movd mm2, parm2d +cglobal predict_16x16_dc_core_mmxext, 1,2 +%ifdef ARCH_X86_64 + movd mm2, r1d PRED16x16_DC mm2, 5 - ret +%else + PRED16x16_DC r1m, 5 +%endif + REP_RET -cglobal predict_16x16_dc_top_mmxext +cglobal predict_16x16_dc_top_mmxext, 1,2,1 PRED16x16_DC [pw_8 GLOBAL], 4 - ret + REP_RET ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ) @@ -523,7 +577,7 @@ cglobal predict_16x16_dc_top_mmxext %macro PRED16x16_DC_SSE2 2 pxor xmm0, xmm0 - psadbw xmm0, [parm1q - FDEC_STRIDE] + psadbw xmm0, [r0 - FDEC_STRIDE] movhlps xmm1, xmm0 paddw xmm0, xmm1 paddusw xmm0, %1 @@ -534,12 +588,12 @@ cglobal predict_16x16_dc_top_mmxext STORE16x16_SSE2 xmm0 %endmacro -cglobal predict_16x16_dc_core_sse2 - movd xmm2, parm2d +cglobal predict_16x16_dc_core_sse2, 1,2 + movd xmm2, r1m PRED16x16_DC_SSE2 xmm2, 5 - ret + REP_RET -cglobal predict_16x16_dc_top_sse2 +cglobal predict_16x16_dc_top_sse2, 1,2,1 PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4 - ret + REP_RET diff --git a/common/i386/predict-c.c b/common/x86/predict-c.c similarity index 93% rename from common/i386/predict-c.c rename to common/x86/predict-c.c index 8ff531a2..8768fbfa 100644 --- a/common/i386/predict-c.c +++ b/common/x86/predict-c.c @@ -450,44 +450,44 @@ static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] ) t=e; e+=f; f-=t;\ t=g; g+=h; h-=t; +#define INTRA_SA8D_X3(cpu) \ +void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\ +{\ + PREDICT_8x8_LOAD_TOP\ + PREDICT_8x8_LOAD_LEFT\ + int t;\ + DECLARE_ALIGNED( int16_t, sa8d_1d[2][8], 16 );\ + SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\ + SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\ + SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\ + sa8d_1d[0][0] = l0;\ + sa8d_1d[0][1] = l1;\ + sa8d_1d[0][2] = l2;\ + sa8d_1d[0][3] = l3;\ + sa8d_1d[0][4] = l4;\ + sa8d_1d[0][5] = l5;\ + sa8d_1d[0][6] = l6;\ + sa8d_1d[0][7] = l7;\ + SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\ + SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\ + SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\ + sa8d_1d[1][0] = t0;\ + sa8d_1d[1][1] = t1;\ + sa8d_1d[1][2] = t2;\ + sa8d_1d[1][3] = t3;\ + sa8d_1d[1][4] = t4;\ + sa8d_1d[1][5] = t5;\ + sa8d_1d[1][6] = t6;\ + sa8d_1d[1][7] = t7;\ + x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\ +} + #ifdef ARCH_X86_64 -void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t edge[33], int res[3] ) -#else -void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t edge[33], int res[3] ) -#endif -{ - PREDICT_8x8_LOAD_TOP - PREDICT_8x8_LOAD_LEFT - int t; - DECLARE_ALIGNED( int16_t, sa8d_1d[2][8], 16 ); - SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7); - SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7); - SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7); - sa8d_1d[0][0] = l0; - sa8d_1d[0][1] = l1; - sa8d_1d[0][2] = l2; - sa8d_1d[0][3] = l3; - sa8d_1d[0][4] = l4; - sa8d_1d[0][5] = l5; - sa8d_1d[0][6] = l6; - sa8d_1d[0][7] = l7; - SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7); - SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7); - SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7); - sa8d_1d[1][0] = t0; - sa8d_1d[1][1] = t1; - sa8d_1d[1][2] = t2; - sa8d_1d[1][3] = t3; - sa8d_1d[1][4] = t4; - sa8d_1d[1][5] = t5; - sa8d_1d[1][6] = 
t6; - sa8d_1d[1][7] = t7; -#ifdef ARCH_X86_64 - x264_intra_sa8d_x3_8x8_core_sse2( fenc, sa8d_1d, res ); +INTRA_SA8D_X3(sse2) +INTRA_SA8D_X3(ssse3) #else - x264_intra_sa8d_x3_8x8_core_mmxext( fenc, sa8d_1d, res ); +INTRA_SA8D_X3(mmxext) #endif -} /**************************************************************************** * Exported functions: diff --git a/common/i386/predict.h b/common/x86/predict.h similarity index 96% rename from common/i386/predict.h rename to common/x86/predict.h index 4db2e91e..9310313e 100644 --- a/common/i386/predict.h +++ b/common/x86/predict.h @@ -2,7 +2,6 @@ * predict.h: h264 encoder library ***************************************************************************** * Copyright (C) 2003 Laurent Aimar - * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $ * * Authors: Laurent Aimar * diff --git a/common/amd64/quant-a.asm b/common/x86/quant-a.asm similarity index 66% rename from common/amd64/quant-a.asm rename to common/x86/quant-a.asm index 162c70a4..11e78ea6 100644 --- a/common/amd64/quant-a.asm +++ b/common/x86/quant-a.asm @@ -1,9 +1,10 @@ ;***************************************************************************** ;* quant-a.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2005 x264 project +;* Copyright (C) 2005-2008 x264 project ;* ;* Authors: Loren Merritt +;* Christian Heine ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -20,9 +21,7 @@ ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. ;***************************************************************************** -BITS 64 - -%include "amd64inc.asm" +%include "x86inc.asm" SECTION_RODATA pd_1: times 2 dd 1 @@ -30,15 +29,15 @@ pd_1: times 2 dd 1 SECTION .text %macro MMX_QUANT_DC_START 0 - movd mm6, parm2d ; mf - movd mm7, parm3d ; bias + movd mm6, r1m ; mf + movd mm7, r2m ; bias pshufw mm6, mm6, 0 pshufw mm7, mm7, 0 %endmacro %macro SSE2_QUANT_DC_START 0 - movd xmm6, parm2d ; mf - movd xmm7, parm3d ; bias + movd xmm6, r1m ; mf + movd xmm7, r2m ; bias pshuflw xmm6, xmm6, 0 pshuflw xmm7, xmm7, 0 punpcklqdq xmm6, xmm6 @@ -80,52 +79,52 @@ SECTION .text ;----------------------------------------------------------------------------- ; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_2x2_dc_mmxext +cglobal x264_quant_2x2_dc_mmxext, 1,1 MMX_QUANT_DC_START - MMX_QUANT_1x4 [parm1q], mm6, mm7 - ret - -%macro QUANT_SSE 1 -;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias ) -;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_dc_%1 - SSE2_QUANT_DC_START -%assign x 0 -%rep 2 - QUANT_1x8 [parm1q+x], xmm6, xmm7 -%assign x (x+16) -%endrep - ret + MMX_QUANT_1x4 [r0], mm6, mm7 + RET ;----------------------------------------------------------------------------- -; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) +; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias ) ;----------------------------------------------------------------------------- -cglobal x264_quant_4x4_%1 +%macro QUANT_DC 6 +cglobal %1, 1,1 + %2 %assign x 0 -%rep 2 - QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x] -%assign x (x+16) +%rep %5 + %3 [r0+x], %4m6, %4m7 
+%assign x x+%6 %endrep - ret + RET +%endmacro ;----------------------------------------------------------------------------- -; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) +; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) ;----------------------------------------------------------------------------- -cglobal x264_quant_8x8_%1 +%macro QUANT_AC 4 +cglobal %1, 3,3 %assign x 0 -%rep 8 - QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x] -%assign x (x+16) +%rep %3 + %2 [r0+x], [r1+x], [r2+x] +%assign x x+%4 %endrep - ret + RET %endmacro -%define QUANT_1x8 SSE2_QUANT_1x8 -QUANT_SSE sse2 +%ifndef ARCH_X86_64 ; not needed because sse2 is faster +QUANT_DC x264_quant_4x4_dc_mmxext, MMX_QUANT_DC_START, MMX_QUANT_1x4, m, 4, 8 +QUANT_AC x264_quant_4x4_mmx, MMX_QUANT_1x4, 4, 8 +QUANT_AC x264_quant_8x8_mmx, MMX_QUANT_1x4, 16, 8 +%endif + +QUANT_DC x264_quant_4x4_dc_sse2, SSE2_QUANT_DC_START, SSE2_QUANT_1x8, xm, 2, 16 +QUANT_AC x264_quant_4x4_sse2, SSE2_QUANT_1x8, 2, 16 +QUANT_AC x264_quant_8x8_sse2, SSE2_QUANT_1x8, 8, 16 + %ifdef HAVE_SSE3 -%define QUANT_1x8 SSSE3_QUANT_1x8 -QUANT_SSE ssse3 +QUANT_DC x264_quant_4x4_dc_ssse3, SSE2_QUANT_DC_START, SSSE3_QUANT_1x8, xm, 2, 16 +QUANT_AC x264_quant_4x4_ssse3, SSSE3_QUANT_1x8, 2, 16 +QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16 %endif @@ -180,53 +179,61 @@ QUANT_SSE ssse3 movq %1, mm0 %endmacro +%macro DEQUANT_LOOP 2 + mov t0d, 8*(%2-2) +%%loop: + %1 [r0+t0+8], [r1+t0*2+16], [r1+t0*2+24] + %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8] + sub t0d, 16 + jge %%loop + rep ret +%endmacro + ;----------------------------------------------------------------------------- ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT_WxH 3 -cglobal %1 -; mov rdi, rdi ; dct -; mov rsi, rsi ; dequant_mf -; mov edx, edx ; i_qp - - imul eax, edx, 0x2b - shr eax, 8 ; i_qbits = i_qp / 6 - lea ecx, [eax+eax*2] - sub edx, ecx - sub edx, ecx ; i_mf = i_qp % 6 - shl edx, %3+2 - movsxd rdx, edx - add rsi, rdx ; dequant_mf[i_mf] - - sub eax, %3 +cglobal %1, 0,3 +%ifdef ARCH_X86_64 + %define t0 r4 + %define t0d r4d + imul r4d, r2d, 0x2b + shr r4d, 8 ; i_qbits = i_qp / 6 + lea r3d, [r4d*3] + sub r2, r3 + sub r2, r3 ; i_mf = i_qp % 6 + shl r2, %3+2 + add r1, r2 ; dequant_mf[i_mf] +%else + %define t0 r2 + %define t0d r2d + mov r1, r2m ; i_qp + imul r2, r1, 0x2b + shr r2, 8 ; i_qbits = i_qp / 6 + lea r0, [r2*3] + sub r1, r0 + sub r1, r0 ; i_mf = i_qp % 6 + shl r1, %3+2 + add r1, r1m ; dequant_mf[i_mf] + mov r0, r0m ; dct +%endif + + sub t0d, %3 jl .rshift32 ; negative qbits => rightshift .lshift: - movd mm5, eax - -%rep %2 - DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8] - add rsi, byte 16 - add rdi, byte 8 -%endrep - - ret + movd mm5, t0d + DEQUANT_LOOP DEQUANT16_L_1x4, %2 .rshift32: - neg eax - movd mm5, eax + neg t0d + movd mm5, t0d + picgetgot t0d movq mm6, [pd_1 GLOBAL] pxor mm7, mm7 pslld mm6, mm5 psrld mm6, 1 - -%rep %2 - DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8] - add rsi, byte 16 - add rdi, byte 8 -%endrep - - ret + DEQUANT_LOOP DEQUANT32_R_1x4, %2 %endmacro DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4 diff --git a/common/i386/quant.h b/common/x86/quant.h similarity index 100% rename from common/i386/quant.h rename to common/x86/quant.h diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm new file mode 100644 index 00000000..61a68ed6 --- /dev/null +++ b/common/x86/sad-a.asm @@ -0,0 +1,974 @@ 
+;***************************************************************************** +;* sad-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Loren Merritt +;* Laurent Aimar +;* Alex Izvorski +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +sw_64: dq 64 + +SECTION .text + +;============================================================================= +; SAD MMX +;============================================================================= + +%macro SAD_INC_2x16P 0 + movq mm1, [r0] + movq mm2, [r0+8] + movq mm3, [r0+r1] + movq mm4, [r0+r1+8] + psadbw mm1, [r2] + psadbw mm2, [r2+8] + psadbw mm3, [r2+r3] + psadbw mm4, [r2+r3+8] + lea r0, [r0+2*r1] + paddw mm1, mm2 + paddw mm3, mm4 + lea r2, [r2+2*r3] + paddw mm0, mm1 + paddw mm0, mm3 +%endmacro + +%macro SAD_INC_2x8P 0 + movq mm1, [r0] + movq mm2, [r0+r1] + psadbw mm1, [r2] + psadbw mm2, [r2+r3] + lea r0, [r0+2*r1] + paddw mm0, mm1 + paddw mm0, mm2 + lea r2, [r2+2*r3] +%endmacro + +%macro SAD_INC_2x4P 0 + movd mm1, [r0] + movd mm2, [r2] + punpckldq mm1, [r0+r1] + punpckldq mm2, [r2+r3] + psadbw mm1, mm2 + paddw mm0, mm1 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +%macro SAD 2 +cglobal x264_pixel_sad_%1x%2_mmxext, 4,4 + pxor mm0, mm0 +%rep %2/2 + SAD_INC_2x%1P +%endrep + movd eax, mm0 + RET +%endmacro + +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 +SAD 4, 8 +SAD 4, 4 + + + +;============================================================================= +; SAD XMM +;============================================================================= + +%macro SAD_END_SSE2 0 + movhlps xmm1, xmm0 + paddw xmm0, xmm1 + movd eax, xmm0 + RET +%endmacro + +%macro SAD_W16 1 +;----------------------------------------------------------------------------- +; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_sad_16x16_%1, 4,4 + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + lea r2, [r2+2*r3] + movdqu xmm2, [r2] + movdqu xmm3, [r2+r3] + lea r2, [r2+2*r3] + psadbw xmm0, [r0] + psadbw xmm1, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm4, [r2] + paddw xmm0, xmm1 + psadbw xmm2, [r0] + psadbw xmm3, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm5, [r2+r3] + lea r2, [r2+2*r3] + paddw xmm2, xmm3 + movdqu xmm6, [r2] + movdqu xmm7, [r2+r3] + lea r2, [r2+2*r3] + paddw xmm0, xmm2 + psadbw xmm4, [r0] + psadbw xmm5, [r0+r1] + lea r0, [r0+2*r1] 
+ movdqu xmm1, [r2] + paddw xmm4, xmm5 + psadbw xmm6, [r0] + psadbw xmm7, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm2, [r2+r3] + lea r2, [r2+2*r3] + paddw xmm6, xmm7 + movdqu xmm3, [r2] + paddw xmm0, xmm4 + movdqu xmm4, [r2+r3] + lea r2, [r2+2*r3] + paddw xmm0, xmm6 + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm5, [r2] + paddw xmm1, xmm2 + psadbw xmm3, [r0] + psadbw xmm4, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm6, [r2+r3] + lea r2, [r2+2*r3] + paddw xmm3, xmm4 + movdqu xmm7, [r2] + paddw xmm0, xmm1 + movdqu xmm1, [r2+r3] + paddw xmm0, xmm3 + psadbw xmm5, [r0] + psadbw xmm6, [r0+r1] + lea r0, [r0+2*r1] + paddw xmm5, xmm6 + psadbw xmm7, [r0] + psadbw xmm1, [r0+r1] + paddw xmm7, xmm1 + paddw xmm0, xmm5 + paddw xmm0, xmm7 + SAD_END_SSE2 + +;----------------------------------------------------------------------------- +; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +cglobal x264_pixel_sad_16x8_%1, 4,4 + movdqu xmm0, [r2] + movdqu xmm2, [r2+r3] + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + movdqu xmm4, [r2+r3] + psadbw xmm0, [r0] + psadbw xmm2, [r0+r1] + lea r0, [r0+2*r1] + psadbw xmm3, [r0] + psadbw xmm4, [r0+r1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw xmm0, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm3 + movdqu xmm1, [r2] + movdqu xmm2, [r2+r3] + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + movdqu xmm4, [r2+r3] + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + lea r0, [r0+2*r1] + psadbw xmm3, [r0] + psadbw xmm4, [r0+r1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw xmm1, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm1 + paddw xmm0, xmm3 + SAD_END_SSE2 +%endmacro + +SAD_W16 sse2 +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_W16 sse3 +%undef movdqu +%endif + + + +;============================================================================= +; SAD x3/x4 MMX +;============================================================================= + +%macro SAD_X3_START_1x8P 0 + movq mm3, [r0] + movq mm0, [r1] + movq mm1, [r2] + movq mm2, [r3] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 2 + movq mm3, [r0+%1] + movq mm4, [r1+%2] + movq mm5, [r2+%2] + movq mm6, [r3+%2] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_START_2x4P 3 + movd mm3, [r0] + movd %1, [r1] + movd %2, [r2] + movd %3, [r3] + punpckldq mm3, [r0+FENC_STRIDE] + punpckldq %1, [r1+r4] + punpckldq %2, [r2+r4] + punpckldq %3, [r3+r4] + psadbw %1, mm3 + psadbw %2, mm3 + psadbw %3, mm3 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P +%else + SAD_X3_1x8P 0, 0 +%endif + SAD_X3_1x8P 8, 8 + SAD_X3_1x8P FENC_STRIDE, r4 + SAD_X3_1x8P FENC_STRIDE+8, r4+8 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P +%else + SAD_X3_1x8P 0, 0 +%endif + SAD_X3_1x8P FENC_STRIDE, r4 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_2x4P mm0, mm1, mm2 +%else + SAD_X3_START_2x4P mm4, mm5, mm6 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endif + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X4_START_1x8P 0 + movq mm7, [r0] + movq mm0, [r1] + movq mm1, [r2] + movq mm2, [r3] + movq mm3, [r4] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro 
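For orientation, the SAD_X3/SAD_X4 macros implement the multi-candidate SAD used by motion search: one encode block, laid out with the fixed fenc stride (assumed to be 16 here), is scored against three or four reference candidates that share a stride, so each fenc row is loaded into a register once and psadbw'd against every candidate. A scalar sketch of the x3 semantics declared just below:

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16  /* assumption: x264's fixed encode-buffer stride */

static void sad_x3_ref( const uint8_t *fenc, const uint8_t *pix0,
                        const uint8_t *pix1, const uint8_t *pix2,
                        int i_stride, int scores[3], int w, int h )
{
    int x, y;
    scores[0] = scores[1] = scores[2] = 0;
    for( y = 0; y < h; y++ )
    {
        for( x = 0; x < w; x++ )
        {
            scores[0] += abs( fenc[x] - pix0[x] );
            scores[1] += abs( fenc[x] - pix1[x] );
            scores[2] += abs( fenc[x] - pix2[x] );
        }
        fenc += FENC_STRIDE;
        pix0 += i_stride;
        pix1 += i_stride;
        pix2 += i_stride;
    }
}

The x4 variants are the same with one more candidate pointer: the asm spends an extra accumulator register but amortizes the shared fenc load across all candidates.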
+ +%macro SAD_X4_1x8P 2 + movq mm7, [r0+%1] + movq mm4, [r1+%2] + movq mm5, [r2+%2] + movq mm6, [r3+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [r4+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_START_2x4P 0 + movd mm7, [r0] + movd mm0, [r1] + movd mm1, [r2] + movd mm2, [r3] + movd mm3, [r4] + punpckldq mm7, [r0+FENC_STRIDE] + punpckldq mm0, [r1+r5] + punpckldq mm1, [r2+r5] + punpckldq mm2, [r3+r5] + punpckldq mm3, [r4+r5] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_INC_2x4P 0 + movd mm7, [r0] + movd mm4, [r1] + movd mm5, [r2] + punpckldq mm7, [r0+FENC_STRIDE] + punpckldq mm4, [r1+r5] + punpckldq mm5, [r2+r5] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + paddw mm1, mm5 + movd mm4, [r3] + movd mm5, [r4] + punpckldq mm4, [r3+r5] + punpckldq mm5, [r4+r5] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, r5 + SAD_X4_1x8P FENC_STRIDE+8, r5+8 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, r5 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_2x4P +%else + SAD_X4_INC_2x4P +%endif + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X3_END 0 +%ifdef ARCH_X86_64 + movd [r5+0], mm0 + movd [r5+4], mm1 + movd [r5+8], mm2 +%else + mov r0, r5m + movd [r0+0], mm0 + movd [r0+4], mm1 + movd [r0+8], mm2 +%endif + RET +%endmacro + +%macro SAD_X4_END 0 + mov r0, r6m + movd [r0+0], mm0 + movd [r0+4], mm1 + movd [r0+8], mm2 + movd [r0+12], mm3 + RET +%endmacro + +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2 + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + + + +;============================================================================= +; SAD x3/x4 XMM +;============================================================================= + +%macro SAD_X3_START_1x16P_SSE2 0 + movdqa xmm3, [r0] + movdqu xmm0, [r1] + movdqu xmm1, [r2] + movdqu xmm2, [r3] + psadbw xmm0, xmm3 + psadbw xmm1, xmm3 + psadbw xmm2, xmm3 +%endmacro + +%macro SAD_X3_1x16P_SSE2 2 + movdqa xmm3, [r0+%1] + movdqu xmm4, [r1+%2] + movdqu xmm5, [r2+%2] + movdqu xmm6, [r3+%2] + psadbw xmm4, xmm3 + psadbw xmm5, xmm3 + psadbw xmm6, xmm3 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 +%endmacro + +%macro SAD_X3_2x16P_SSE2 1 +%if %1 + SAD_X3_START_1x16P_SSE2 +%else + SAD_X3_1x16P_SSE2 0, 0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] 
+%endmacro + +%macro SAD_X4_START_1x16P_SSE2 0 + movdqa xmm7, [r0] + movdqu xmm0, [r1] + movdqu xmm1, [r2] + movdqu xmm2, [r3] + movdqu xmm3, [r4] + psadbw xmm0, xmm7 + psadbw xmm1, xmm7 + psadbw xmm2, xmm7 + psadbw xmm3, xmm7 +%endmacro + +%macro SAD_X4_1x16P_SSE2 2 + movdqa xmm7, [r0+%1] + movdqu xmm4, [r1+%2] + movdqu xmm5, [r2+%2] + movdqu xmm6, [r3+%2] +%ifdef ARCH_X86_64 + movdqu xmm8, [r4+%2] + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + psadbw xmm6, xmm7 + psadbw xmm8, xmm7 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + paddw xmm3, xmm8 +%else + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + paddw xmm0, xmm4 + psadbw xmm6, xmm7 + movdqu xmm4, [r4+%2] + paddw xmm1, xmm5 + psadbw xmm4, xmm7 + paddw xmm2, xmm6 + paddw xmm3, xmm4 +%endif +%endmacro + +%macro SAD_X4_2x16P_SSE2 1 +%if %1 + SAD_X4_START_1x16P_SSE2 +%else + SAD_X4_1x16P_SSE2 0, 0 +%endif + SAD_X4_1x16P_SSE2 FENC_STRIDE, r5 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X3_END_SSE2 0 + movhlps xmm4, xmm0 + movhlps xmm5, xmm1 + movhlps xmm6, xmm2 + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm6 +%ifdef ARCH_X86_64 + movd [r5+0], xmm0 + movd [r5+4], xmm1 + movd [r5+8], xmm2 +%else + mov r0, r5m + movd [r0+0], xmm0 + movd [r0+4], xmm1 + movd [r0+8], xmm2 +%endif + RET +%endmacro + +%macro SAD_X4_END_SSE2 0 + mov r0, r6m + psllq xmm1, 32 + psllq xmm3, 32 + paddw xmm0, xmm1 + paddw xmm2, xmm3 + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + paddw xmm0, xmm1 + paddw xmm2, xmm3 + movq [r0+0], xmm0 + movq [r0+8], xmm2 + RET +%endmacro + +;----------------------------------------------------------------------------- +; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X_SSE2 4 +cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1 + SAD_X%1_2x%2P_SSE2 1 +%rep %3/2-1 + SAD_X%1_2x%2P_SSE2 0 +%endrep + SAD_X%1_END_SSE2 +%endmacro + +SAD_X_SSE2 3, 16, 16, sse2 +SAD_X_SSE2 3, 16, 8, sse2 +SAD_X_SSE2 4, 16, 16, sse2 +SAD_X_SSE2 4, 16, 8, sse2 + +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_X_SSE2 3, 16, 16, sse3 +SAD_X_SSE2 3, 16, 8, sse3 +SAD_X_SSE2 4, 16, 16, sse3 +SAD_X_SSE2 4, 16, 8, sse3 +%undef movdqu +%endif + + + +;============================================================================= +; SAD cacheline split +;============================================================================= + +; Core2 (Conroe) can load unaligned data just as quickly as aligned data... +; unless the unaligned data spans the border between 2 cachelines, in which +; case it's really slow. The exact numbers may differ, but all Intel cpus +; have a large penalty for cacheline splits. +; (8-byte alignment exactly half way between two cachelines is ok though.) +; LDDQU was supposed to fix this, but it only works on Pentium 4. +; So in the split case we load aligned data and explicitly perform the +; alignment between registers. Like on archs that have only aligned loads, +; except complicated by the fact that PALIGNR takes only an immediate, not +; a variable alignment. +; It is also possible to hoist the realignment to the macroblock level (keep +; 2 copies of the reference frame, offset by 32 bytes), but the extra memory +; needed for that method makes it often slower. + +; sad 16x16 costs on Core2: +; good offsets: 49 cycles (50/64 of all mvs) +; cacheline split: 234 cycles (14/64 of all mvs. 
ammortized: +40 cycles) +; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles) +; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) + +; computed jump assumes this loop is exactly 80 bytes +%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment +ALIGN 16 +sad_w16_align%1_sse2: + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r3+16] + movdqa xmm3, [r2] + movdqa xmm4, [r2+r3] + pslldq xmm1, 16-%1 + pslldq xmm2, 16-%1 + psrldq xmm3, %1 + psrldq xmm4, %1 + por xmm1, xmm3 + por xmm2, xmm4 + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + dec r4 + jg sad_w16_align%1_sse2 + rep ret +%endmacro + +; computed jump assumes this loop is exactly 64 bytes +%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment +ALIGN 16 +sad_w16_align%1_ssse3: + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r3+16] + palignr xmm1, [r2], %1 + palignr xmm2, [r2+r3], %1 + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + dec r4 + jg sad_w16_align%1_ssse3 + rep ret +%endmacro + +%macro SAD16_CACHELINE_FUNC 2 ; cpu, height +cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0 + mov eax, r2m + and eax, 0x37 + cmp eax, 0x30 + jle x264_pixel_sad_16x%2_sse2 + PROLOGUE 4,6,0 + mov r4d, r2d + and r4d, 15 +%ifidn %1, ssse3 + shl r4d, 6 ; code size = 64 +%else + lea r4, [r4*5] + shl r4d, 4 ; code size = 80 +%endif +%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1)) +%ifdef PIC64 + lea r5, [sad_w16_addr GLOBAL] + add r5, r4 +%else + picgetgot r5 + lea r5, [sad_w16_addr + r4 GLOBAL] +%endif + and r2, ~15 + mov r4d, %2/2 + pxor xmm0, xmm0 + call r5 + movhlps xmm1, xmm0 + paddw xmm0, xmm1 + movd eax, xmm0 + RET +%endmacro + +%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline + mov eax, r2m + and eax, 0x17|%2|(%4>>1) + cmp eax, 0x10|%2|(%4>>1) + jle x264_pixel_sad_%1x%2_mmxext + and eax, 7 + shl eax, 3 +%ifdef PIC32 + ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx + mov r2, 64 + sub r2, eax + movd mm7, eax + movd mm6, r2 +%else + movd mm6, [sw_64 GLOBAL] + movd mm7, eax + psubw mm6, mm7 +%endif + PROLOGUE 4,5,0 + and r2, ~7 + mov r4d, %3 + pxor mm0, mm0 +%endmacro + +%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0 + SAD_CACHELINE_START_MMX2 16, %1, %1, %2 +.loop: + movq mm1, [r2] + movq mm2, [r2+8] + movq mm3, [r2+16] + movq mm4, mm2 + psrlq mm1, mm7 + psllq mm2, mm6 + psllq mm3, mm6 + psrlq mm4, mm7 + por mm1, mm2 + por mm3, mm4 + psadbw mm1, [r0] + psadbw mm3, [r0+8] + paddw mm0, mm1 + paddw mm0, mm3 + add r2, r3 + add r0, r1 + dec r4 + jg .loop + movd eax, mm0 + RET +%endmacro + +%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0 + SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 +.loop: + movq mm1, [r2+8] + movq mm2, [r2+r3+8] + movq mm3, [r2] + movq mm4, [r2+r3] + psllq mm1, mm6 + psllq mm2, mm6 + psrlq mm3, mm7 + psrlq mm4, mm7 + por mm1, mm3 + por mm2, mm4 + psadbw mm1, [r0] + psadbw mm2, [r0+r1] + paddw mm0, mm1 + paddw mm0, mm2 + lea r2, [r2+2*r3] + lea r0, [r0+2*r1] + dec r4 + jg .loop + movd eax, mm0 + RET +%endmacro + +; sad_x3/x4_cache64: check each mv. +; if they're all within a cacheline, use normal sad_x3/x4. +; otherwise, send them individually to sad_cache64. 
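In C, the dispatch that CHECK_SPLIT and the SADX3/SADX4_CACHELINE_FUNC macros below implement looks roughly like the sketch that follows. It is an illustration of the control flow only: sad_x3_normal and sad_one_cache_aware are stand-in names for the plain x3 SAD and the single-reference cache32/cache64 SAD defined above, and the boundary test is a simplification of the masked compare the asm actually performs.

#include <stdint.h>

/* stand-ins for the functions defined elsewhere in this file */
void sad_x3_normal( uint8_t *fenc, uint8_t *p0, uint8_t *p1, uint8_t *p2,
                    int stride, int scores[3] );
int  sad_one_cache_aware( uint8_t *fenc, int fenc_stride,
                          uint8_t *ref, int ref_stride );

#define FENC_STRIDE 16

static void sad_x3_cache_dispatch( uint8_t *fenc, uint8_t *p0, uint8_t *p1,
                                   uint8_t *p2, int stride, int scores[3],
                                   int width, int cacheline )
{
    uint8_t *p[3] = { p0, p1, p2 };
    int split = 0;
    for( int i = 0; i < 3; i++ )
        /* would a width-byte row load starting at p[i] cross a cacheline? */
        if( (int)((uintptr_t)p[i] & (cacheline-1)) > cacheline - width )
            split = 1;
    if( !split )
        /* fast path: every candidate is safe for normal unaligned loads */
        sad_x3_normal( fenc, p0, p1, p2, stride, scores );
    else
        /* slow path: score each candidate with the realigning SAD */
        for( int i = 0; i < 3; i++ )
            scores[i] = sad_one_cache_aware( fenc, FENC_STRIDE, p[i], stride );
}

In the asm, that test collapses to one and/cmp pair per candidate (CHECK_SPLIT), so the common case costs only a few instructions before jumping to the ordinary sad_x3.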
+%macro CHECK_SPLIT 3 ; pix, width, cacheline + mov eax, %1 + and eax, 0x17|%2|(%3>>1) + cmp eax, 0x10|%2|(%3>>1) + jg .split +%endmacro + +%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver +cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0 + CHECK_SPLIT r1m, %1, %3 + CHECK_SPLIT r2m, %1, %3 + CHECK_SPLIT r3m, %1, %3 + jmp x264_pixel_sad_x3_%1x%2_%4 +.split: +%ifdef ARCH_X86_64 + push r3 + push r2 + mov r2, r1 + mov r1, FENC_STRIDE + mov r3, r4 + mov r10, r0 + mov r11, r5 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11], eax + pop r2 + mov r0, r10 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11+4], eax + pop r2 + mov r0, r10 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11+8], eax +%else + push edi + mov edi, [esp+28] + push dword [esp+24] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [edi+8], eax + add esp, 16 + pop edi +%endif + ret +%endmacro + +%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver +cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 + CHECK_SPLIT r1m, %1, %3 + CHECK_SPLIT r2m, %1, %3 + CHECK_SPLIT r3m, %1, %3 + CHECK_SPLIT r4m, %1, %3 + jmp x264_pixel_sad_x4_%1x%2_%4 +.split: +%ifdef ARCH_X86_64 + mov r11, r6m + push r4 + push r3 + push r2 + mov r2, r1 + mov r1, FENC_STRIDE + mov r3, r5 + mov r10, r0 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11], eax + pop r2 + mov r0, r10 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11+4], eax + pop r2 + mov r0, r10 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11+8], eax + pop r2 + mov r0, r10 + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [r11+12], eax +%else + push edi + mov edi, [esp+32] + push dword [esp+28] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+40] + mov [edi+8], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [edi+12], eax + add esp, 16 + pop edi +%endif + ret +%endmacro + +%macro SADX34_CACHELINE_FUNC 5 + SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5 + SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5 +%endmacro + + +; instantiate the aligned sads + +%ifndef ARCH_X86_64 +SAD16_CACHELINE_FUNC_MMX2 8, 32 +SAD16_CACHELINE_FUNC_MMX2 16, 32 +SAD8_CACHELINE_FUNC_MMX2 4, 32 +SAD8_CACHELINE_FUNC_MMX2 8, 32 +SAD8_CACHELINE_FUNC_MMX2 16, 32 +SAD16_CACHELINE_FUNC_MMX2 8, 64 +SAD16_CACHELINE_FUNC_MMX2 16, 64 +%endif ; !ARCH_X86_64 +SAD8_CACHELINE_FUNC_MMX2 4, 64 +SAD8_CACHELINE_FUNC_MMX2 8, 64 +SAD8_CACHELINE_FUNC_MMX2 16, 64 + +%ifndef ARCH_X86_64 +SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext +%endif ; !ARCH_X86_64 +SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext + +%ifndef ARCH_X86_64 +SAD16_CACHELINE_FUNC sse2, 8 +SAD16_CACHELINE_FUNC sse2, 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP_SSE2 i +%assign i i+1 +%endrep +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2 
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2 +%endif ; !ARCH_X86_64 + +%ifdef HAVE_SSE3 +SAD16_CACHELINE_FUNC ssse3, 8 +SAD16_CACHELINE_FUNC ssse3, 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP_SSSE3 i +%assign i i+1 +%endrep +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3 +SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3 +%endif ; HAVE_SSE3 diff --git a/common/i386/i386inc.asm b/common/x86/x86inc-32.asm similarity index 82% rename from common/i386/i386inc.asm rename to common/x86/x86inc-32.asm index c5a97b46..49895fc7 100644 --- a/common/i386/i386inc.asm +++ b/common/x86/x86inc-32.asm @@ -1,7 +1,7 @@ ;***************************************************************************** -;* i386inc.asm: h264 encoder library +;* x86inc-32.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2006 x264 project +;* Copyright (C) 2006-2008 x264 project ;* ;* Author: Sam Hocevar ;* @@ -22,31 +22,6 @@ BITS 32 -;============================================================================= -; Macros and other preprocessor constants -;============================================================================= - -; Symbol prefix for C linkage -%macro cglobal 1 - %ifdef PREFIX - global _%1 - %define %1 _%1 - %else - global %1 - %endif - align 16 - %1: -%endmacro - -%macro cextern 1 - %ifdef PREFIX - extern _%1 - %define %1 _%1 - %else - extern %1 - %endif -%endmacro - ; Name of the .rodata section. On OS X we cannot use .rodata because NASM ; is unable to compute address offsets outside of .text so we use the .text ; section instead until NASM is fixed. @@ -93,6 +68,7 @@ BITS 32 ; mov eax, [esp + 12] ; %ifdef __PIC__ + %define PIC32 %ifidn __OUTPUT_FORMAT__,macho ; There is no real global offset table on OS X, but we still ; need to reference our variables by offset. @@ -140,12 +116,3 @@ BITS 32 %define picesp esp %endif -%assign FENC_STRIDE 16 -%assign FDEC_STRIDE 32 - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION ".note.GNU-stack" noalloc noexec nowrite progbits -%endif - diff --git a/common/amd64/amd64inc.asm b/common/x86/x86inc-64.asm similarity index 90% rename from common/amd64/amd64inc.asm rename to common/x86/x86inc-64.asm index e6d6632b..f09a0dc0 100644 --- a/common/amd64/amd64inc.asm +++ b/common/x86/x86inc-64.asm @@ -1,7 +1,7 @@ ;***************************************************************************** -;* amd64inc.asm: h264 encoder library +;* x86inc-64.asm: h264 encoder library ;***************************************************************************** -;* Copyright (C) 2005 x264 project +;* Copyright (C) 2005-2008 x264 project ;* ;* Authors: Andrew Dunstan ;* @@ -24,31 +24,8 @@ BITS 64 ; FIXME: All of the 64bit asm functions that take a stride as an argument ; via register, assume that the high dword of that register is filled with 0. -; This is true in practice (since we never do any 64bit arithmetic on strides), -; but is not guaranteed by the ABI. - -%macro cglobal 1 - %ifdef PREFIX - global _%1:function hidden - %define %1 _%1 - %else - global %1:function hidden - %endif -%ifdef WIN64 - %define %1 pad %1 -%endif - align 16 - %1: -%endmacro - -%macro cextern 1 - %ifdef PREFIX - extern _%1 - %define %1 _%1 - %else - extern %1 - %endif -%endmacro +; This is true in practice (since we never do any 64bit arithmetic on strides, +; and x264's strides are all positive), but is not guaranteed by the ABI. ; Name of the .rodata section. 
On OS X we cannot use .rodata because YASM ; is unable to compute address offsets outside of .text so we use the .text @@ -300,15 +277,10 @@ SECTION .text ; %ifdef __PIC__ %define GLOBAL wrt rip + %define PIC64 %else %define GLOBAL %endif -%assign FENC_STRIDE 16 -%assign FDEC_STRIDE 32 - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __YASM_OBJFMT__,elf -section ".note.GNU-stack" noalloc noexec nowrite progbits -%endif +%macro picgetgot 1 +%endmacro diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm new file mode 100644 index 00000000..2453552f --- /dev/null +++ b/common/x86/x86inc.asm @@ -0,0 +1,328 @@ +;***************************************************************************** +;* x86inc.asm +;***************************************************************************** +;* Copyright (C) 2008 Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%ifdef WIN64 +%define ARCH_X86_64 +%endif + +%ifdef ARCH_X86_64 +%include "x86inc-64.asm" +%else +%include "x86inc-32.asm" +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. +; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. +; PROLOGUE can also be invoked by adding the same options to cglobal + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. 
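From the C side none of this machinery is visible: the same prototype is used on x86_32, x86_64 SysV and Win64, and cglobal/PROLOGUE decide whether an argument arrives on the stack (x86_32, and later arguments elsewhere) or in rcx/rdx/r8/r9 (Win64) or rdi/rsi/rdx/rcx/r8/r9 (SysV), mapping all of them onto r0..r6 for the body of the function. A minimal caller, using one of the SAD functions from earlier in this patch, might look like the sketch below; the test values and the main() harness are invented, it assumes the program is linked against the assembled object, and the prototype is the one given in the comments above the SAD functions.

#include <stdint.h>
#include <stdio.h>

/* same prototype on every x86 target; the ABI differences are absorbed
 * by cglobal/PROLOGUE on the asm side */
int x264_pixel_sad_16x8_sse2( uint8_t *pix1, int i_stride_pix1,
                              uint8_t *pix2, int i_stride_pix2 );

int main( void )
{
    /* the asm uses aligned loads (psadbw with a memory operand) on the first
     * pointer, so keep it 16-byte aligned; aligned() is GCC/clang syntax */
    static uint8_t fenc[16*8] __attribute__((aligned(16)));
    static uint8_t fref[16*8];
    for( int i = 0; i < 16*8; i++ )
    {
        fenc[i] = i & 0xff;
        fref[i] = (i*7+3) & 0xff;
    }
    /* these four arguments become r0, r1, r2, r3 regardless of ABI */
    printf( "sad = %d\n", x264_pixel_sad_16x8_sse2( fenc, 16, fref, 16 ) );
    return 0;
}

On the asm side the corresponding declaration is simply cglobal x264_pixel_sad_16x8_%1, 4,4: four arguments, four registers, so PROLOGUE emits stack loads (and a callee-saved push) only where the ABI requires them, and the paired RET/REP_RET pops whatever was pushed.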
+ +%macro DECLARE_REG 5 + %define r%1q %2 + %define r%1d %3 + %define r%1w %4 + ; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it + %define r%1m %5 + %define r%1 r%1q +%endmacro + +%macro DECLARE_REG_SIZE 1 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 +%ifndef ARCH_X86_64 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax +DECLARE_REG_SIZE bx +DECLARE_REG_SIZE cx +DECLARE_REG_SIZE dx +DECLARE_REG_SIZE si +DECLARE_REG_SIZE di +DECLARE_REG_SIZE bp + +%ifdef ARCH_X86_64 + %define push_size 8 +%else + %define push_size 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+push_size +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-push_size +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%ifdef WIN64 ;================================================================ + +DECLARE_REG 0, rcx, ecx, cx, ecx +DECLARE_REG 1, rdx, edx, dx, edx +DECLARE_REG 2, r8, r8d, r8w, r8d +DECLARE_REG 3, r9, r9d, r9w, r9d +DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40] +DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48] +DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56] +%define r7m [rsp + stack_offset + 64] + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [rsp + 8 + %1*8] + %endif +%endmacro + +%macro PROLOGUE 3 + ASSERT %2 >= %1 + ASSERT %2 <= 7 + %assign stack_offset 0 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 +%endmacro + +%macro RET 0 + ret +%endmacro + +%macro REP_RET 0 + rep ret +%endmacro + +%elifdef ARCH_X86_64 ;======================================================== + +DECLARE_REG 0, rdi, edi, di, edi +DECLARE_REG 1, rsi, esi, si, esi +DECLARE_REG 2, rdx, edx, dx, edx +DECLARE_REG 3, rcx, ecx, cx, ecx +DECLARE_REG 4, r8, r8d, r8w, r8d +DECLARE_REG 5, r9, r9d, r9w, r9d +DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8] +%define r7m [rsp + stack_offset + 16] + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [rsp - 40 + %1*8] + %endif +%endmacro + +%macro PROLOGUE 3 + ASSERT %2 >= %1 + ASSERT %2 <= 7 + %assign stack_offset 0 + LOAD_IF_USED 6, %1 +%endmacro + +%macro RET 0 + ret +%endmacro + +%macro REP_RET 0 + rep ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4] +DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8] +DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12] +DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16] +DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20] +DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24] +DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28] +%define r7m [esp + stack_offset + 32] +%define rsp esp + +%macro PUSH_IF_USED 1 ; reg_id + %if %1 < regs_used + push r%1 + %assign stack_offset stack_offset+4 + %endif +%endmacro + +%macro POP_IF_USED 1 ; reg_id + %if %1 < regs_used + pop r%1 + %endif +%endmacro + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [esp + stack_offset + 4 + %1*4] + %endif 
+%endmacro + +%macro PROLOGUE 3 + ASSERT %2 >= %1 + %assign stack_offset 0 + %assign regs_used %2 + %if %3 + %assign regs_used regs_used+1 + %endif + ASSERT regs_used <= 7 + PUSH_IF_USED 3 + PUSH_IF_USED 4 + PUSH_IF_USED 5 + PUSH_IF_USED 6 + LOAD_IF_USED 0, %1 + LOAD_IF_USED 1, %1 + LOAD_IF_USED 2, %1 + LOAD_IF_USED 3, %1 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + %if %3 + picgetgot r%2 + %endif +%endmacro + +%macro RET 0 + POP_IF_USED 6 + POP_IF_USED 5 + POP_IF_USED 4 + POP_IF_USED 3 + ret +%endmacro + +%macro REP_RET 0 + %if regs_used > 3 + RET + %else + rep ret + %endif +%endmacro + +%endif ;====================================================================== + + + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Symbol prefix for C linkage +%macro cglobal 1 + %ifidn __OUTPUT_FORMAT__,elf + %ifdef PREFIX + global _%1:function hidden + %define %1 _%1 + %else + global %1:function hidden + %endif + %else + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif + %endif +%ifdef WIN64 + %define %1 pad %1 +%endif + align function_align + %1: +%endmacro + +%macro cglobal 3 + cglobal %1 + PROLOGUE %2, %3, 0 +%endmacro + +%macro cglobal 4 + cglobal %1 + PROLOGUE %2, %3, %4 +%endmacro + +%macro cextern 1 + %ifdef PREFIX + extern _%1 + %define %1 _%1 + %else + extern %1 + %endif +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. +%ifidn __OUTPUT_FORMAT__,elf +SECTION ".note.GNU-stack" noalloc noexec nowrite progbits +%endif + +%assign FENC_STRIDE 16 +%assign FDEC_STRIDE 32 + diff --git a/tools/checkasm.c b/tools/checkasm.c index 4fbc6a34..dc500b40 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -3,15 +3,6 @@ #include "common/common.h" #include "common/cpu.h" -#ifdef HAVE_MMX -#include "common/i386/pixel.h" -#include "common/i386/dct.h" -#include "common/i386/mc.h" -#endif -#ifdef ARCH_PPC -#include "common/ppc/pixel.h" -#include "common/ppc/mc.h" -#endif /* buf1, buf2: initialised to random data and shouldn't write into them */ uint8_t * buf1, * buf2; @@ -169,10 +160,19 @@ static int check_pixel( int cpu_ref, int cpu_new ) for( j=0; j<4; j++ ) dc[j] = rand() & 0x3fff; used_asm = 1; - mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 32, thresh ); - mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 32, thresh ); + mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 28, thresh ); + mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 28, thresh ); if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) ) + { ok = 0; + printf("c%d: ", i&3); + for(j=0; j 1 ) ? atoi(argv[1]) : x264_mdate(); fprintf( stderr, "x264: using random seed %u\n", i );