From: Holger Lubitz
Date: Wed, 30 Jul 2008 03:26:58 +0000 (-0600)
Subject: Refactor asm macros part 1: DCT
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=60f7c47de10a240cb50568996ff8232726c19881;p=libx264

Refactor asm macros part 1: DCT
---

diff --git a/Makefile b/Makefile
index 8606ecac..e53de9d7 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,7 @@ endif
 ifneq ($(AS),)
 X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
           pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
-          cpu-32.asm dct-32.asm
+          cpu-32.asm dct-32.asm x86util.asm
 X86SRC = $(X86SRC0:%=common/x86/%)
 
 ifeq ($(ARCH),X86)
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index 3c6df968..b8f7832d 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -24,6 +24,7 @@
 ;*****************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
 
@@ -31,12 +32,6 @@ pw_32: times 8 dw 32
 
 SECTION .text
 
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
-
 %macro SBUTTERFLY 4
     mova       m%4, m%2
     punpckl%1  m%2, m%3
@@ -52,23 +47,6 @@ SECTION .text
     SWAP %2, %3
 %endmacro
 
-%macro LOAD_DIFF_8P 4
-    movh       %1, %3
-    movh       %2, %4
-    punpcklbw  %1, %2
-    punpcklbw  %2, %2
-    psubw      %1, %2
-%endmacro
-
-%macro STORE_DIFF_8P 4
-    psraw      %1, 6
-    movh       %3, %2
-    punpcklbw  %3, %4
-    paddsw     %1, %3
-    packuswb   %1, %1
-    movh       %2, %1
-%endmacro
-
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
@@ -175,15 +153,15 @@ SECTION .text
 INIT_MMX
 ALIGN 16
 load_diff_4x8_mmx:
-    LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
     movq  [r0], m0
-    LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
     movq  m0, [r0]
     ret
 
@@ -412,15 +390,15 @@ INIT_XMM
 cglobal x264_sub8x8_dct8_sse2, 3,3
 global x264_sub8x8_dct8_sse2 %+ .skip_prologue
 .skip_prologue:
-    LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
     SPILL r0, 0
-    LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
     UNSPILL r0, 0
     DCT8_1D 0,1,2,3,4,5,6,7,r0
     UNSPILL r0, 0,4
@@ -446,14 +424,14 @@ global x264_add8x8_idct8_sse2 %+ .skip_prologue
     IDCT8_1D 0,1,2,3,4,5,6,7,r1
     SPILL r1, 6,7
     pxor  m7, m7
-    STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7
-    STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7
-    STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7
-    STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7
-    STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7
-    STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7
+    STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0]
+    STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1]
+    STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2]
+    STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3]
+    STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4]
+    STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5]
     UNSPILL_SHUFFLE r1, 0,1, 6,7
-    STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7
-    STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7
+    STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6]
+    STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7]
     ret
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index a6d753e8..383a9001 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -23,6 +23,7 @@
 ;*****************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
 pw_32: times 8 dw 32
@@ -31,20 +32,6 @@ SECTION .text
 
 INIT_XMM
 
-%macro LOAD_DIFF_8P 5
-    movq       %1, %4
-    punpcklbw  %1, %3
-    movq       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endmacro
-
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
-
 %macro SBUTTERFLY 4
     mova       m%4, m%2
     punpckl%1  m%2, m%3
@@ -69,15 +56,6 @@ INIT_XMM
     SWAP %4, %7
 %endmacro
 
-%macro STORE_DIFF_8P 4
-    psraw      %1, 6
-    movq       %2, %4
-    punpcklbw  %2, %3
-    paddsw     %1, %2
-    packuswb   %1, %1
-    movq       %4, %1
-%endmacro
-
 SECTION .text
 
 %macro DCT8_1D 10
@@ -136,14 +114,14 @@
 ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 cglobal x264_sub8x8_dct8_sse2
-    LOAD_DIFF_8P m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
-    LOAD_DIFF_8P m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
 
     DCT8_1D 0,1,2,3,4,5,6,7,8,9
     TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
@@ -232,14 +210,14 @@ cglobal x264_add8x8_idct8_sse2
 
     IDCT8_1D 0,1,2,3,4,5,6,7,8,9
     pxor  m9, m9
-    STORE_DIFF_8P m0, m8, m9, [r0+0*FDEC_STRIDE]
-    STORE_DIFF_8P m1, m8, m9, [r0+1*FDEC_STRIDE]
-    STORE_DIFF_8P m2, m8, m9, [r0+2*FDEC_STRIDE]
-    STORE_DIFF_8P m3, m8, m9, [r0+3*FDEC_STRIDE]
-    STORE_DIFF_8P m4, m8, m9, [r0+4*FDEC_STRIDE]
-    STORE_DIFF_8P m5, m8, m9, [r0+5*FDEC_STRIDE]
-    STORE_DIFF_8P m6, m8, m9, [r0+6*FDEC_STRIDE]
-    STORE_DIFF_8P m7, m8, m9, [r0+7*FDEC_STRIDE]
+    STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
+    STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
+    STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE]
+    STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE]
+    STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE]
+    STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
+    STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
+    STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
     ret
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 8026cec0..59fa6ffc 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -23,6 +23,7 @@
 ;*****************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
 pw_1:  times 8 dw 1
@@ -31,46 +32,6 @@ pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
 
 SECTION .text
 
-%macro LOAD_DIFF_4P 5
-    movh       %1, %4
-    punpcklbw  %1, %3
-    movh       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endmacro
-
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
-
-%macro SUMSUB_BADC 4
-    paddw   %1, %2
-    paddw   %3, %4
-    paddw   %2, %2
-    paddw   %4, %4
-    psubw   %2, %1
-    psubw   %4, %3
-%endmacro
-
-%macro SUMSUB2_AB 3
-    mova    %3, %1
-    paddw   %1, %1
-    paddw   %1, %2
-    psubw   %3, %2
-    psubw   %3, %2
-%endmacro
-
-%macro SUMSUBD2_AB 4
-    mova    %4, %1
-    mova    %3, %2
-    psraw   %2, 1
-    psraw   %4, 1
-    paddw   %1, %2
-    psubw   %4, %3
-%endmacro
-
 %macro SBUTTERFLY 4
     mova       m%4, m%2
    punpckl%1  m%2, m%3
@@ -95,15 +56,6 @@ SECTION .text
     SBUTTERFLY qdq, %3, %4, %5
 %endmacro
 
-%macro STORE_DIFF_4P 4
-    psraw      %1, 6
-    movh       %2, %4
-    punpcklbw  %2, %3
-    paddsw     %1, %2
-    packuswb   %1, %1
-    movh       %4, %1
-%endmacro
-
 %macro HADAMARD4_1D 4
     SUMSUB_BADC m%2, m%1, m%4, m%3
     SUMSUB_BADC m%4, m%2, m%3, m%1
@@ -173,10 +125,10 @@ cglobal x264_idct4x4dc_mmx, 1,1
 cglobal x264_sub4x4_dct_mmx, 3,3
 .skip_prologue:
 %macro SUB_DCT4 1
-    LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
     DCT4_1D 0,1,2,3,4
     TRANSPOSE%1 0,1,2,3,4
     DCT4_1D 0,1,2,3,4
@@ -203,10 +155,10 @@ cglobal x264_add4x4_idct_mmx, 2,2,1
     paddw m0, [pw_32 GLOBAL]
     IDCT4_1D 0,1,2,3,4,5
     pxor  m7, m7
-    STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
-    STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
-    STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
-    STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
+    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
+    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
+    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
+    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
 %endmacro
 ADD_IDCT4 4x4W
     RET
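Note: common/x86/x86util.asm is added to the source list above, but its contents do not appear in this diff. Inferred from the per-file macros removed above and from the new call sites (which pass either a zeroed register or "none" as the third LOAD_DIFF argument), the shared LOAD_DIFF/STORE_DIFF definitions are presumably along the following lines; this is a sketch, and the %ifidn dispatch in particular is an assumption, not text from the commit.

; Assumed x86util.asm definitions (sketch).
; LOAD_DIFF: %3 is a zeroed register, or "none" to use the variant without a
; zero register (the old 4-argument LOAD_DIFF_8P from dct-32.asm).
%macro LOAD_DIFF 5 ; dst, tmp, zero-or-none, [pix1], [pix2]
%ifidn %3, none
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2    ; dst words = p1[i] | (p2[i] << 8)
    punpcklbw  %2, %2    ; tmp words = p2[i] | (p2[i] << 8)
    psubw      %1, %2    ; high bytes cancel -> words = p1[i] - p2[i]
%else
    movh       %1, %4
    punpcklbw  %1, %3    ; zero-extend pix1 bytes to words
    movh       %2, %5
    punpcklbw  %2, %3    ; zero-extend pix2 bytes to words
    psubw      %1, %2
%endif
%endmacro

%macro STORE_DIFF 4 ; coeffs, tmp, zero, [dst]
    psraw      %1, 6     ; descale idct output
    movh       %2, %4
    punpcklbw  %2, %3    ; zero-extend current pixels to words
    paddsw     %1, %2    ; add residual with signed saturation
    packuswb   %1, %1    ; clip to 0..255 and repack to bytes
    movh       %4, %1
%endmacro

The macro bodies above are taken directly from the removed LOAD_DIFF_8P/LOAD_DIFF_4P and STORE_DIFF_8P/STORE_DIFF_4P variants (the SUMSUB_* helpers presumably move to the shared file as well), so every call-site change in this diff is mechanical: same operands, new macro name and argument order.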