From: Loren Merritt Date: Mon, 7 Apr 2008 16:22:03 +0000 (-0600) Subject: macros to deal with macros that permute their arguments X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=56bf7565f2743e4fe85763388cb74b75b9bf41c5;p=libx264 macros to deal with macros that permute their arguments --- diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm index 260bcf03..a68f122a 100644 --- a/common/x86/dct-32.asm +++ b/common/x86/dct-32.asm @@ -46,18 +46,19 @@ SECTION .text psubw %4, %3 %endmacro -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 +%macro SBUTTERFLY 4 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 + SWAP %3, %4 %endmacro -; input ABCD output ADTC %macro TRANSPOSE4x4W 5 - SBUTTERFLY q, wd, %1, %2, %5 - SBUTTERFLY q, wd, %3, %4, %2 - SBUTTERFLY q, dq, %1, %3, %4 - SBUTTERFLY q, dq, %5, %2, %3 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 %endmacro ; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2) @@ -306,45 +307,45 @@ x264_pixel_add_8x8_mmx: ;----------------------------------------------------------------------------- ALIGN 16 x264_transpose_8x8_mmx: - movq mm0, [r0 ] - movq mm1, [r0+ 16] - movq mm2, [r0+ 32] - movq mm3, [r0+ 48] - TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 - movq [r0 ], mm0 - movq [r0+ 16], mm3 - movq [r0+ 32], mm4 - movq [r0+ 48], mm2 - - movq mm0, [r0+ 72] - movq mm1, [r0+ 88] - movq mm2, [r0+104] - movq mm3, [r0+120] - TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 - movq [r0+ 72], mm0 - movq [r0+ 88], mm3 - movq [r0+104], mm4 - movq [r0+120], mm2 - - movq mm0, [r0+ 8] - movq mm1, [r0+ 24] - movq mm2, [r0+ 40] - movq mm3, [r0+ 56] - TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4 - movq mm1, [r0+ 64] - movq mm5, [r0+ 80] - movq mm6, [r0+ 96] - movq mm7, [r0+112] - - movq [r0+ 64], mm0 - movq [r0+ 80], mm3 - movq [r0+ 96], mm4 - movq [r0+112], mm2 - TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4 - movq [r0+ 8], mm1 - movq [r0+ 24], mm7 - movq [r0+ 40], mm4 - movq [r0+ 56], mm6 + movq m0, [r0 ] + movq m1, [r0+ 16] + movq m2, [r0+ 32] + movq m3, [r0+ 48] + TRANSPOSE4x4W 0,1,2,3,4 + movq [r0 ], m0 + movq [r0+ 16], m1 + movq [r0+ 32], m2 + movq [r0+ 48], m3 + + movq m0, [r0+ 72] + movq m1, [r0+ 88] + movq m2, [r0+104] + movq m3, [r0+120] + TRANSPOSE4x4W 0,1,2,3,4 + movq [r0+ 72], m0 + movq [r0+ 88], m1 + movq [r0+104], m2 + movq [r0+120], m3 + + movq m0, [r0+ 8] + movq m1, [r0+ 24] + movq m2, [r0+ 40] + movq m3, [r0+ 56] + TRANSPOSE4x4W 0,1,2,3,4 + movq m4, [r0+ 64] + movq m5, [r0+ 80] + movq m6, [r0+ 96] + movq m7, [r0+112] + + movq [r0+ 64], m0 + movq [r0+ 80], m1 + movq [r0+ 96], m2 + movq [r0+112], m3 + TRANSPOSE4x4W 4,5,6,7,0 + movq [r0+ 8], m4 + movq [r0+ 24], m5 + movq [r0+ 40], m6 + movq [r0+ 56], m7 ret ;----------------------------------------------------------------------------- @@ -369,123 +370,125 @@ cglobal x264_add8x8_idct8_mmx, 0,1 mov r0, r0m jmp x264_pixel_add_8x8_mmx +INIT_XMM + %macro IDCT8_1D 8 - movdqa %1, %3 - movdqa %5, %7 - psraw %3, 1 - psraw %7, 1 - psubw %3, %5 - paddw %7, %1 - movdqa %5, %2 - psraw %5, 1 - paddw %5, %2 - paddw %5, %4 - paddw %5, %6 - movdqa %1, %6 - psraw %1, 1 - paddw %1, %6 - paddw %1, %8 - psubw %1, %2 - psubw %2, %4 - psubw %6, %4 - paddw %2, %8 - psubw %6, %8 - psraw %4, 1 - psraw %8, 1 - psubw %2, %4 - psubw %6, %8 - movdqa %4, %5 - movdqa %8, %1 - psraw %4, 2 - psraw %8, 2 - paddw %4, %6 - paddw %8, %2 - psraw %6, 2 - psraw %2, 2 - psubw %5, %6 - psubw %2, %1 - movdqa %1, [eax+0x00] - movdqa %6, 
[eax+0x40] - SUMSUB_BA %6, %1 - SUMSUB_BA %7, %6 - SUMSUB_BA %3, %1 - SUMSUB_BA %5, %7 - SUMSUB_BA %2, %3 - SUMSUB_BA %8, %1 - SUMSUB_BA %4, %6 + movdqa m%1, m%3 + movdqa m%5, m%7 + psraw m%3, 1 + psraw m%7, 1 + psubw m%3, m%5 + paddw m%7, m%1 + movdqa m%5, m%2 + psraw m%5, 1 + paddw m%5, m%2 + paddw m%5, m%4 + paddw m%5, m%6 + movdqa m%1, m%6 + psraw m%1, 1 + paddw m%1, m%6 + paddw m%1, m%8 + psubw m%1, m%2 + psubw m%2, m%4 + psubw m%6, m%4 + paddw m%2, m%8 + psubw m%6, m%8 + psraw m%4, 1 + psraw m%8, 1 + psubw m%2, m%4 + psubw m%6, m%8 + movdqa m%4, m%5 + movdqa m%8, m%1 + psraw m%4, 2 + psraw m%8, 2 + paddw m%4, m%6 + paddw m%8, m%2 + psraw m%6, 2 + psraw m%2, 2 + psubw m%5, m%6 + psubw m%2, m%1 + movdqa m%1, [r1+0x00] + movdqa m%6, [r1+0x40] + SUMSUB_BA m%6, m%1 + SUMSUB_BA m%7, m%6 + SUMSUB_BA m%3, m%1 + SUMSUB_BA m%5, m%7 + SUMSUB_BA m%2, m%3 + SUMSUB_BA m%8, m%1 + SUMSUB_BA m%4, m%6 + SWAP %1, %5, %6 + SWAP %3, %8, %7 %endmacro -%macro TRANSPOSE8 9 - movdqa [%9], %8 - SBUTTERFLY dqa, wd, %1, %2, %8 - movdqa [%9+16], %8 - movdqa %8, [%9] - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, wd, %5, %6, %4 - SBUTTERFLY dqa, wd, %7, %8, %6 - SBUTTERFLY dqa, dq, %1, %3, %8 - movdqa [%9], %8 - movdqa %8, [16+%9] - SBUTTERFLY dqa, dq, %8, %2, %3 - SBUTTERFLY dqa, dq, %5, %7, %2 - SBUTTERFLY dqa, dq, %4, %6, %7 - SBUTTERFLY dqa, qdq, %1, %5, %6 - SBUTTERFLY dqa, qdq, %8, %4, %5 - movdqa [%9+16], %8 - movdqa %8, [%9] - SBUTTERFLY dqa, qdq, %8, %2, %4 - SBUTTERFLY dqa, qdq, %3, %7, %2 - movdqa %7, [%9+16] +; in: m0..m7 +; out: all except m4, which is in [%9+0x40] +%macro TRANSPOSE8x8W 9 + movdqa [%9], m%8 + SBUTTERFLY wd, %1, %2, %8 + movdqa [%9+16], m%2 + movdqa m%8, [%9] + SBUTTERFLY wd, %3, %4, %2 + SBUTTERFLY wd, %5, %6, %2 + SBUTTERFLY wd, %7, %8, %2 + SBUTTERFLY dq, %1, %3, %2 + movdqa [%9], m%3 + movdqa m%2, [16+%9] + SBUTTERFLY dq, %2, %4, %3 + SBUTTERFLY dq, %5, %7, %3 + SBUTTERFLY dq, %6, %8, %3 + SBUTTERFLY qdq, %1, %5, %3 + SBUTTERFLY qdq, %2, %6, %3 + movdqa [%9+0x40], m%2 + movdqa m%3, [%9] + SBUTTERFLY qdq, %3, %7, %2 + SBUTTERFLY qdq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 %endmacro ;----------------------------------------------------------------------------- ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2 - mov ecx, [esp+4] - mov eax, [esp+8] - movdqa xmm1, [eax+0x10] - movdqa xmm2, [eax+0x20] - movdqa xmm3, [eax+0x30] - movdqa xmm5, [eax+0x50] - movdqa xmm6, [eax+0x60] - movdqa xmm7, [eax+0x70] - IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 - TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax +cglobal x264_add8x8_idct8_sse2, 2,2 + movdqa m1, [r1+0x10] + movdqa m2, [r1+0x20] + movdqa m3, [r1+0x30] + movdqa m5, [r1+0x50] + movdqa m6, [r1+0x60] + movdqa m7, [r1+0x70] + IDCT8_1D 0,1,2,3,4,5,6,7 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r1 picgetgot edx - paddw xmm4, [pw_32 GLOBAL] - movdqa [eax+0x00], xmm4 - movdqa [eax+0x40], xmm2 - IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1 - movdqa [eax+0x60], xmm6 - movdqa [eax+0x70], xmm7 - pxor xmm7, xmm7 - STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7 - STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7 - STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7 - STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7 - STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7 - STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7 - movdqa xmm0, [eax+0x60] - movdqa xmm1, [eax+0x70] - 
STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7 - STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7 + paddw m0, [pw_32 GLOBAL] + movdqa [r1+0x00], m0 +; movdqa [r1+0x40], m4 ; still there from transpose + IDCT8_1D 0,1,2,3,4,5,6,7 + movdqa [r1+0x60], m6 + movdqa [r1+0x70], m7 + pxor m7, m7 + STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7 + STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7 + STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7 + STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7 + STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7 + STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7 + movdqa m0, [r1+0x60] + movdqa m1, [r1+0x70] + STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7 + STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7 ret ;----------------------------------------------------------------------------- ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro SUB_NxN_DCT 4 -cglobal %1 - mov edx, [esp+12] - mov ecx, [esp+ 8] - mov eax, [esp+ 4] - add edx, %4 - add ecx, %4 - add eax, %3 - push edx - push ecx - push eax +cglobal %1, 3,3 + add r2, %4 + add r1, %4 + add r0, %3 + push r2 + push r1 + push r0 call %2 add dword [esp+0], %3 add dword [esp+4], %4*FENC_STRIDE-%4 @@ -503,13 +506,11 @@ cglobal %1 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- %macro ADD_NxN_IDCT 4 -cglobal %1 - mov ecx, [esp+8] - mov eax, [esp+4] - add ecx, %3 - add eax, %4 - push ecx - push eax +cglobal %1, 2,2 + add r1, %3 + add r0, %4 + push r1 + push r0 call %2 add dword [esp+0], %4*FDEC_STRIDE-%4 add dword [esp+4], %3 diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm index e3825204..fcc8a445 100644 --- a/common/x86/dct-64.asm +++ b/common/x86/dct-64.asm @@ -29,6 +29,8 @@ pw_32: times 8 dw 32 SECTION .text +INIT_XMM + %macro LOAD_DIFF_8P 5 movq %1, %4 punpcklbw %1, %3 @@ -43,28 +45,28 @@ SECTION .text psubw %2, %1 %endmacro -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 +%macro SBUTTERFLY 4 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 + SWAP %3, %4 %endmacro -;----------------------------------------------------------------------------- -; input ABCDEFGH output AFHDTECB -;----------------------------------------------------------------------------- %macro TRANSPOSE8x8W 9 - SBUTTERFLY dqa, wd, %1, %2, %9 - SBUTTERFLY dqa, wd, %3, %4, %2 - SBUTTERFLY dqa, wd, %5, %6, %4 - SBUTTERFLY dqa, wd, %7, %8, %6 - SBUTTERFLY dqa, dq, %1, %3, %8 - SBUTTERFLY dqa, dq, %9, %2, %3 - SBUTTERFLY dqa, dq, %5, %7, %2 - SBUTTERFLY dqa, dq, %4, %6, %7 - SBUTTERFLY dqa, qdq, %1, %5, %6 - SBUTTERFLY dqa, qdq, %9, %4, %5 - SBUTTERFLY dqa, qdq, %8, %2, %4 - SBUTTERFLY dqa, qdq, %3, %7, %2 + SBUTTERFLY wd, %1, %2, %9 + SBUTTERFLY wd, %3, %4, %9 + SBUTTERFLY wd, %5, %6, %9 + SBUTTERFLY wd, %7, %8, %9 + SBUTTERFLY dq, %1, %3, %9 + SBUTTERFLY dq, %2, %4, %9 + SBUTTERFLY dq, %5, %7, %9 + SBUTTERFLY dq, %6, %8, %9 + SBUTTERFLY qdq, %1, %5, %9 + SBUTTERFLY qdq, %2, %6, %9 + SBUTTERFLY qdq, %3, %7, %9 + SBUTTERFLY qdq, %4, %8, %9 + SWAP %2, %5 + SWAP %4, %7 %endmacro %macro STORE_DIFF_8P 4 @@ -78,166 +80,166 @@ SECTION .text SECTION .text -; in: ABCDEFGH -; out: FBCGEDHI %macro DCT8_1D 10 - SUMSUB_BA %8, %1 ; %8=s07, %1=d07 - SUMSUB_BA %7, %2 ; %7=s16, %2=d16 - SUMSUB_BA %6, %3 ; %6=s25, %3=d25 - SUMSUB_BA %5, %4 ; %5=s34, %4=d34 - - SUMSUB_BA %5, %8 ; %5=a0, %8=a2 - SUMSUB_BA %6, %7 ; %6=a1, %7=a3 - - movdqa 
%9, %1 - psraw %9, 1 - paddw %9, %1 - paddw %9, %2 - paddw %9, %3 ; %9=a4 - - movdqa %10, %4 - psraw %10, 1 - paddw %10, %4 - paddw %10, %2 - psubw %10, %3 ; %10=a7 - - SUMSUB_BA %4, %1 - psubw %1, %3 - psubw %4, %2 - psraw %3, 1 - psraw %2, 1 - psubw %1, %3 ; %1=a5 - psubw %4, %2 ; %4=a6 - - SUMSUB_BA %6, %5 ; %6=b0, %5=b4 - - movdqa %2, %10 - psraw %2, 2 - paddw %2, %9 ; %2=b1 - psraw %9, 2 - psubw %9, %10 ; %9=b7 - - movdqa %3, %7 - psraw %3, 1 - paddw %3, %8 ; %3=b2 - psraw %8, 1 - psubw %8, %7 ; %8=b6 - - movdqa %7, %4 - psraw %7, 2 - paddw %7, %1 ; %7=b3 - psraw %1, 2 - psubw %4, %1 ; %4=b5 + SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07 + SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16 + SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25 + SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34 + + SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2 + SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3 + + movdqa m%9, m%1 + psraw m%9, 1 + paddw m%9, m%1 + paddw m%9, m%2 + paddw m%9, m%3 ; %9=a4 + + movdqa m%10, m%4 + psraw m%10, 1 + paddw m%10, m%4 + paddw m%10, m%2 + psubw m%10, m%3 ; %10=a7 + + SUMSUB_BA m%4, m%1 + psubw m%1, m%3 + psubw m%4, m%2 + psraw m%3, 1 + psraw m%2, 1 + psubw m%1, m%3 ; %1=a5 + psubw m%4, m%2 ; %4=a6 + + SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4 + + movdqa m%2, m%10 + psraw m%2, 2 + paddw m%2, m%9 ; %2=b1 + psraw m%9, 2 + psubw m%9, m%10 ; %9=b7 + + movdqa m%3, m%7 + psraw m%3, 1 + paddw m%3, m%8 ; %3=b2 + psraw m%8, 1 + psubw m%8, m%7 ; %8=b6 + + movdqa m%7, m%4 + psraw m%7, 2 + paddw m%7, m%1 ; %7=b3 + psraw m%1, 2 + psubw m%4, m%1 ; %4=b5 + + SWAP %1, %6, %4, %7, %8, %9 %endmacro ;----------------------------------------------------------------------------- ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal x264_sub8x8_dct8_sse2 - pxor xmm9, xmm9 - LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE] - LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE] - LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE] - LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE] - LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE] - LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE] - LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE] - LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE] - - DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9 - TRANSPOSE8x8W xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0 - DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9 - - movdqa [parm1q+0x00], xmm4 - movdqa [parm1q+0x10], xmm3 - movdqa [parm1q+0x20], xmm8 - movdqa [parm1q+0x30], xmm2 - movdqa [parm1q+0x40], xmm0 - movdqa [parm1q+0x50], xmm6 - movdqa [parm1q+0x60], xmm1 - movdqa [parm1q+0x70], xmm7 + LOAD_DIFF_8P m0, m8, m9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE] + LOAD_DIFF_8P m1, m8, m9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE] + LOAD_DIFF_8P m2, m8, m9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE] + LOAD_DIFF_8P m3, m8, m9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE] + LOAD_DIFF_8P m4, m8, m9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE] + LOAD_DIFF_8P m5, m8, m9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE] + LOAD_DIFF_8P m6, m8, m9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE] + LOAD_DIFF_8P m7, m8, m9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE] + + DCT8_1D 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 
0,1,2,3,4,5,6,7,8 + DCT8_1D 0,1,2,3,4,5,6,7,8,9 + + movdqa [parm1q+0x00], m0 + movdqa [parm1q+0x10], m1 + movdqa [parm1q+0x20], m2 + movdqa [parm1q+0x30], m3 + movdqa [parm1q+0x40], m4 + movdqa [parm1q+0x50], m5 + movdqa [parm1q+0x60], m6 + movdqa [parm1q+0x70], m7 ret -; in: ABCDEFGH -; out: IBHDEACG %macro IDCT8_1D 10 - SUMSUB_BA %5, %1 ; %5=a0, %1=a2 - movdqa %10, %3 - psraw %3, 1 - psubw %3, %7 ; %3=a4 - psraw %7, 1 - paddw %7, %10 ; %7=a6 - - movdqa %9, %2 - psraw %9, 1 - paddw %9, %2 - paddw %9, %4 - paddw %9, %6 ; %9=a7 + SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2 + movdqa m%10, m%3 + psraw m%3, 1 + psubw m%3, m%7 ; %3=a4 + psraw m%7, 1 + paddw m%7, m%10 ; %7=a6 + + movdqa m%9, m%2 + psraw m%9, 1 + paddw m%9, m%2 + paddw m%9, m%4 + paddw m%9, m%6 ; %9=a7 - movdqa %10, %6 - psraw %10, 1 - paddw %10, %6 - paddw %10, %8 - psubw %10, %2 ; %10=a5 - - psubw %2, %4 - psubw %6, %4 - paddw %2, %8 - psubw %6, %8 - psraw %4, 1 - psraw %8, 1 - psubw %2, %4 ; %2=a3 - psubw %6, %8 ; %6=a1 - - SUMSUB_BA %7, %5 ; %7=b0, %5=b6 - SUMSUB_BA %3, %1 ; %3=b2, %1=b4 - - movdqa %4, %9 - psraw %4, 2 - paddw %4, %6 ; %4=b1 - psraw %6, 2 - psubw %9, %6 ; %9=b7 - - movdqa %8, %10 - psraw %8, 2 - paddw %8, %2 ; %8=b3 - psraw %2, 2 - psubw %2, %10 ; %2=b5 - - SUMSUB_BA %9, %7 ; %9=c0, %7=c7 - SUMSUB_BA %2, %3 ; %2=c1, %3=c6 - SUMSUB_BA %8, %1 ; %8=c2, %1=c5 - SUMSUB_BA %4, %5 ; %4=c3, %5=c4 + movdqa m%10, m%6 + psraw m%10, 1 + paddw m%10, m%6 + paddw m%10, m%8 + psubw m%10, m%2 ; %10=a5 + + psubw m%2, m%4 + psubw m%6, m%4 + paddw m%2, m%8 + psubw m%6, m%8 + psraw m%4, 1 + psraw m%8, 1 + psubw m%2, m%4 ; %2=a3 + psubw m%6, m%8 ; %6=a1 + + SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6 + SUMSUB_BA m%3, m%1 ; %3=b2, %1=b4 + + movdqa m%4, m%9 + psraw m%4, 2 + paddw m%4, m%6 ; %4=b1 + psraw m%6, 2 + psubw m%9, m%6 ; %9=b7 + + movdqa m%8, m%10 + psraw m%8, 2 + paddw m%8, m%2 ; %8=b3 + psraw m%2, 2 + psubw m%2, m%10 ; %2=b5 + + SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7 + SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6 + SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5 + SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4 + + SWAP %1, %9, %6 + SWAP %3, %8, %7 %endmacro ;----------------------------------------------------------------------------- ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- cglobal x264_add8x8_idct8_sse2 - movdqa xmm0, [parm2q+0x00] - movdqa xmm1, [parm2q+0x10] - movdqa xmm2, [parm2q+0x20] - movdqa xmm3, [parm2q+0x30] - movdqa xmm4, [parm2q+0x40] - movdqa xmm5, [parm2q+0x50] - movdqa xmm6, [parm2q+0x60] - movdqa xmm7, [parm2q+0x70] - - IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8 - TRANSPOSE8x8W xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5 - paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end - IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2 + movdqa m0, [parm2q+0x00] + movdqa m1, [parm2q+0x10] + movdqa m2, [parm2q+0x20] + movdqa m3, [parm2q+0x30] + movdqa m4, [parm2q+0x40] + movdqa m5, [parm2q+0x50] + movdqa m6, [parm2q+0x60] + movdqa m7, [parm2q+0x70] + + IDCT8_1D 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end + IDCT8_1D 0,1,2,3,4,5,6,7,8,9 - pxor xmm15, xmm15 - STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE] - STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE] - STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE] - STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE] - STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE] - 
STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE] - STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE] - STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE] + pxor m9, m9 + STORE_DIFF_8P m0, m8, m9, [parm1q+0*FDEC_STRIDE] + STORE_DIFF_8P m1, m8, m9, [parm1q+1*FDEC_STRIDE] + STORE_DIFF_8P m2, m8, m9, [parm1q+2*FDEC_STRIDE] + STORE_DIFF_8P m3, m8, m9, [parm1q+3*FDEC_STRIDE] + STORE_DIFF_8P m4, m8, m9, [parm1q+4*FDEC_STRIDE] + STORE_DIFF_8P m5, m8, m9, [parm1q+5*FDEC_STRIDE] + STORE_DIFF_8P m6, m8, m9, [parm1q+6*FDEC_STRIDE] + STORE_DIFF_8P m7, m8, m9, [parm1q+7*FDEC_STRIDE] ret diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 0b21f6b2..2fa1b756 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -71,20 +71,19 @@ SECTION .text psubw %4, %3 %endmacro -%macro SBUTTERFLY 5 - mov%1 %5, %3 - punpckl%2 %3, %4 - punpckh%2 %5, %4 +%macro SBUTTERFLY 4 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 + SWAP %3, %4 %endmacro -;----------------------------------------------------------------------------- -; input ABCD output ADTC -;----------------------------------------------------------------------------- %macro TRANSPOSE4x4W 5 - SBUTTERFLY q, wd, %1, %2, %5 - SBUTTERFLY q, wd, %3, %4, %2 - SBUTTERFLY q, dq, %1, %3, %4 - SBUTTERFLY q, dq, %5, %2, %3 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 %endmacro %macro STORE_DIFF_4P 5 @@ -97,91 +96,85 @@ SECTION .text movd %5, %1 %endmacro +%macro HADAMARD4_1D 4 + SUMSUB_BADC m%2, m%1, m%4, m%3 + SUMSUB_BADC m%4, m%2, m%3, m%1 + SWAP %1, %4, %3 +%endmacro + ;----------------------------------------------------------------------------- ; void x264_dct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- cglobal x264_dct4x4dc_mmx, 1,1,1 - movq mm0, [r0+ 0] - movq mm1, [r0+ 8] - movq mm2, [r0+16] - movq mm3, [r0+24] - - SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 - - SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - - movq mm6, [pw_1 GLOBAL] - paddw mm0, mm6 - paddw mm2, mm6 - psraw mm0, 1 - movq [r0+ 0], mm0 - psraw mm2, 1 - movq [r0+ 8], mm2 - paddw mm3, mm6 - paddw mm4, mm6 - psraw mm3, 1 - movq [r0+16], mm3 - psraw mm4, 1 - movq [r0+24], mm4 + movq m0, [r0+ 0] + movq m1, [r0+ 8] + movq m2, [r0+16] + movq m3, [r0+24] + HADAMARD4_1D 0,1,2,3 + TRANSPOSE4x4W 0,1,2,3,4 + HADAMARD4_1D 0,1,2,3 + movq m6, [pw_1 GLOBAL] + paddw m0, m6 + paddw m1, m6 + paddw m2, m6 + paddw m3, m6 + psraw m0, 1 + psraw m1, 1 + psraw m2, 1 + psraw m3, 1 + movq [r0+0], m0 + movq [r0+8], m1 + movq [r0+16], m2 + movq [r0+24], m3 RET ;----------------------------------------------------------------------------- ; void x264_idct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- cglobal x264_idct4x4dc_mmx, 1,1 - movq mm0, [r0+ 0] - movq mm1, [r0+ 8] - movq mm2, [r0+16] - movq mm3, [r0+24] - - SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 - SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 - - TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 + movq m0, [r0+ 0] + movq m1, [r0+ 8] 
+ movq m2, [r0+16] + movq m3, [r0+24] + HADAMARD4_1D 0,1,2,3 + TRANSPOSE4x4W 0,1,2,3,4 + HADAMARD4_1D 0,1,2,3 + movq [r0+ 0], m0 + movq [r0+ 8], m1 + movq [r0+16], m2 + movq [r0+24], m3 + RET - SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 - SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 +%macro DCT4_1D 5 + SUMSUB_BADC m%4, m%1, m%3, m%2 + SUMSUB_BA m%3, m%4 + SUMSUB2_AB m%1, m%2, m%5 + SWAP %1, %3, %4, %5, %2 +%endmacro - movq [r0+ 0], mm0 - movq [r0+ 8], mm2 - movq [r0+16], mm3 - movq [r0+24], mm4 - RET +%macro IDCT4_1D 6 + SUMSUB_BA m%3, m%1 + SUMSUBD2_AB m%2, m%4, m%6, m%5 + SUMSUB_BADC m%2, m%3, m%5, m%1 + SWAP %1, %2, %5, %4, %3 +%endmacro ;----------------------------------------------------------------------------- ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal x264_sub4x4_dct_mmx, 3,3 .skip_prologue: - pxor mm7, mm7 - - ; Load 4 lines - LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF_4P mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] - - SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12 - - SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12 - SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12 - - ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 - TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1 - - SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12 - - SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12 - SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12 - - movq [r0+ 0], mm1 - movq [r0+ 8], mm2 - movq [r0+16], mm3 - movq [r0+24], mm0 + LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] + LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] + LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] + LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] + DCT4_1D 0,1,2,3,4 + TRANSPOSE4x4W 0,1,2,3,4 + DCT4_1D 0,1,2,3,4 + movq [r0+ 0], m0 + movq [r0+ 8], m1 + movq [r0+16], m2 + movq [r0+24], m3 RET ;----------------------------------------------------------------------------- @@ -189,33 +182,19 @@ cglobal x264_sub4x4_dct_mmx, 3,3 ;----------------------------------------------------------------------------- cglobal x264_add4x4_idct_mmx, 2,2,1 .skip_prologue: - ; Load dct coeffs - movq mm0, [r1+ 0] ; dct - movq mm1, [r1+ 8] - movq mm2, [r1+16] - movq mm3, [r1+24] - - SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 - SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 - - ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 - TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3 - - SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02 - SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) - - SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 - - pxor mm7, mm7 - movq mm6, [pw_32 GLOBAL] - - STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE] - STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE] - STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE] - STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE] - + movq m0, [r1+ 0] + movq m1, [r1+ 8] + movq m2, [r1+16] + movq m3, [r1+24] + IDCT4_1D 0,1,2,3,4,5 + TRANSPOSE4x4W 0,1,2,3,4 + IDCT4_1D 0,1,2,3,4,5 + pxor m7, 
m7 + movq m6, [pw_32 GLOBAL] + STORE_DIFF_4P m0, m4, m6, m7, [r0+0*FDEC_STRIDE] + STORE_DIFF_4P m1, m4, m6, m7, [r0+1*FDEC_STRIDE] + STORE_DIFF_4P m2, m4, m6, m7, [r0+2*FDEC_STRIDE] + STORE_DIFF_4P m3, m4, m6, m7, [r0+3*FDEC_STRIDE] RET diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 2b4946e9..f8ac8144 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -300,6 +300,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endif align function_align %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer %endmacro %macro cglobal 3 @@ -330,7 +331,10 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %assign FENC_STRIDE 16 %assign FDEC_STRIDE 32 +; merge mmx and sse* + %macro INIT_MMX 0 + %define RESET_MM_PERMUTATION INIT_MMX %define regsize 8 %define mova movq %define movu movq @@ -346,9 +350,16 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %define m7 mm7 %undef m8 %undef m9 + %undef m10 + %undef m11 + %undef m12 + %undef m13 + %undef m14 + %undef m15 %endmacro %macro INIT_XMM 0 + %define RESET_MM_PERMUTATION INIT_XMM %define regsize 16 %define mova movdqa %define movu movdqu @@ -364,5 +375,94 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %define m7 xmm7 %define m8 xmm8 %define m9 xmm9 + %define m10 xmm10 + %define m11 xmm11 + %define m12 xmm12 + %define m13 xmm13 + %define m14 xmm14 + %define m15 xmm15 +%endmacro + +INIT_MMX + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %undef tmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + %undef tmp + %rotate 1 +%endrep +%endmacro + +%macro SAVE_MM_PERMUTATION 1 + %xdefine %1_m0 m0 + %xdefine %1_m1 m1 + %xdefine %1_m2 m2 + %xdefine %1_m3 m3 + %xdefine %1_m4 m4 + %xdefine %1_m5 m5 + %xdefine %1_m6 m6 + %xdefine %1_m7 m7 + %xdefine %1_m8 m8 + %xdefine %1_m9 m9 + %xdefine %1_m10 m10 + %xdefine %1_m11 m11 + %xdefine %1_m12 m12 + %xdefine %1_m13 m13 + %xdefine %1_m14 m14 + %xdefine %1_m15 m15 +%endmacro + +%macro LOAD_MM_PERMUTATION 1 + %xdefine m0 %1_m0 + %xdefine m1 %1_m1 + %xdefine m2 %1_m2 + %xdefine m3 %1_m3 + %xdefine m4 %1_m4 + %xdefine m5 %1_m5 + %xdefine m6 %1_m6 + %xdefine m7 %1_m7 + %xdefine m8 %1_m8 + %xdefine m9 %1_m9 + %xdefine m10 %1_m10 + %xdefine m11 %1_m11 + %xdefine m12 %1_m12 + %xdefine m13 %1_m13 + %xdefine m14 %1_m14 + %xdefine m15 %1_m15 +%endmacro + +%macro call 1 + call %1 + %ifdef %1_m0 + LOAD_MM_PERMUTATION %1 + %endif %endmacro
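
To make the mechanism concrete, here is a minimal usage sketch. It is not part of the patch; the macro and label names (SUMSUB_SWAP, example_func) are invented for illustration, and it assumes the INIT_MMX, SWAP, SAVE_MM_PERMUTATION and call macros defined above. The butterfly computes a sum and a difference in place, which physically leaves them in the "wrong" registers; the trailing SWAP exchanges the register names rather than the contents, so the caller still finds the sum in m0 and the difference in m1 without any extra mov.

; illustrative sketch only -- not part of the patch
%macro SUMSUB_SWAP 2  ; in: m%1=a, m%2=b   out (after renaming): m%1=a+b, m%2=a-b
    paddw  m%2, m%1   ; the register currently named m%2 now holds a+b
    paddw  m%1, m%1   ; 2a
    psubw  m%1, m%2   ; 2a-(a+b) = a-b
    SWAP   %1, %2     ; swap the names, not the data: costs zero instructions
%endmacro

INIT_MMX
example_func:         ; hypothetical function, plain label for brevity
    movq   m0, [r0]
    movq   m1, [r0+8]
    SUMSUB_SWAP 0, 1  ; from here on, "m0" assembles to mm1 and "m1" to mm0
    movq   [r0],   m0 ; stores the sum
    movq   [r0+8], m1 ; stores the difference
    SAVE_MM_PERMUTATION example_func ; record the final name mapping under the label
    ret

Because SAVE_MM_PERMUTATION defines example_func_m0 and friends, the call wrapper's %ifdef test succeeds and LOAD_MM_PERMUTATION runs at each call site, so code following "call example_func" automatically refers to the callee's permuted register names.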