psubw %4, %3
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
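+; SBUTTERFLY type, a, b, t: interleave m"a" and m"b" at the given granularity
+; (wd, dq or qdq). Afterwards m"a" holds the low interleave, m"b" the high one,
+; and m"t" is scratch; the trailing SWAP only renames registers, so it emits no
+; instructions.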
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
%endmacro
-; input ABCD output ADTC
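+; in: m%1..m%4 = rows; out: m%1..m%4 = columns, in order; m%5 = scratch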
%macro TRANSPOSE4x4W 5
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
%endmacro
; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
;-----------------------------------------------------------------------------
ALIGN 16
x264_transpose_8x8_mmx:
- movq mm0, [r0 ]
- movq mm1, [r0+ 16]
- movq mm2, [r0+ 32]
- movq mm3, [r0+ 48]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [r0 ], mm0
- movq [r0+ 16], mm3
- movq [r0+ 32], mm4
- movq [r0+ 48], mm2
-
- movq mm0, [r0+ 72]
- movq mm1, [r0+ 88]
- movq mm2, [r0+104]
- movq mm3, [r0+120]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [r0+ 72], mm0
- movq [r0+ 88], mm3
- movq [r0+104], mm4
- movq [r0+120], mm2
-
- movq mm0, [r0+ 8]
- movq mm1, [r0+ 24]
- movq mm2, [r0+ 40]
- movq mm3, [r0+ 56]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq mm1, [r0+ 64]
- movq mm5, [r0+ 80]
- movq mm6, [r0+ 96]
- movq mm7, [r0+112]
-
- movq [r0+ 64], mm0
- movq [r0+ 80], mm3
- movq [r0+ 96], mm4
- movq [r0+112], mm2
- TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4
- movq [r0+ 8], mm1
- movq [r0+ 24], mm7
- movq [r0+ 40], mm4
- movq [r0+ 56], mm6
+ movq m0, [r0 ]
+ movq m1, [r0+ 16]
+ movq m2, [r0+ 32]
+ movq m3, [r0+ 48]
+ TRANSPOSE4x4W 0,1,2,3,4
+ movq [r0 ], m0
+ movq [r0+ 16], m1
+ movq [r0+ 32], m2
+ movq [r0+ 48], m3
+
+ movq m0, [r0+ 72]
+ movq m1, [r0+ 88]
+ movq m2, [r0+104]
+ movq m3, [r0+120]
+ TRANSPOSE4x4W 0,1,2,3,4
+ movq [r0+ 72], m0
+ movq [r0+ 88], m1
+ movq [r0+104], m2
+ movq [r0+120], m3
+
+ movq m0, [r0+ 8]
+ movq m1, [r0+ 24]
+ movq m2, [r0+ 40]
+ movq m3, [r0+ 56]
+ TRANSPOSE4x4W 0,1,2,3,4
+ movq m4, [r0+ 64]
+ movq m5, [r0+ 80]
+ movq m6, [r0+ 96]
+ movq m7, [r0+112]
+
+ movq [r0+ 64], m0
+ movq [r0+ 80], m1
+ movq [r0+ 96], m2
+ movq [r0+112], m3
+ TRANSPOSE4x4W 4,5,6,7,0
+ movq [r0+ 8], m4
+ movq [r0+ 24], m5
+ movq [r0+ 40], m6
+ movq [r0+ 56], m7
ret
;-----------------------------------------------------------------------------
mov r0, r0m
jmp x264_pixel_add_8x8_mmx
+INIT_XMM
+
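+; in: rows 1-3 and 5-7 in m%2..m%4 and m%6..m%8; rows 0 and 4 are read from
+; [r1+0x00] and [r1+0x40] inside the macro; out: idct rows 0-7 in m%1..m%8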
%macro IDCT8_1D 8
- movdqa %1, %3
- movdqa %5, %7
- psraw %3, 1
- psraw %7, 1
- psubw %3, %5
- paddw %7, %1
- movdqa %5, %2
- psraw %5, 1
- paddw %5, %2
- paddw %5, %4
- paddw %5, %6
- movdqa %1, %6
- psraw %1, 1
- paddw %1, %6
- paddw %1, %8
- psubw %1, %2
- psubw %2, %4
- psubw %6, %4
- paddw %2, %8
- psubw %6, %8
- psraw %4, 1
- psraw %8, 1
- psubw %2, %4
- psubw %6, %8
- movdqa %4, %5
- movdqa %8, %1
- psraw %4, 2
- psraw %8, 2
- paddw %4, %6
- paddw %8, %2
- psraw %6, 2
- psraw %2, 2
- psubw %5, %6
- psubw %2, %1
- movdqa %1, [eax+0x00]
- movdqa %6, [eax+0x40]
- SUMSUB_BA %6, %1
- SUMSUB_BA %7, %6
- SUMSUB_BA %3, %1
- SUMSUB_BA %5, %7
- SUMSUB_BA %2, %3
- SUMSUB_BA %8, %1
- SUMSUB_BA %4, %6
+ movdqa m%1, m%3
+ movdqa m%5, m%7
+ psraw m%3, 1
+ psraw m%7, 1
+ psubw m%3, m%5
+ paddw m%7, m%1
+ movdqa m%5, m%2
+ psraw m%5, 1
+ paddw m%5, m%2
+ paddw m%5, m%4
+ paddw m%5, m%6
+ movdqa m%1, m%6
+ psraw m%1, 1
+ paddw m%1, m%6
+ paddw m%1, m%8
+ psubw m%1, m%2
+ psubw m%2, m%4
+ psubw m%6, m%4
+ paddw m%2, m%8
+ psubw m%6, m%8
+ psraw m%4, 1
+ psraw m%8, 1
+ psubw m%2, m%4
+ psubw m%6, m%8
+ movdqa m%4, m%5
+ movdqa m%8, m%1
+ psraw m%4, 2
+ psraw m%8, 2
+ paddw m%4, m%6
+ paddw m%8, m%2
+ psraw m%6, 2
+ psraw m%2, 2
+ psubw m%5, m%6
+ psubw m%2, m%1
+ movdqa m%1, [r1+0x00]
+ movdqa m%6, [r1+0x40]
+ SUMSUB_BA m%6, m%1
+ SUMSUB_BA m%7, m%6
+ SUMSUB_BA m%3, m%1
+ SUMSUB_BA m%5, m%7
+ SUMSUB_BA m%2, m%3
+ SUMSUB_BA m%8, m%1
+ SUMSUB_BA m%4, m%6
+ SWAP %1, %5, %6
+ SWAP %3, %8, %7
%endmacro
-%macro TRANSPOSE8 9
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
+; in: m%1..m%8 (m0..m7 at the call site below)
+; out: all rows in registers, except m%5 (m4 here) which is left in [%9+0x40]
+%macro TRANSPOSE8x8W 9
+ movdqa [%9], m%8
+ SBUTTERFLY wd, %1, %2, %8
+ movdqa [%9+16], m%2
+ movdqa m%8, [%9]
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa [%9], m%3
+ movdqa m%2, [%9+16]
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa [%9+0x40], m%2
+ movdqa m%3, [%9]
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
%endmacro
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
- mov ecx, [esp+4]
- mov eax, [esp+8]
- movdqa xmm1, [eax+0x10]
- movdqa xmm2, [eax+0x20]
- movdqa xmm3, [eax+0x30]
- movdqa xmm5, [eax+0x50]
- movdqa xmm6, [eax+0x60]
- movdqa xmm7, [eax+0x70]
- IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax
+cglobal x264_add8x8_idct8_sse2, 2,2
+ movdqa m1, [r1+0x10]
+ movdqa m2, [r1+0x20]
+ movdqa m3, [r1+0x30]
+ movdqa m5, [r1+0x50]
+ movdqa m6, [r1+0x60]
+ movdqa m7, [r1+0x70]
+ IDCT8_1D 0,1,2,3,4,5,6,7
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r1
picgetgot edx
- paddw xmm4, [pw_32 GLOBAL]
- movdqa [eax+0x00], xmm4
- movdqa [eax+0x40], xmm2
- IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1
- movdqa [eax+0x60], xmm6
- movdqa [eax+0x70], xmm7
- pxor xmm7, xmm7
- STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7
- STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7
- STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7
- STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7
- STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7
- STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7
- movdqa xmm0, [eax+0x60]
- movdqa xmm1, [eax+0x70]
- STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
- STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
+ paddw m0, [pw_32 GLOBAL]
+ movdqa [r1+0x00], m0
+; movdqa [r1+0x40], m4 ; still there from transpose
+ IDCT8_1D 0,1,2,3,4,5,6,7
+ movdqa [r1+0x60], m6
+ movdqa [r1+0x70], m7
+ pxor m7, m7
+ STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7
+ STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7
+ STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7
+ STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7
+ STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7
+ STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7
+ movdqa m0, [r1+0x60]
+ movdqa m1, [r1+0x70]
+ STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7
+ STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 4
-cglobal %1
- mov edx, [esp+12]
- mov ecx, [esp+ 8]
- mov eax, [esp+ 4]
- add edx, %4
- add ecx, %4
- add eax, %3
- push edx
- push ecx
- push eax
+cglobal %1, 3,3
+ add r2, %4
+ add r1, %4
+ add r0, %3
+ push r2
+ push r1
+ push r0
call %2
add dword [esp+0], %3
add dword [esp+4], %4*FENC_STRIDE-%4
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 4
-cglobal %1
- mov ecx, [esp+8]
- mov eax, [esp+4]
- add ecx, %3
- add eax, %4
- push ecx
- push eax
+cglobal %1, 2,2
+ add r1, %3
+ add r0, %4
+ push r1
+ push r0
call %2
add dword [esp+0], %4*FDEC_STRIDE-%4
add dword [esp+4], %3
SECTION .text
+INIT_XMM
+
%macro LOAD_DIFF_8P 5
movq %1, %4
punpcklbw %1, %3
psubw %2, %1
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
%endmacro
-;-----------------------------------------------------------------------------
-; input ABCDEFGH output AFHDTECB
-;-----------------------------------------------------------------------------
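+; in: m%1..m%8 = rows; out: m%1..m%8 = columns, in order; m%9 = scratch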
%macro TRANSPOSE8x8W 9
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
+ SBUTTERFLY wd, %1, %2, %9
+ SBUTTERFLY wd, %3, %4, %9
+ SBUTTERFLY wd, %5, %6, %9
+ SBUTTERFLY wd, %7, %8, %9
+ SBUTTERFLY dq, %1, %3, %9
+ SBUTTERFLY dq, %2, %4, %9
+ SBUTTERFLY dq, %5, %7, %9
+ SBUTTERFLY dq, %6, %8, %9
+ SBUTTERFLY qdq, %1, %5, %9
+ SBUTTERFLY qdq, %2, %6, %9
+ SBUTTERFLY qdq, %3, %7, %9
+ SBUTTERFLY qdq, %4, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
%endmacro
%macro STORE_DIFF_8P 4
SECTION .text
-; in: ABCDEFGH
-; out: FBCGEDHI
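+; in: m%1..m%8 = rows (or columns); out: dct coefficients in m%1..m%8, in order;
+; m%9 and m%10 are scratch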
%macro DCT8_1D 10
- SUMSUB_BA %8, %1 ; %8=s07, %1=d07
- SUMSUB_BA %7, %2 ; %7=s16, %2=d16
- SUMSUB_BA %6, %3 ; %6=s25, %3=d25
- SUMSUB_BA %5, %4 ; %5=s34, %4=d34
-
- SUMSUB_BA %5, %8 ; %5=a0, %8=a2
- SUMSUB_BA %6, %7 ; %6=a1, %7=a3
-
- movdqa %9, %1
- psraw %9, 1
- paddw %9, %1
- paddw %9, %2
- paddw %9, %3 ; %9=a4
-
- movdqa %10, %4
- psraw %10, 1
- paddw %10, %4
- paddw %10, %2
- psubw %10, %3 ; %10=a7
-
- SUMSUB_BA %4, %1
- psubw %1, %3
- psubw %4, %2
- psraw %3, 1
- psraw %2, 1
- psubw %1, %3 ; %1=a5
- psubw %4, %2 ; %4=a6
-
- SUMSUB_BA %6, %5 ; %6=b0, %5=b4
-
- movdqa %2, %10
- psraw %2, 2
- paddw %2, %9 ; %2=b1
- psraw %9, 2
- psubw %9, %10 ; %9=b7
-
- movdqa %3, %7
- psraw %3, 1
- paddw %3, %8 ; %3=b2
- psraw %8, 1
- psubw %8, %7 ; %8=b6
-
- movdqa %7, %4
- psraw %7, 2
- paddw %7, %1 ; %7=b3
- psraw %1, 2
- psubw %4, %1 ; %4=b5
+ SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
+ SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
+ SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
+ SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
+
+ SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2
+ SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3
+
+ movdqa m%9, m%1
+ psraw m%9, 1
+ paddw m%9, m%1
+ paddw m%9, m%2
+ paddw m%9, m%3 ; %9=a4
+
+ movdqa m%10, m%4
+ psraw m%10, 1
+ paddw m%10, m%4
+ paddw m%10, m%2
+ psubw m%10, m%3 ; %10=a7
+
+ SUMSUB_BA m%4, m%1
+ psubw m%1, m%3
+ psubw m%4, m%2
+ psraw m%3, 1
+ psraw m%2, 1
+ psubw m%1, m%3 ; %1=a5
+ psubw m%4, m%2 ; %4=a6
+
+ SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4
+
+ movdqa m%2, m%10
+ psraw m%2, 2
+ paddw m%2, m%9 ; %2=b1
+ psraw m%9, 2
+ psubw m%9, m%10 ; %9=b7
+
+ movdqa m%3, m%7
+ psraw m%3, 1
+ paddw m%3, m%8 ; %3=b2
+ psraw m%8, 1
+ psubw m%8, m%7 ; %8=b6
+
+ movdqa m%7, m%4
+ psraw m%7, 2
+ paddw m%7, m%1 ; %7=b3
+ psraw m%1, 2
+ psubw m%4, m%1 ; %4=b5
+
+ SWAP %1, %6, %4, %7, %8, %9
%endmacro
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
- pxor xmm9, xmm9
- LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
- LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
- LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
- LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
- LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
- LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
- LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
- LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
-
- DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
- TRANSPOSE8x8W xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
- DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
-
- movdqa [parm1q+0x00], xmm4
- movdqa [parm1q+0x10], xmm3
- movdqa [parm1q+0x20], xmm8
- movdqa [parm1q+0x30], xmm2
- movdqa [parm1q+0x40], xmm0
- movdqa [parm1q+0x50], xmm6
- movdqa [parm1q+0x60], xmm1
- movdqa [parm1q+0x70], xmm7
+ pxor m9, m9 ; zero register used by LOAD_DIFF_8P's punpcklbw
+ LOAD_DIFF_8P m0, m8, m9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+ LOAD_DIFF_8P m1, m8, m9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+ LOAD_DIFF_8P m2, m8, m9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+ LOAD_DIFF_8P m3, m8, m9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
+ LOAD_DIFF_8P m4, m8, m9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
+ LOAD_DIFF_8P m5, m8, m9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
+ LOAD_DIFF_8P m6, m8, m9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
+ LOAD_DIFF_8P m7, m8, m9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
+
+ DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+ DCT8_1D 0,1,2,3,4,5,6,7,8,9
+
+ movdqa [parm1q+0x00], m0
+ movdqa [parm1q+0x10], m1
+ movdqa [parm1q+0x20], m2
+ movdqa [parm1q+0x30], m3
+ movdqa [parm1q+0x40], m4
+ movdqa [parm1q+0x50], m5
+ movdqa [parm1q+0x60], m6
+ movdqa [parm1q+0x70], m7
ret
-; in: ABCDEFGH
-; out: IBHDEACG
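+; in: m%1..m%8 = dct rows; out: idct rows in m%1..m%8, in order; m%9 and m%10
+; are scratch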
%macro IDCT8_1D 10
- SUMSUB_BA %5, %1 ; %5=a0, %1=a2
- movdqa %10, %3
- psraw %3, 1
- psubw %3, %7 ; %3=a4
- psraw %7, 1
- paddw %7, %10 ; %7=a6
-
- movdqa %9, %2
- psraw %9, 1
- paddw %9, %2
- paddw %9, %4
- paddw %9, %6 ; %9=a7
+ SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2
+ movdqa m%10, m%3
+ psraw m%3, 1
+ psubw m%3, m%7 ; %3=a4
+ psraw m%7, 1
+ paddw m%7, m%10 ; %7=a6
+
+ movdqa m%9, m%2
+ psraw m%9, 1
+ paddw m%9, m%2
+ paddw m%9, m%4
+ paddw m%9, m%6 ; %9=a7
- movdqa %10, %6
- psraw %10, 1
- paddw %10, %6
- paddw %10, %8
- psubw %10, %2 ; %10=a5
-
- psubw %2, %4
- psubw %6, %4
- paddw %2, %8
- psubw %6, %8
- psraw %4, 1
- psraw %8, 1
- psubw %2, %4 ; %2=a3
- psubw %6, %8 ; %6=a1
-
- SUMSUB_BA %7, %5 ; %7=b0, %5=b6
- SUMSUB_BA %3, %1 ; %3=b2, %1=b4
-
- movdqa %4, %9
- psraw %4, 2
- paddw %4, %6 ; %4=b1
- psraw %6, 2
- psubw %9, %6 ; %9=b7
-
- movdqa %8, %10
- psraw %8, 2
- paddw %8, %2 ; %8=b3
- psraw %2, 2
- psubw %2, %10 ; %2=b5
-
- SUMSUB_BA %9, %7 ; %9=c0, %7=c7
- SUMSUB_BA %2, %3 ; %2=c1, %3=c6
- SUMSUB_BA %8, %1 ; %8=c2, %1=c5
- SUMSUB_BA %4, %5 ; %4=c3, %5=c4
+ movdqa m%10, m%6
+ psraw m%10, 1
+ paddw m%10, m%6
+ paddw m%10, m%8
+ psubw m%10, m%2 ; %10=a5
+
+ psubw m%2, m%4
+ psubw m%6, m%4
+ paddw m%2, m%8
+ psubw m%6, m%8
+ psraw m%4, 1
+ psraw m%8, 1
+ psubw m%2, m%4 ; %2=a3
+ psubw m%6, m%8 ; %6=a1
+
+ SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6
+ SUMSUB_BA m%3, m%1 ; %3=b2, %1=b4
+
+ movdqa m%4, m%9
+ psraw m%4, 2
+ paddw m%4, m%6 ; %4=b1
+ psraw m%6, 2
+ psubw m%9, m%6 ; %9=b7
+
+ movdqa m%8, m%10
+ psraw m%8, 2
+ paddw m%8, m%2 ; %8=b3
+ psraw m%2, 2
+ psubw m%2, m%10 ; %2=b5
+
+ SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7
+ SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6
+ SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5
+ SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4
+
+ SWAP %1, %9, %6
+ SWAP %3, %8, %7
%endmacro
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
- movdqa xmm0, [parm2q+0x00]
- movdqa xmm1, [parm2q+0x10]
- movdqa xmm2, [parm2q+0x20]
- movdqa xmm3, [parm2q+0x30]
- movdqa xmm4, [parm2q+0x40]
- movdqa xmm5, [parm2q+0x50]
- movdqa xmm6, [parm2q+0x60]
- movdqa xmm7, [parm2q+0x70]
-
- IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
- TRANSPOSE8x8W xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
- paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
- IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
+ movdqa m0, [parm2q+0x00]
+ movdqa m1, [parm2q+0x10]
+ movdqa m2, [parm2q+0x20]
+ movdqa m3, [parm2q+0x30]
+ movdqa m4, [parm2q+0x40]
+ movdqa m5, [parm2q+0x50]
+ movdqa m6, [parm2q+0x60]
+ movdqa m7, [parm2q+0x70]
+
+ IDCT8_1D 0,1,2,3,4,5,6,7,8,9
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+ paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
+ IDCT8_1D 0,1,2,3,4,5,6,7,8,9
- pxor xmm15, xmm15
- STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
- STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
- STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
- STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
- STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
- STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
- STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
- STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
+ pxor m9, m9
+ STORE_DIFF_8P m0, m8, m9, [parm1q+0*FDEC_STRIDE]
+ STORE_DIFF_8P m1, m8, m9, [parm1q+1*FDEC_STRIDE]
+ STORE_DIFF_8P m2, m8, m9, [parm1q+2*FDEC_STRIDE]
+ STORE_DIFF_8P m3, m8, m9, [parm1q+3*FDEC_STRIDE]
+ STORE_DIFF_8P m4, m8, m9, [parm1q+4*FDEC_STRIDE]
+ STORE_DIFF_8P m5, m8, m9, [parm1q+5*FDEC_STRIDE]
+ STORE_DIFF_8P m6, m8, m9, [parm1q+6*FDEC_STRIDE]
+ STORE_DIFF_8P m7, m8, m9, [parm1q+7*FDEC_STRIDE]
ret
psubw %4, %3
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
%endmacro
-;-----------------------------------------------------------------------------
-; input ABCD output ADTC
-;-----------------------------------------------------------------------------
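+; in: m%1..m%4 = rows; out: m%1..m%4 = columns, in order; m%5 = scratch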
%macro TRANSPOSE4x4W 5
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
%endmacro
%macro STORE_DIFF_4P 5
movd %5, %1
%endmacro
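+; 1-D 4-point Hadamard transform of m%1..m%4; results are left in m%1..m%4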
+%macro HADAMARD4_1D 4
+ SUMSUB_BADC m%2, m%1, m%4, m%3
+ SUMSUB_BADC m%4, m%2, m%3, m%1
+ SWAP %1, %4, %3
+%endmacro
+
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1,1
- movq mm0, [r0+ 0]
- movq mm1, [r0+ 8]
- movq mm2, [r0+16]
- movq mm3, [r0+24]
-
- SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
-
- SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
-
- movq mm6, [pw_1 GLOBAL]
- paddw mm0, mm6
- paddw mm2, mm6
- psraw mm0, 1
- movq [r0+ 0], mm0
- psraw mm2, 1
- movq [r0+ 8], mm2
- paddw mm3, mm6
- paddw mm4, mm6
- psraw mm3, 1
- movq [r0+16], mm3
- psraw mm4, 1
- movq [r0+24], mm4
+ movq m0, [r0+ 0]
+ movq m1, [r0+ 8]
+ movq m2, [r0+16]
+ movq m3, [r0+24]
+ HADAMARD4_1D 0,1,2,3
+ TRANSPOSE4x4W 0,1,2,3,4
+ HADAMARD4_1D 0,1,2,3
+ movq m6, [pw_1 GLOBAL]
+ paddw m0, m6
+ paddw m1, m6
+ paddw m2, m6
+ paddw m3, m6
+ psraw m0, 1
+ psraw m1, 1
+ psraw m2, 1
+ psraw m3, 1
+ movq [r0+0], m0
+ movq [r0+8], m1
+ movq [r0+16], m2
+ movq [r0+24], m3
RET
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
- movq mm0, [r0+ 0]
- movq mm1, [r0+ 8]
- movq mm2, [r0+16]
- movq mm3, [r0+24]
-
- SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
+ movq m0, [r0+ 0]
+ movq m1, [r0+ 8]
+ movq m2, [r0+16]
+ movq m3, [r0+24]
+ HADAMARD4_1D 0,1,2,3
+ TRANSPOSE4x4W 0,1,2,3,4
+ HADAMARD4_1D 0,1,2,3
+ movq [r0+ 0], m0
+ movq [r0+ 8], m1
+ movq [r0+16], m2
+ movq [r0+24], m3
+ RET
- SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
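+; 1-D 4-point dct of m%1..m%4; results are left in m%1..m%4 in order, m%5 is
+; scratch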
+%macro DCT4_1D 5
+ SUMSUB_BADC m%4, m%1, m%3, m%2
+ SUMSUB_BA m%3, m%4
+ SUMSUB2_AB m%1, m%2, m%5
+ SWAP %1, %3, %4, %5, %2
+%endmacro
- movq [r0+ 0], mm0
- movq [r0+ 8], mm2
- movq [r0+16], mm3
- movq [r0+24], mm4
- RET
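+; 1-D 4-point idct of m%1..m%4; results are left in m%1..m%4 in order, m%5 and
+; m%6 are scratch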
+%macro IDCT4_1D 6
+ SUMSUB_BA m%3, m%1
+ SUMSUBD2_AB m%2, m%4, m%6, m%5
+ SUMSUB_BADC m%2, m%3, m%5, m%1
+ SWAP %1, %2, %5, %4, %3
+%endmacro
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx, 3,3
.skip_prologue:
- pxor mm7, mm7
-
- ; Load 4 lines
- LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_4P mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-
- SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
-
- SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
- SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
-
- ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
- TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1
-
- SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
-
- SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
- SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
-
- movq [r0+ 0], mm1
- movq [r0+ 8], mm2
- movq [r0+16], mm3
- movq [r0+24], mm0
+ pxor m7, m7 ; zero register for LOAD_DIFF_4P (as in the original code)
+ LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ DCT4_1D 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
+ DCT4_1D 0,1,2,3,4
+ movq [r0+ 0], m0
+ movq [r0+ 8], m1
+ movq [r0+16], m2
+ movq [r0+24], m3
RET
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2,1
.skip_prologue:
- ; Load dct coeffs
- movq mm0, [r1+ 0] ; dct
- movq mm1, [r1+ 8]
- movq mm2, [r1+16]
- movq mm3, [r1+24]
-
- SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
- SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
-
- SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
-
- ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
- TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3
-
- SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
- SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
-
- SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
-
- pxor mm7, mm7
- movq mm6, [pw_32 GLOBAL]
-
- STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE]
- STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE]
- STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE]
- STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE]
-
+ movq m0, [r1+ 0]
+ movq m1, [r1+ 8]
+ movq m2, [r1+16]
+ movq m3, [r1+24]
+ IDCT4_1D 0,1,2,3,4,5
+ TRANSPOSE4x4W 0,1,2,3,4
+ IDCT4_1D 0,1,2,3,4,5
+ pxor m7, m7
+ movq m6, [pw_32 GLOBAL]
+ STORE_DIFF_4P m0, m4, m6, m7, [r0+0*FDEC_STRIDE]
+ STORE_DIFF_4P m1, m4, m6, m7, [r0+1*FDEC_STRIDE]
+ STORE_DIFF_4P m2, m4, m6, m7, [r0+2*FDEC_STRIDE]
+ STORE_DIFF_4P m3, m4, m6, m7, [r0+3*FDEC_STRIDE]
RET
%endif
align function_align
%1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
%endmacro
%macro cglobal 3
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
+; merge mmx and sse*
+
%macro INIT_MMX 0
+ %define RESET_MM_PERMUTATION INIT_MMX
%define regsize 8
%define mova movq
%define movu movq
%define m7 mm7
%undef m8
%undef m9
+ %undef m10
+ %undef m11
+ %undef m12
+ %undef m13
+ %undef m14
+ %undef m15
%endmacro
%macro INIT_XMM 0
+ %define RESET_MM_PERMUTATION INIT_XMM
%define regsize 16
%define mova movdqa
%define movu movdqu
%define m7 xmm7
%define m8 xmm8
%define m9 xmm9
+ %define m10 xmm10
+ %define m11 xmm11
+ %define m12 xmm12
+ %define m13 xmm13
+ %define m14 xmm14
+ %define m15 xmm15
+%endmacro
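+
+; Illustrative example (not part of this patch, and assuming m0..m6 are defined
+; analogously to m7 in the elided part of these macros): one macro body can now
+; serve both instruction sets, e.g.
+;     mova  m0, [r0]
+; assembles as "movq mm0, [r0]" after INIT_MMX and as "movdqa xmm0, [r0]" after
+; INIT_XMM.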
+
+INIT_MMX
+
+; I often want to use macros that permute their arguments, e.g. there's no
+; efficient way to implement a butterfly, a transpose or a dct without swapping
+; some arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
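+; For example (illustrative only):
+;     mova  m0, [r0]
+;     mova  m1, [r1]
+;     SWAP  0, 1
+; emits no instruction for the SWAP; from then on "m0" names the register that
+; was loaded from [r1] and "m1" the one loaded from [r0], so the values are
+; exchanged for free as far as the following code is concerned.
+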
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %undef tmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
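+
+; Usage note: a cycle like "SWAP 0,1,2" is equivalent to "PERMUTE 0,1, 1,2, 2,0";
+; afterwards m0 names the old m1, m1 the old m2, and m2 the old m0.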
+
+%macro SAVE_MM_PERMUTATION 1
+ %xdefine %1_m0 m0
+ %xdefine %1_m1 m1
+ %xdefine %1_m2 m2
+ %xdefine %1_m3 m3
+ %xdefine %1_m4 m4
+ %xdefine %1_m5 m5
+ %xdefine %1_m6 m6
+ %xdefine %1_m7 m7
+ %xdefine %1_m8 m8
+ %xdefine %1_m9 m9
+ %xdefine %1_m10 m10
+ %xdefine %1_m11 m11
+ %xdefine %1_m12 m12
+ %xdefine %1_m13 m13
+ %xdefine %1_m14 m14
+ %xdefine %1_m15 m15
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1
+ %xdefine m0 %1_m0
+ %xdefine m1 %1_m1
+ %xdefine m2 %1_m2
+ %xdefine m3 %1_m3
+ %xdefine m4 %1_m4
+ %xdefine m5 %1_m5
+ %xdefine m6 %1_m6
+ %xdefine m7 %1_m7
+ %xdefine m8 %1_m8
+ %xdefine m9 %1_m9
+ %xdefine m10 %1_m10
+ %xdefine m11 %1_m11
+ %xdefine m12 %1_m12
+ %xdefine m13 %1_m13
+ %xdefine m14 %1_m14
+ %xdefine m15 %1_m15
+%endmacro
+
+%macro call 1
+ call %1
+ %ifdef %1_m0
+ LOAD_MM_PERMUTATION %1
+ %endif
%endmacro
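+
+; Sketch of the intended protocol (inferred from the macros above; the label
+; "foo" is hypothetical): a subroutine that returns with its registers permuted
+; ends with
+;     SAVE_MM_PERMUTATION foo
+;     ret
+; and any "call foo" going through the wrapper above then picks that permutation
+; up via LOAD_MM_PERMUTATION, so the caller keeps using the correct register
+; names afterwards.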