From: Loren Merritt Date: Fri, 4 Apr 2008 02:46:36 +0000 (-0600) Subject: more mmx/xmm macros (mova, movu, movh) X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=29899d84c3ca0e11f70a0aea8e6adf721e6bbfb2;p=libx264 more mmx/xmm macros (mova, movu, movh) --- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index fd87234d..0f8ed8ba 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -138,8 +138,8 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 - movq %5, %2 - movq %4, %1 + mova %5, %2 + mova %4, %1 psubusb %5, %1 psubusb %4, %2 por %4, %5 @@ -149,8 +149,8 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5 - movq %5, %2 - movq %4, %1 + mova %5, %2 + mova %4, %1 psubusb %5, %1 psubusb %4, %2 psubusb %5, %3 @@ -190,7 +190,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - movq m5, m1 + mova m5, m1 pxor m5, m2 ; p0^q0 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 pcmpeqb m4, m4 @@ -201,7 +201,7 @@ SECTION .text pavgb m4, m2 ; (q0 - p0 + 256)>>1 pavgb m3, m5 paddusb m3, m4 ; d+128+33 - movq m6, [pb_a1 GLOBAL] + mova m6, [pb_a1 GLOBAL] psubusb m6, m3 psubusb m3, [pb_a1 GLOBAL] pminub m6, m7 @@ -217,18 +217,18 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - movq %6, m1 + mova %6, m1 pavgb %6, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - movq %6, %1 + mova %6, %1 psubusb %6, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 - movq %4, %2 + mova %4, %2 %endmacro ;----------------------------------------------------------------------------- @@ -244,10 +244,10 @@ cglobal x264_deblock_v_luma_sse2 dec r3d ; beta-1 add r4, r0 ; pix-3*stride - movdqa m0, [r4+r1] ; p1 - movdqa m1, [r4+2*r1] ; p0 - movdqa m2, [r0] ; q0 - movdqa m3, [r0+r1] ; q1 + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 LOAD_MASK r2d, r3d punpcklbw m8, m8 @@ -260,7 +260,7 @@ cglobal x264_deblock_v_luma_sse2 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - movdqa m7, m8 + mova m7, m8 psubb m7, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -270,12 +270,12 @@ cglobal x264_deblock_v_luma_sse2 pand m6, m9 pand m8, m6 psubb m7, m6 - movdqa m3, [r0+r1] + mova m3, [r0+r1] LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 DEBLOCK_P0_Q0 - movdqa [r4+2*r1], m1 - movdqa [r0], m2 + mova [r4+2*r1], m1 + mova [r0], m2 ret ;----------------------------------------------------------------------------- @@ -338,10 +338,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5,1 dec r3 ; beta-1 add r4, r0 ; pix-3*stride - movq m0, [r4+r1] ; p1 - movq m1, [r4+2*r1] ; p0 - movq m2, [r0] ; q0 - movq m3, [r0+r1] ; q1 + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 LOAD_MASK r2, r3 mov r3, r4m @@ -356,34 +356,34 @@ cglobal x264_deblock_%2_luma_%1, 5,5,1 movd m4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - movq [esp+%3], m4 ; tc + mova [esp+%3], m4 ; tc pcmpeqb m3, m3 pcmpgtb m4, m3 pand m4, m7 - movq [esp], m4 ; mask + mova [esp], m4 ; mask - movq m3, [r4] ; p2 + mova m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - movq m7, m4 + mova m7, m4 psubb m7, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - movq m4, [r0+2*r1] ; q2 + mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - movq m5, [esp] ; mask + mova m5, [esp] ; mask pand m6, m5 - movq m5, [esp+%3] ; tc + mova m5, [esp+%3] ; tc pand m5, m6 psubb m7, m6 - movq m3, [r0+r1] + mova m3, [r0+r1] LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 DEBLOCK_P0_Q0 - movq [r4+2*r1], m1 - movq [r0], m2 + mova [r4+2*r1], m1 + mova [r0], m2 %if %3 == 16 mov esp, r2 diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index b8ed1945..90aebf7f 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -75,7 +75,7 @@ SECTION .text ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) - movq m0, %1 ; load dct coeffs + mova m0, %1 ; load dct coeffs pxor m1, m1 pcmpgtw m1, m0 ; sign(coeff) pxor m0, m1 @@ -84,16 +84,16 @@ SECTION .text pmulhuw m0, %2 ; divide pxor m0, m1 ; restore sign psubw m0, m1 - movq %1, m0 ; store + mova %1, m0 ; store %endmacro %macro QUANT_SSSE3 3 - movq m1, %1 ; load dct coeffs + mova m1, %1 ; load dct coeffs pabsw m0, m1 paddusw m0, %3 ; round pmulhuw m0, %2 ; divide psignw m0, m1 ; restore sign - movq %1, m0 ; store + mova %1, m0 ; store %endmacro INIT_MMX @@ -162,11 +162,11 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16 ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m5 i_qbits - movq m0, %2 + mova m0, %2 packssdw m0, %3 pmullw m0, %1 psllw m0, m5 - movq %1, m0 + mova %1, m0 %endmacro %macro DEQUANT32_R 3 @@ -176,8 +176,8 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16 ;;; m6 f ;;; m7 0 - movq m0, %1 - movq m1, m0 + mova m0, %1 + mova m1, m0 punpcklwd m0, m7 punpckhwd m1, m7 pmaddwd m0, %2 @@ -187,7 +187,7 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16 psrad m0, m5 psrad m1, m5 packssdw m0, m1 - movq %1, m0 + mova %1, m0 %endmacro %macro DEQUANT_LOOP 3 @@ -207,17 +207,17 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16 %endmacro %macro DEQUANT16_FLAT 2-8 - movq m0, %1 + mova m0, %1 %assign i %0-2 %rep %0-1 %if i - movq m %+ i, [r0+%2] + mova m %+ i, [r0+%2] pmullw m %+ i, m0 %else pmullw m0, [r0+%2] %endif psllw m %+ i, m7 - movq [r0+%2], m %+ i + mova [r0+%2], m %+ i %assign i i-1 %rotate 1 %endrep @@ -268,7 +268,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3 neg t0d movd m5, t0d picgetgot t0d - movq m6, [pd_1 GLOBAL] + mova m6, [pd_1 GLOBAL] pxor m7, m7 pslld m6, m5 psrld m6, 1 diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index d0432f45..8b124b0b 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -331,7 +331,9 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %assign FDEC_STRIDE 32 %macro INIT_MMX 0 - %undef movq + %define mova movq + %define movu movq + %define movh movd %define m0 mm0 %define m1 mm1 %define m2 mm2 @@ -345,7 +347,9 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits %endmacro %macro INIT_XMM 0 - %define movq movdqa + %define mova movdqa + %define movu movdqu + %define movh movq %define m0 xmm0 %define m1 xmm1 %define m2 xmm2