From: Loren Merritt Date: Sat, 1 Mar 2008 13:47:05 +0000 (+0000) Subject: some simplifications to mmx intra pred that should have been done way back when we... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c48882dd3f9a51b64c2d129f604bedb79d140626;p=libx264 some simplifications to mmx intra pred that should have been done way back when we switched to constant fdec_stride. and remove pic spills in functions that have a free caller-saved reg. patch partly by Fiona Glaser. git-svn-id: svn://svn.videolan.org/x264/trunk@741 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm index 91e6b8bc..d2cce53a 100644 --- a/common/amd64/predict-a.asm +++ b/common/amd64/predict-a.asm @@ -41,19 +41,18 @@ BITS 64 %macro STORE16x16 2 mov eax, 4 -ALIGN 4 .loop: + movq [parm1q + 0*FDEC_STRIDE], %1 movq [parm1q + 1*FDEC_STRIDE], %1 movq [parm1q + 2*FDEC_STRIDE], %1 movq [parm1q + 3*FDEC_STRIDE], %1 - movq [parm1q + 4*FDEC_STRIDE], %1 + movq [parm1q + 0*FDEC_STRIDE + 8], %2 movq [parm1q + 1*FDEC_STRIDE + 8], %2 movq [parm1q + 2*FDEC_STRIDE + 8], %2 movq [parm1q + 3*FDEC_STRIDE + 8], %2 - movq [parm1q + 4*FDEC_STRIDE + 8], %2 + add parm1q, 4*FDEC_STRIDE dec eax - lea parm1q, [parm1q + 4*FDEC_STRIDE] - jnz .loop + jg .loop nop %endmacro @@ -466,9 +465,8 @@ ALIGN 4 ; void predict_16x16_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- cglobal predict_16x16_v_mmx - sub parm1q, FDEC_STRIDE - movq mm0, [parm1q] - movq mm1, [parm1q + 8] + movq mm0, [parm1q - FDEC_STRIDE] + movq mm1, [parm1q - FDEC_STRIDE + 8] STORE16x16 mm0, mm1 ret @@ -477,18 +475,15 @@ cglobal predict_16x16_v_mmx ;----------------------------------------------------------------------------- %macro PRED16x16_DC 2 - sub parm1q, FDEC_STRIDE - pxor mm0, mm0 pxor mm1, mm1 - psadbw mm0, [parm1q] - psadbw mm1, [parm1q + 8] + psadbw mm0, [parm1q - FDEC_STRIDE] + psadbw mm1, [parm1q - FDEC_STRIDE + 8] paddusw mm0, mm1 paddusw mm0, %1 psrlw mm0, %2 ; dc pshufw mm0, mm0, 0 packuswb mm0, mm0 ; dc in bytes - STORE16x16 mm0, mm0 %endmacro diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm index 058b477c..53532756 100644 --- a/common/i386/dct-a.asm +++ b/common/i386/dct-a.asm @@ -141,8 +141,7 @@ cglobal x264_dct4x4dc_mmx movq mm2, [eax+16] movq mm3, [eax+24] - picpush ebx - picgetgot ebx + picgetgot edx MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 @@ -152,7 +151,7 @@ cglobal x264_dct4x4dc_mmx MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - movq mm6, [pw_1 GOT_ebx] + movq mm6, [pw_1 GOT_edx] paddw mm0, mm6 paddw mm2, mm6 psraw mm0, 1 @@ -165,7 +164,6 @@ cglobal x264_dct4x4dc_mmx movq [eax+16], mm3 psraw mm4, 1 movq [eax+24], mm4 - picpop ebx ret ;----------------------------------------------------------------------------- @@ -241,8 +239,7 @@ cglobal x264_add4x4_idct_mmx mov eax, [esp+ 4] ; p_dst - picpush ebx - picgetgot ebx + picgetgot edx MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02 MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) @@ -258,14 +255,13 @@ cglobal x264_add4x4_idct_mmx MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 MMX_ZERO mm7 - movq mm6, [pw_32 GOT_ebx] + movq mm6, [pw_32 GOT_edx] MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE] 
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE] MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+2*FDEC_STRIDE] MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE] - picpop ebx ret diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm index 991b0966..2622b643 100644 --- a/common/i386/mc-a.asm +++ b/common/i386/mc-a.asm @@ -253,21 +253,19 @@ AVG2_END %macro BIWEIGHT_START_MMX 0 push edi push esi - picpush ebx - picgetgot ebx - mov edi, [picesp+12] ; dst - mov esi, [picesp+16] ; i_dst - mov edx, [picesp+20] ; src - mov ecx, [picesp+24] ; i_src - - pshufw mm4, [picesp+28], 0 ; weight_dst - movq mm5, [pw_64 GOT_ebx] - psubw mm5, mm4 ; weight_src - movq mm6, [pw_32 GOT_ebx] ; rounding + picgetgot ecx + movq mm5, [pw_64 GOT_ecx] + movq mm6, [pw_32 GOT_ecx] ; rounding + mov edi, [esp+12] ; dst + mov esi, [esp+16] ; i_dst + mov edx, [esp+20] ; src + mov ecx, [esp+24] ; i_src + pshufw mm4, [esp+28], 0 ; weight_dst pxor mm7, mm7 + psubw mm5, mm4 ; weight_src %endmacro + %macro BIWEIGHT_END_MMX 0 - picpop ebx pop esi pop edi ret @@ -278,7 +276,7 @@ AVG2_END ;----------------------------------------------------------------------------- cglobal x264_pixel_avg_weight_w16_mmxext BIWEIGHT_START_MMX - mov eax, [picesp+32] ; i_height + mov eax, [esp+32] ; i_height ALIGN 4 .height_loop @@ -298,7 +296,7 @@ cglobal x264_pixel_avg_weight_w16_mmxext ;----------------------------------------------------------------------------- cglobal x264_pixel_avg_weight_w8_mmxext BIWEIGHT_START_MMX - mov eax, [picesp+32] + mov eax, [esp+32] ALIGN 4 .height_loop diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm index 01156cc8..baafd68e 100644 --- a/common/i386/predict-a.asm +++ b/common/i386/predict-a.asm @@ -39,17 +39,23 @@ BITS 32 movq [edx + 7*FDEC_STRIDE], %2 %endmacro -%macro SAVE_0_1 1 - movq [%1] , mm0 - movq [%1 + 8] , mm1 -%endmacro - -%macro SAVE_0_0 1 - movq [%1] , mm0 - movq [%1 + 8] , mm0 +%macro STORE16x16 2 + mov eax, 4 +.loop: + movq [edx + 0*FDEC_STRIDE], %1 + movq [edx + 1*FDEC_STRIDE], %1 + movq [edx + 2*FDEC_STRIDE], %1 + movq [edx + 3*FDEC_STRIDE], %1 + movq [edx + 0*FDEC_STRIDE + 8], %2 + movq [edx + 1*FDEC_STRIDE + 8], %2 + movq [edx + 2*FDEC_STRIDE + 8], %2 + movq [edx + 3*FDEC_STRIDE + 8], %2 + add edx, 4*FDEC_STRIDE + dec eax + jg .loop + nop %endmacro - SECTION_RODATA ALIGN 8 @@ -76,7 +82,7 @@ SECTION .text pavgb %2, %3 pxor %3, %5 movq %1, %4 - pand %3, [pb_1 GOT_ebx] + pand %3, [pb_1 GOT_ecx] psubusb %2, %3 pavgb %1, %2 %endmacro @@ -96,21 +102,19 @@ cglobal predict_8x8_v_mmxext ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_dc_mmxext - picpush ebx - picgetgot ebx - mov eax, [picesp + 8] - mov edx, [picesp + 4] + picgetgot ecx + mov eax, [esp + 8] + mov edx, [esp + 4] pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [eax+7] psadbw mm1, [eax+16] - paddw mm0, [pw_8 GOT_ebx] + paddw mm0, [pw_8 GOT_ecx] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 - picpop ebx ret ;----------------------------------------------------------------------------- @@ -118,18 +122,16 @@ cglobal predict_8x8_dc_mmxext ;----------------------------------------------------------------------------- %macro PRED8x8_DC 2 cglobal %1 - picpush ebx - picgetgot ebx - mov eax, [picesp + 8] - mov edx, [picesp + 4] + picgetgot ecx + mov eax, [esp + 8] + mov edx, [esp + 4] pxor mm0, mm0 psadbw mm0, [eax+%2] - paddw mm0, [pw_4 GOT_ebx] + paddw mm0, [pw_4 GOT_ecx] psrlw mm0, 
3 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 - picpop ebx ret %endmacro @@ -140,10 +142,9 @@ PRED8x8_DC predict_8x8_dc_left_mmxext, 7 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddl_mmxext - picpush ebx - picgetgot ebx - mov eax, [picesp + 8] - mov edx, [picesp + 4] + picgetgot ecx + mov eax, [esp + 8] + mov edx, [esp + 4] movq mm1, [eax + 15] movq mm2, [eax + 17] movq mm3, [eax + 23] @@ -168,17 +169,15 @@ cglobal predict_8x8_ddl_mmxext %assign Y (Y-1) movq [edx + Y*FDEC_STRIDE], mm1 - picpop ebx ret ;----------------------------------------------------------------------------- ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddr_mmxext - picpush ebx - picgetgot ebx - mov eax, [picesp + 8] - mov edx, [picesp + 4] + picgetgot ecx + mov eax, [esp + 8] + mov edx, [esp + 4] movq mm1, [eax + 7] movq mm2, [eax + 9] movq mm3, [eax + 15] @@ -203,7 +202,6 @@ cglobal predict_8x8_ddr_mmxext %assign Y (Y-1) movq [edx + Y*FDEC_STRIDE], mm0 - picpop ebx ret ;----------------------------------------------------------------------------- @@ -222,10 +220,9 @@ cglobal predict_8x8_ddr_mmxext ; 7 ,,,,, cglobal predict_8x8_vr_core_mmxext - picpush ebx - picgetgot ebx - mov eax, [picesp + 8] - mov edx, [picesp + 4] + picgetgot ecx + mov eax, [esp + 8] + mov edx, [esp + 4] movq mm2, [eax + 16] movq mm3, [eax + 15] movq mm1, [eax + 14] @@ -244,13 +241,12 @@ cglobal predict_8x8_vr_core_mmxext movq [edx + Y *FDEC_STRIDE], mm3 movq [edx + (Y+1)*FDEC_STRIDE], mm0 - picpop ebx ret ;----------------------------------------------------------------------------- ; void predict_8x8c_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_8x8c_v_mmx +cglobal predict_8x8c_v_mmx mov edx, [esp + 4] movq mm0, [edx - FDEC_STRIDE] STORE8x8 mm0, mm0 @@ -260,10 +256,9 @@ cglobal predict_8x8c_v_mmx ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 ) ;----------------------------------------------------------------------------- cglobal predict_8x8c_dc_core_mmxext - picpush ebx - picgetgot ebx + picgetgot ecx - mov edx, [picesp + 4] + mov edx, [esp + 4] movq mm0, [edx - FDEC_STRIDE] pxor mm1, mm1 @@ -273,10 +268,10 @@ cglobal predict_8x8c_dc_core_mmxext psadbw mm1, mm2 ; s1 psadbw mm0, mm2 ; s0 - paddw mm0, [picesp + 8] - pshufw mm2, [picesp + 12], 0 + paddw mm0, [esp + 8] + pshufw mm2, [esp + 12], 0 psrlw mm0, 3 - paddw mm1, [pw_2 GOT_ebx] + paddw mm1, [pw_2 GOT_ecx] movq mm3, mm2 pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) @@ -289,24 +284,20 @@ cglobal predict_8x8c_dc_core_mmxext packuswb mm2, mm3 ; dc2,dc3 (b) STORE8x8 mm0, mm2 - - picpop ebx ret ;----------------------------------------------------------------------------- ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- cglobal predict_8x8c_p_core_mmxext - picpush ebx - picgetgot ebx - - mov edx, [picesp + 4] - mov ecx, FDEC_STRIDE - pshufw mm0, [picesp + 8], 0 - pshufw mm2, [picesp +12], 0 - pshufw mm4, [picesp +16], 0 + picgetgot ecx + + mov edx, [esp + 4] + pshufw mm0, [esp + 8], 0 + pshufw mm2, [esp +12], 0 + pshufw mm4, [esp +16], 0 movq mm1, mm2 - pmullw mm2, [pw_3210 GOT_ebx] + pmullw mm2, [pw_3210 GOT_ecx] psllw mm1, 2 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, 
i+2*b, i+3*b} paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} @@ -323,29 +314,26 @@ ALIGN 4 paddsw mm0, mm4 paddsw mm1, mm4 - add edx, ecx + add edx, FDEC_STRIDE dec eax jg .loop nop - picpop ebx ret ;----------------------------------------------------------------------------- ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- cglobal predict_16x16_p_core_mmxext - picpush ebx - picgetgot ebx - - mov edx, [picesp + 4] - mov ecx, FDEC_STRIDE - pshufw mm0, [picesp + 8], 0 - pshufw mm2, [picesp +12], 0 - pshufw mm4, [picesp +16], 0 + picgetgot ecx + + mov edx, [esp + 4] + pshufw mm0, [esp + 8], 0 + pshufw mm2, [esp +12], 0 + pshufw mm4, [esp +16], 0 movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210 GOT_ebx] + pmullw mm5, [pw_3210 GOT_ecx] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -375,91 +363,47 @@ ALIGN 4 paddsw mm1, mm4 paddsw mm2, mm4 paddsw mm3, mm4 - add edx, ecx + add edx, FDEC_STRIDE dec eax jg .loop nop - picpop ebx ret ;----------------------------------------------------------------------------- ; void predict_16x16_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_v_mmx +cglobal predict_16x16_v_mmx mov edx, [esp + 4] - mov ecx, FDEC_STRIDE - sub edx, ecx ; edx <-- line -1 - - movq mm0, [edx] - movq mm1, [edx + 8] - lea eax, [ecx + 2*ecx] ; eax <-- 3* stride - - SAVE_0_1 (edx + ecx) ; 0 - SAVE_0_1 (edx + 2 * ecx) ; 1 - SAVE_0_1 (edx + eax) ; 2 - SAVE_0_1 (edx + 4 * ecx) ; 3 - SAVE_0_1 (edx + 2 * eax) ; 5 - SAVE_0_1 (edx + 8 * ecx) ; 7 - SAVE_0_1 (edx + 4 * eax) ; 11 - add edx, ecx ; edx <-- line 0 - SAVE_0_1 (edx + 4 * ecx) ; 4 - SAVE_0_1 (edx + 2 * eax) ; 6 - SAVE_0_1 (edx + 8 * ecx) ; 8 - SAVE_0_1 (edx + 4 * eax) ; 12 - lea edx, [edx + 8 * ecx] ; edx <-- line 8 - SAVE_0_1 (edx + ecx) ; 9 - SAVE_0_1 (edx + 2 * ecx) ; 10 - lea edx, [edx + 4 * ecx] ; edx <-- line 12 - SAVE_0_1 (edx + ecx) ; 13 - SAVE_0_1 (edx + 2 * ecx) ; 14 - SAVE_0_1 (edx + eax) ; 15 - + movq mm0, [edx - FDEC_STRIDE] + movq mm1, [edx + 8 - FDEC_STRIDE] + STORE16x16 mm0, mm1 ret ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left ) ;----------------------------------------------------------------------------- -%macro PRED16x16_DC 3 - mov edx, [%3 + 4] - mov ecx, FDEC_STRIDE - sub edx, ecx ; edx <-- line -1 - +%macro PRED16x16_DC 2 + mov edx, [esp+4] pxor mm0, mm0 pxor mm1, mm1 - psadbw mm0, [edx] - psadbw mm1, [edx + 8] + psadbw mm0, [edx - FDEC_STRIDE] + psadbw mm1, [edx - FDEC_STRIDE + 8] paddusw mm0, mm1 - paddusw mm0, %1 ; FIXME is stack alignment guaranteed? 
+ paddusw mm0, %1 psrlw mm0, %2 ; dc - push edi pshufw mm0, mm0, 0 - lea eax, [ecx + 2*ecx] ; eax <-- 3* stride packuswb mm0, mm0 ; dc in bytes - - mov edi, 4 -ALIGN 4 -.loop: - SAVE_0_0 (edx + ecx) ; 0 - SAVE_0_0 (edx + 2 * ecx) ; 1 - SAVE_0_0 (edx + eax) ; 2 - SAVE_0_0 (edx + 4 * ecx) ; 3 - dec edi - lea edx, [edx + 4 * ecx] - jg .loop - - pop edi + STORE16x16 mm0, mm0 %endmacro cglobal predict_16x16_dc_core_mmxext - PRED16x16_DC [esp+8], 5, esp + PRED16x16_DC [esp+8], 5 ret cglobal predict_16x16_dc_top_mmxext - picpush ebx - picgetgot ebx - PRED16x16_DC [pw_8 GOT_ebx], 4, picesp - picpop ebx + picgetgot ecx + PRED16x16_DC [pw_8 GOT_ecx], 4 ret diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm index 40f9df93..dc7611ae 100644 --- a/common/i386/quant-a.asm +++ b/common/i386/quant-a.asm @@ -275,11 +275,9 @@ cglobal %1 .rshift32: neg eax - picpush ebx - picgetgot ebx - movq mm6, [pd_1 GOT_ebx] - picpop ebx movd mm5, eax + picgetgot eax + movq mm6, [pd_1 GOT_eax] pxor mm7, mm7 pslld mm6, mm5 psrld mm6, 1
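
A minimal sketch of the two patterns this patch applies throughout: addressing rows with immediate multiples of the constant FDEC_STRIDE, and getting the GOT pointer into a free caller-saved register instead of spilling ebx with picpush/picpop. It reuses macros and constants already defined in these files (cglobal, picgetgot, GOT_*, FDEC_STRIDE, pw_8); the symbol name x264_sketch_fill8x8_mmxext, the register choices, and the fill value are hypothetical, for illustration only, and are not part of the commit.

;-----------------------------------------------------------------------------
; sketch only: fill an 8x8 block of fdec with a PIC-loaded constant
;-----------------------------------------------------------------------------
cglobal x264_sketch_fill8x8_mmxext
    picgetgot ecx                       ; GOT pointer in a caller-saved reg:
                                        ; no picpush/picpop spill of ebx
    mov      edx, [esp + 4]             ; dst (plain esp: nothing was pushed,
                                        ; so no picesp offset adjustment)
    movq     mm0, [pw_8 GOT_ecx]        ; PIC-safe constant load
    packuswb mm0, mm0                   ; example fill value in bytes
    mov      eax, 2
.loop:
    movq     [edx + 0*FDEC_STRIDE], mm0 ; FDEC_STRIDE is a compile-time
    movq     [edx + 1*FDEC_STRIDE], mm0 ; constant, so each row is an
    movq     [edx + 2*FDEC_STRIDE], mm0 ; immediate offset; no stride register
    movq     [edx + 3*FDEC_STRIDE], mm0 ; or lea arithmetic needed
    add      edx, 4*FDEC_STRIDE         ; advance four rows per iteration
    dec      eax
    jg .loop
    ret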