From: Loren Merritt Date: Wed, 3 Aug 2011 14:53:29 +0000 (+0000) Subject: asm cosmetics part 2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1921c6824e37bdf5a8436a6cbe36b0d3a8c376b3;p=libx264 asm cosmetics part 2 These changes were split out of the cpuflags commit because they change the output executable. --- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index c3f52ce1..9db4ee31 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -359,7 +359,7 @@ INIT_MMX ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro SUB_NxN_DCT 6 -cglobal %1, 3,3,11*(mmsize/16) +cglobal %1, 3,3,11 %ifndef HIGH_BIT_DEPTH %if mmsize == 8 pxor m7, m7 @@ -398,9 +398,9 @@ cglobal %1, 3,3,11*(mmsize/16) ;----------------------------------------------------------------------------- %macro ADD_NxN_IDCT 6-7 %ifdef HIGH_BIT_DEPTH -cglobal %1, 2,2,6*(mmsize/16) +cglobal %1, 2,2,6 %else -cglobal %1, 2,2,11*(mmsize/16) +cglobal %1, 2,2,11 pxor m7, m7 %endif %if mmsize==16 @@ -661,6 +661,7 @@ cglobal add16x16_idct_dc_mmx, 2,3 movdqa [r0+%1+FDEC_STRIDE*3], xmm7 %endmacro +INIT_XMM cglobal add16x16_idct_dc_sse2, 2,2,8 call .loop add r0, FDEC_STRIDE*4 @@ -939,7 +940,7 @@ SCAN_8x8 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] ) ;----------------------------------------------------------------------------- %macro SCAN_8x8_FRAME 5 -cglobal zigzag_scan_8x8_frame, 2,2,8*(mmsize/16) +cglobal zigzag_scan_8x8_frame, 2,2,8 mova m0, [r1] mova m1, [r1+ 8*SIZEOF_DCTCOEF] movu m2, [r1+14*SIZEOF_DCTCOEF] @@ -1149,7 +1150,7 @@ cglobal zigzag_scan_4x4_field_mmx2, 2,3 ; 54 55 58 59 60 61 62 63 %undef SCAN_8x8 %macro SCAN_8x8 5 -cglobal zigzag_scan_8x8_field, 2,3,8*(mmsize/16) +cglobal zigzag_scan_8x8_field, 2,3,8 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08 @@ -1330,7 +1331,7 @@ ZIGZAG_SUB_4x4 ac, field %endmacro %macro ZIGZAG_8x8_CAVLC 1 -cglobal zigzag_interleave_8x8_cavlc, 3,3,8*(mmsize/16) +cglobal zigzag_interleave_8x8_cavlc, 3,3,8 INTERLEAVE 0, %1 INTERLEAVE 8, %1 INTERLEAVE 16, %1 diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 617166b7..c6a2a6bf 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -162,7 +162,7 @@ cextern pw_pixel_max ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma, 5,5,8*(mmsize/16) +cglobal deblock_v_luma, 5,5,8 %assign pad 5*mmsize+12-(stack_offset&15) %define tcm [rsp] %define ms1 [rsp+mmsize] @@ -216,7 +216,7 @@ cglobal deblock_v_luma, 5,5,8*(mmsize/16) ADD rsp, pad RET -cglobal deblock_h_luma, 5,6,8*(mmsize/16) +cglobal deblock_h_luma, 5,6,8 %assign pad 7*mmsize+12-(stack_offset&15) %define tcm [rsp] %define ms1 [rsp+mmsize] @@ -724,7 +724,7 @@ DEBLOCK_LUMA_INTRA_64 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16) +cglobal deblock_v_luma_intra, 4,7,8 LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -750,7 +750,7 @@ cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16) 
;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7,8*(mmsize/16) +cglobal deblock_h_luma_intra, 4,7,8 LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -1673,11 +1673,22 @@ DEBLOCK_LUMA_INTRA v8 mova [r0+2*r1], m2 %endmacro -%macro DEBLOCK_CHROMA 1 +%macro DEBLOCK_CHROMA 0 +cglobal deblock_inter_body + RESET_MM_PERMUTATION + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + LOAD_TC m6, r4 + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + ret + ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma, 7,7,8*(mmsize/16) +cglobal deblock_v_chroma, 7,7,8 FIX_STRIDES r1 mov r5, r0 sub r0, r1 @@ -1685,7 +1696,7 @@ cglobal deblock_v_chroma, 7,7,8*(mmsize/16) mov r6, 32/mmsize .loop: CHROMA_V_LOAD r5 - call deblock_inter_body_%1 + call deblock_inter_body CHROMA_V_STORE add r0, mmsize add r5, mmsize @@ -1697,7 +1708,7 @@ cglobal deblock_v_chroma, 7,7,8*(mmsize/16) ;----------------------------------------------------------------------------- ; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma, 5,7,8*(mmsize/16) +cglobal deblock_h_chroma, 5,7,8 add r1, r1 mov r5, 32/mmsize %if mmsize == 16 @@ -1705,7 +1716,7 @@ cglobal deblock_h_chroma, 5,7,8*(mmsize/16) %endif .loop: CHROMA_H_LOAD r6 - call deblock_inter_body_%1 + call deblock_inter_body CHROMA_H_STORE r6 lea r0, [r0+r1*(mmsize/4)] add r4, mmsize/8 @@ -1713,21 +1724,18 @@ cglobal deblock_h_chroma, 5,7,8*(mmsize/16) jg .loop REP_RET -deblock_inter_body_%1: + +cglobal deblock_intra_body RESET_MM_PERMUTATION LOAD_AB m4, m5, r2, r3 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 - pxor m4, m4 - LOAD_TC m6, r4 - pmaxsw m6, m4 - pand m7, m6 - DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ret ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16) +cglobal deblock_v_chroma_intra, 4,6,8 add r1, r1 mov r5, 32/mmsize movd m5, r3 @@ -1737,7 +1745,7 @@ cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16) SPLATW m5, m5 .loop: CHROMA_V_LOAD r4 - call deblock_intra_body_%1 + call deblock_intra_body CHROMA_V_STORE add r0, mmsize add r4, mmsize @@ -1748,7 +1756,7 @@ cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16) ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16) +cglobal deblock_h_chroma_intra, 4,6,8 add r1, r1 mov r4, 32/mmsize %if mmsize == 16 @@ -1756,29 +1764,22 @@ cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16) %endif .loop: CHROMA_H_LOAD r5 - call deblock_intra_body_%1 + call deblock_intra_body CHROMA_H_STORE r5 lea r0, 
[r0+r1*(mmsize/4)] dec r4 jg .loop REP_RET - -deblock_intra_body_%1: - RESET_MM_PERMUTATION - LOAD_AB m4, m5, r2, r3 - LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 - CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 - ret %endmacro %ifndef ARCH_X86_64 INIT_MMX mmx2 -DEBLOCK_CHROMA mmx2 +DEBLOCK_CHROMA %endif INIT_XMM sse2 -DEBLOCK_CHROMA sse2 +DEBLOCK_CHROMA INIT_XMM avx -DEBLOCK_CHROMA avx +DEBLOCK_CHROMA %endif ; HIGH_BIT_DEPTH %ifndef HIGH_BIT_DEPTH @@ -1834,7 +1835,16 @@ DEBLOCK_CHROMA avx %define t5 r5 %define t6 r6 -%macro DEBLOCK_CHROMA 1 +%macro DEBLOCK_CHROMA 0 +cglobal chroma_inter_body + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + ret + ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- @@ -1844,7 +1854,7 @@ cglobal deblock_v_chroma, 5,6,8 mova m1, [t5+r1] mova m2, [r0] mova m3, [r0+r1] - call chroma_inter_body_%1 + call chroma_inter_body mova [t5+r1], m1 mova [r0], m2 CHROMA_V_LOOP 1 @@ -1856,30 +1866,19 @@ cglobal deblock_v_chroma, 5,6,8 cglobal deblock_h_chroma, 5,7,8 CHROMA_H_START TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_inter_body_%1 + call chroma_inter_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) CHROMA_H_LOOP 1 RET - -ALIGN 16 -RESET_MM_PERMUTATION -chroma_inter_body_%1: - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - ret %endmacro ; DEBLOCK_CHROMA INIT_XMM sse2 -DEBLOCK_CHROMA sse2 +DEBLOCK_CHROMA INIT_XMM avx -DEBLOCK_CHROMA avx +DEBLOCK_CHROMA %ifndef ARCH_X86_64 INIT_MMX mmx2 -DEBLOCK_CHROMA mmx2 +DEBLOCK_CHROMA %endif @@ -1896,7 +1895,21 @@ DEBLOCK_CHROMA mmx2 %define t5 r4 %define t6 r5 -%macro DEBLOCK_CHROMA_INTRA 1 +%macro DEBLOCK_CHROMA_INTRA 0 +cglobal chroma_intra_body + LOAD_MASK r2d, r3d + mova m5, m1 + mova m6, m2 + CHROMA_INTRA_P0 m1, m0, m3 + CHROMA_INTRA_P0 m2, m3, m0 + psubb m1, m5 + psubb m2, m6 + pand m1, m7 + pand m2, m7 + paddb m1, m5 + paddb m2, m6 + ret + ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- @@ -1906,7 +1919,7 @@ cglobal deblock_v_chroma_intra, 4,5,8 mova m1, [t5+r1] mova m2, [r0] mova m3, [r0+r1] - call chroma_intra_body_%1 + call chroma_intra_body mova [t5+r1], m1 mova [r0], m2 CHROMA_V_LOOP 0 @@ -1918,35 +1931,19 @@ cglobal deblock_v_chroma_intra, 4,5,8 cglobal deblock_h_chroma_intra, 4,6,8 CHROMA_H_START TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_intra_body_%1 + call chroma_intra_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) CHROMA_H_LOOP 0 RET - -ALIGN 16 -RESET_MM_PERMUTATION -chroma_intra_body_%1: - LOAD_MASK r2d, r3d - mova m5, m1 - mova m6, m2 - CHROMA_INTRA_P0 m1, m0, m3 - CHROMA_INTRA_P0 m2, m3, m0 - psubb m1, m5 - psubb m2, m6 - pand m1, m7 - pand m2, m7 - paddb m1, m5 - paddb m2, m6 - ret %endmacro ; DEBLOCK_CHROMA_INTRA INIT_XMM sse2 -DEBLOCK_CHROMA_INTRA sse2 +DEBLOCK_CHROMA_INTRA INIT_XMM avx -DEBLOCK_CHROMA_INTRA avx +DEBLOCK_CHROMA_INTRA %ifndef ARCH_X86_64 INIT_MMX mmx2 -DEBLOCK_CHROMA_INTRA mmx2 +DEBLOCK_CHROMA_INTRA %endif %endif ; !HIGH_BIT_DEPTH diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 09c5fbbd..4555526a 100644 --- a/common/x86/mc-a.asm 
+++ b/common/x86/mc-a.asm @@ -430,7 +430,7 @@ AVG_WEIGHT 16, 7 %endif %macro WEIGHTER 1 - cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16) + cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS FIX_STRIDES r1, r3 WEIGHT_START %1 LOAD_HEIGHT @@ -695,7 +695,7 @@ AVGH 4, 2 ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 -cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16) +cglobal pixel_avg2_w%1, 6,7,4 sub r4, r2 lea r6, [r4+r3*2] .height_loop: @@ -720,7 +720,7 @@ cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16) %endmacro %macro AVG2_W_TWO 3 -cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16) +cglobal pixel_avg2_w%1, 6,7,8 sub r4, r2 lea r6, [r4+r3*2] .height_loop: @@ -1203,7 +1203,7 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k ; pixel copy ;============================================================================= -%macro COPY4 2-* +%macro COPY1 2 movu m0, [r2] movu m1, [r2+r3] movu m2, [r2+r3*2] @@ -1214,27 +1214,28 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k mova [r0+%1], m3 %endmacro -%macro COPY_ONE 4 - COPY4 %1, %2 +%macro COPY2 2-4 0, 1 + movu m0, [r2+%3*mmsize] + movu m1, [r2+%4*mmsize] + movu m2, [r2+r3+%3*mmsize] + movu m3, [r2+r3+%4*mmsize] + movu m4, [r2+r3*2+%3*mmsize] + movu m5, [r2+r3*2+%4*mmsize] + movu m6, [r2+%2+%3*mmsize] + movu m7, [r2+%2+%4*mmsize] + mova [r0+%3*mmsize], m0 + mova [r0+%4*mmsize], m1 + mova [r0+r1+%3*mmsize], m2 + mova [r0+r1+%4*mmsize], m3 + mova [r0+r1*2+%3*mmsize], m4 + mova [r0+r1*2+%4*mmsize], m5 + mova [r0+%1+%3*mmsize], m6 + mova [r0+%1+%4*mmsize], m7 %endmacro -%macro COPY_TWO 4 - movu m0, [r2+%3] - movu m1, [r2+%4] - movu m2, [r2+r3+%3] - movu m3, [r2+r3+%4] - movu m4, [r2+r3*2+%3] - movu m5, [r2+r3*2+%4] - movu m6, [r2+%2+%3] - movu m7, [r2+%2+%4] - mova [r0+%3], m0 - mova [r0+%4], m1 - mova [r0+r1+%3], m2 - mova [r0+r1+%4], m3 - mova [r0+r1*2+%3], m4 - mova [r0+r1*2+%4], m5 - mova [r0+%1+%3], m6 - mova [r0+%1+%4], m7 +%macro COPY4 2 + COPY2 %1, %2, 0, 1 + COPY2 %1, %2, 2, 3 %endmacro ;----------------------------------------------------------------------------- @@ -1252,76 +1253,38 @@ cglobal mc_copy_w4_mmx, 4,6 %define mova movd %define movu movd %endif - COPY4 r4, r5 + COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] .end: - COPY4 r4, r5 + COPY1 r4, r5 RET -%ifdef HIGH_BIT_DEPTH -cglobal mc_copy_w16_mmx, 5,7 +%macro MC_COPY 1 +%assign %%w %1*SIZEOF_PIXEL/mmsize +%if %%w > 0 +cglobal mc_copy_w%1, 5,7,8*(%%w/2) FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: - COPY_TWO r5, r6, mmsize*0, mmsize*1 - COPY_TWO r5, r6, mmsize*2, mmsize*3 - sub r4d, 4 + COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] - jg .height_loop - REP_RET - -%macro MC_COPY 2 -cglobal mc_copy_w%2, 5,7,%2-8 - FIX_STRIDES r1, r3 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY_%1 r5, r6, 0, mmsize sub r4d, 4 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] jg .height_loop REP_RET +%endif %endmacro INIT_MMX mmx -MC_COPY TWO, 8 -INIT_XMM sse2 -MC_COPY ONE, 8 -MC_COPY TWO, 16 -INIT_XMM aligned, sse2 -MC_COPY TWO, 16 -%endif ; HIGH_BIT_DEPTH - -%ifndef HIGH_BIT_DEPTH -%macro MC_COPY 2 -cglobal mc_copy_w%2, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - %1 r5, r6, 0, mmsize - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET -%endmacro - -INIT_MMX mmx -MC_COPY COPY4, 8 -MC_COPY COPY_TWO, 16 +MC_COPY 8 +MC_COPY 16 INIT_XMM sse2 -MC_COPY COPY4, 16 -; cacheline split with mmx has too much overhead; the speed benefit is near-zero. 
-; but with SSE3 the overhead is zero, so there's no reason not to include it. -INIT_XMM sse3 -MC_COPY COPY4, 16 +MC_COPY 8 +MC_COPY 16 INIT_XMM aligned, sse2 -MC_COPY COPY4, 16 -%endif ; !HIGH_BIT_DEPTH +MC_COPY 16 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 17ca51a0..1295662c 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -151,7 +151,7 @@ cextern pd_ffff ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width ); ;----------------------------------------------------------------------------- %macro HPEL_FILTER 0 -cglobal hpel_filter_v, 5,6,11*(mmsize/16) +cglobal hpel_filter_v, 5,6,11 FIX_STRIDES r3d, r4d %ifdef WIN64 movsxd r4, r4d @@ -211,7 +211,7 @@ cglobal hpel_filter_v, 5,6,11*(mmsize/16) ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_c, 3,3,10*(mmsize/16) +cglobal hpel_filter_c, 3,3,10 add r2, r2 add r0, r2 lea r1, [r1+r2] @@ -260,7 +260,7 @@ cglobal hpel_filter_c, 3,3,10*(mmsize/16) ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h, 3,4,8*(mmsize/16) +cglobal hpel_filter_h, 3,4,8 %define src r1+r2 add r2, r2 add r0, r2 @@ -370,6 +370,7 @@ cglobal hpel_filter_v, 5,6,%1 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); ;----------------------------------------------------------------------------- +INIT_MMX cglobal hpel_filter_c_mmx2, 3,3 add r0, r2 lea r1, [r1+r2*2] @@ -1480,7 +1481,7 @@ cglobal integral_init4v_ssse3, 3,5 ; int src_stride, int dst_stride, int width, int height ) ;----------------------------------------------------------------------------- %macro FRAME_INIT_LOWRES 0 -cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise +cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise %ifdef HIGH_BIT_DEPTH shl dword r6m, 1 FIX_STRIDES r5d diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index b3a01a74..5d62114e 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -85,7 +85,6 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int ); void x264_mc_copy_w8_aligned_sse2( pixel *, int, pixel *, int, int ); void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int ); void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int ); void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int ); void x264_prefetch_ref_mmx2( uint8_t *, int, int ); diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 831794c0..981ae684 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -75,7 +75,7 @@ cextern hsub_mul ; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6*(mmsize/16) +cglobal pixel_ssd_%1x%2, 4,5,6 mov r4, %1*%2/mmsize pxor m0, m0 .loop @@ -306,11 +306,7 @@ cglobal pixel_ssd_%1x%2, 0,0,0 .startloop: %ifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 -%if 
cpuflag(ssse3) ; FIXME wrong, but correcting this modifies the binary PROLOGUE 0,0,8 -%else - PROLOGUE 0,0,8*(mmsize/16) -%endif %else PROLOGUE 0,5 DECLARE_REG_TMP 1,2,3,4 @@ -402,7 +398,7 @@ SSD 4, 8 ;----------------------------------------------------------------------------- %ifdef HIGH_BIT_DEPTH %macro SSD_NV12 0 -cglobal pixel_ssd_nv12_core, 6,7,7*(mmsize/16) +cglobal pixel_ssd_nv12_core, 6,7,7 shl r4d, 2 FIX_STRIDES r1, r3 add r0, r4 @@ -1575,7 +1571,7 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16 ABS2 m10, m11, m12, m13 paddusw m8, m10 paddusw m9, m11 -%ifidn cpuname, ssse3 +%if cpuflag(ssse3) pabsw m10, m6 pabsw m11, m7 pabsw m15, m1 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index c0f1efba..bb1fa940 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -260,7 +260,7 @@ cglobal predict_4x4_ddr, 1,1 %endrep RET -cglobal predict_4x4_vr, 1,1,6*(mmsize/16) +cglobal predict_4x4_vr, 1,1,6 movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 mova m5, m0 %ifdef HIGH_BIT_DEPTH @@ -296,7 +296,7 @@ cglobal predict_4x4_vr, 1,1,6*(mmsize/16) movh [r0+3*FDEC_STRIDEB], m3 RET -cglobal predict_4x4_hd, 1,1,6*(mmsize/16) +cglobal predict_4x4_hd, 1,1,6 movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt .. %ifdef HIGH_BIT_DEPTH movh m1, [r0-1*FDEC_STRIDEB] @@ -490,7 +490,7 @@ cglobal predict_4x4_hu_mmx2, 1,1 ; void predict_4x4_vl( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4_V1 3 -cglobal predict_4x4_vl, 1,1,6*(mmsize/16) +cglobal predict_4x4_vl, 1,1,6 movu m1, [r0-FDEC_STRIDEB] psrl%1 m3, m1, %2 psrl%1 m2, m1, %2*2 @@ -598,7 +598,7 @@ cglobal predict_4x4_dc_mmx2, 1,4 ;----------------------------------------------------------------------------- ;void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_filter, 4,5,7*(mmsize/16) +cglobal predict_8x8_filter, 4,5,7 add r0, 0x58*SIZEOF_PIXEL %define src r0-0x58*SIZEOF_PIXEL %ifndef ARCH_X86_64 @@ -830,7 +830,7 @@ PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl, 2,2,8*(mmsize/16) +cglobal predict_8x8_ddl, 2,2,8 mova m5, [r1+16*SIZEOF_PIXEL] movu m2, [r1+17*SIZEOF_PIXEL] movu m3, [r1+23*SIZEOF_PIXEL] @@ -863,7 +863,7 @@ cglobal predict_8x8_ddl, 2,2,8*(mmsize/16) ; void predict_8x8_ddr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %if avx_enabled == 0 -cglobal predict_8x8_ddr, 2,2,7*(mmsize/16) +cglobal predict_8x8_ddr, 2,2,7 movu m1, [r1+ 7*SIZEOF_PIXEL] movu m2, [r1+ 9*SIZEOF_PIXEL] movu m3, [r1+15*SIZEOF_PIXEL] @@ -904,7 +904,7 @@ PREDICT_8x8 b, q , 8 ; void predict_8x8_hu( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HU 5 -cglobal predict_8x8_hu, 2,2,8*(mmsize/16) +cglobal predict_8x8_hu, 2,2,8 movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 add r0, 4*FDEC_STRIDEB pshuf%3 m0, m1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 @@ -960,7 +960,7 @@ PREDICT_8x8_HU b, q , w, bw, 8 ; void predict_8x8_vr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_VR 3 -cglobal predict_8x8_vr, 2,3,7*(mmsize/16) +cglobal 
predict_8x8_vr, 2,3,7 mova m2, [r1+16*SIZEOF_PIXEL] movu m3, [r1+15*SIZEOF_PIXEL] movu m1, [r1+14*SIZEOF_PIXEL] @@ -1316,7 +1316,7 @@ PREDICT_8x8 ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HD 4 -cglobal predict_8x8_hd, 2,2,8*(mmsize/16) +cglobal predict_8x8_hd, 2,2,8 add r0, 4*FDEC_STRIDEB mova m0, [r1] ; l7 .. .. .. .. .. .. .. mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 124b0a79..05b453b5 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -83,24 +83,20 @@ cextern pd_1 cextern pb_01 cextern pd_1024 -%macro QUANT_DC_START_MMX 0 +%macro QUANT_DC_START 0 movd m6, r1m ; mf movd m7, r2m ; bias %ifdef HIGH_BIT_DEPTH SPLATD m6, m6 SPLATD m7, m7 -%else - SPLATW m6, m6 - SPLATW m7, m7 -%endif ; HIGH_BIT_DEPTH -%endmacro - -%macro QUANT_DC_START_SSSE3 0 +%elif cpuflag(sse4) ; ssse3, but not faster on conroe movdqa m5, [pb_01] - movd m6, r1m ; mf - movd m7, r2m ; bias pshufb m6, m5 pshufb m7, m5 +%else + SPLATW m6, m6 + SPLATW m7, m7 +%endif %endmacro ; PABSW mmx and PSIGNW mmx do not individually perform the same operations as @@ -304,8 +300,8 @@ cextern pd_1024 ; int quant_2x2( int32_t dct[M*N], int mf, int bias ) ;----------------------------------------------------------------------------- %macro QUANT_DC 2 -cglobal quant_%1x%2_dc, 3,3,8*(mmsize/16) - QUANT_DC_START_MMX +cglobal quant_%1x%2_dc, 3,3,8 + QUANT_DC_START %if %1*%2 <= mmsize/4 QUANT_ONE_DC r0, m6, m7, 0 %else @@ -323,7 +319,7 @@ cglobal quant_%1x%2_dc, 3,3,8*(mmsize/16) ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] ) ;----------------------------------------------------------------------------- %macro QUANT_AC 2 -cglobal quant_%1x%2, 3,3,8*(mmsize/16) +cglobal quant_%1x%2, 3,3,8 %assign x 0 %rep %1*%2/(mmsize/2) QUANT_TWO_AC r0+x, r1+x, r2+x, x @@ -427,7 +423,6 @@ cglobal %1, 3,3 %endmacro INIT_MMX mmx2 -%define QUANT_DC_START QUANT_DC_START_MMX QUANT_DC quant_2x2_dc, 1 %ifndef ARCH_X86_64 ; not needed because sse2 is faster QUANT_DC quant_4x4_dc, 4 @@ -451,7 +446,6 @@ QUANT_DC quant_2x2_dc, 1 INIT_XMM sse4 ;Not faster on Conroe, so only used in SSE4 versions -%define QUANT_DC_START QUANT_DC_START_SSSE3 QUANT_DC quant_4x4_dc, 2, 8 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 @@ -567,7 +561,7 @@ QUANT_AC quant_8x8, 8 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT 3 -cglobal dequant_%1x%1, 0,3,6*(mmsize/16) +cglobal dequant_%1x%1, 0,3,6 .skip_prologue: DEQUANT_START %2+2, %2 @@ -630,11 +624,9 @@ cglobal dequant_%1x%1_flat16, 0,3 %ifdef HIGH_BIT_DEPTH INIT_XMM sse2 DEQUANT 4, 4, 1 -INIT_XMM sse4 -DEQUANT 4, 4, 1 -INIT_XMM sse2 DEQUANT 8, 6, 1 INIT_XMM sse4 +DEQUANT 4, 4, 1 DEQUANT 8, 6, 1 %else %ifndef ARCH_X86_64 @@ -651,7 +643,7 @@ DEQUANT 8, 6, 2 %endif %macro DEQUANT_DC 2 -cglobal dequant_4x4dc, 0,3,6*(mmsize/16) +cglobal dequant_4x4dc, 0,3,6 DEQUANT_START 6, 6 .lshift: @@ -855,7 +847,7 @@ OPTIMIZE_CHROMA_DC ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 -cglobal denoise_dct, 4,4,8*(mmsize/16) +cglobal denoise_dct, 4,4,8 pxor m6, m6 .loop: sub r3, mmsize/2 @@ -900,7 +892,7 @@ DENOISE_DCT ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) 
;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 -cglobal denoise_dct, 4,4,7*(mmsize/16) +cglobal denoise_dct, 4,4,7 pxor m6, m6 .loop: sub r3, mmsize diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 97bf1a57..8ed96789 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -697,7 +697,7 @@ INTRA_SAD_8x8C ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score %macro INTRA_SAD16 0 -cglobal intra_sad_x3_16x16, 3,5,8*(mmsize/16) +cglobal intra_sad_x3_16x16, 3,5,8 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1-FDEC_STRIDE+0] diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index dcc5e9f3..ba8a51ab 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -297,6 +297,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 + %if mmsize == 8 + %assign xmm_regs_used 0 + %endif ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 sub rsp, (xmm_regs_used-6)*16+16 @@ -641,7 +644,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits INIT_CPUFLAGS %1 %endmacro -INIT_MMX +INIT_XMM ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm index 0df0217c..e737e052 100644 --- a/tools/checkasm-a.asm +++ b/tools/checkasm-a.asm @@ -59,6 +59,7 @@ cextern_naked puts ;----------------------------------------------------------------------------- ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- +INIT_XMM cglobal checkasm_call, 4,7,16 sub rsp, max_args*8 %assign stack_offset stack_offset+max_args*8
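
Note on the register-count change running through this patch: x86inc's WIN64_SPILL_XMM now forces xmm_regs_used to 0 when mmsize == 8 (see the x86inc.asm hunk above), so the old 11*(mmsize/16)-style expressions in cglobal declarations are redundant and the XMM count can be written unconditionally. The following is a minimal sketch of that effect only; the function names copy8/copy16 are hypothetical and are not part of this commit or of x264.

%include "x86inc.asm"

SECTION .text

; Hypothetical illustration -- neither function exists in x264.
; The third cglobal argument (the XMM register count) is ignored for MMX
; functions, so MMX and XMM variants can share one declaration without
; the *(mmsize/16) factor.

INIT_MMX mmx2
cglobal copy8, 2,2,8        ; the "8" has no effect here: xmm_regs_used is forced to 0
    movq   m0, [r1]
    movq [r0], m0
    RET

INIT_XMM sse2
cglobal copy16, 2,2,8       ; here 8 XMM registers are declared, so on WIN64 the
    movdqu m0, [r1]         ; prologue spills xmm6/xmm7 exactly as before
    movdqu [r0], m0
    RET

In the same spirit, shared bodies such as deblock_inter_body and chroma_intra_body are now declared with cglobal inside the macros, so the cpuflag suffix set by INIT_MMX/INIT_XMM is applied automatically and the DEBLOCK_CHROMA macros no longer need to take a cpu-name parameter.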