From b597966bfa8a481489e5af93eb25988456c51a5d Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Fri, 12 Aug 2011 19:13:07 +0000 Subject: [PATCH] Optimize x86 intra_sa8d_x3_8x8 ~40% faster. Also some other minor asm cosmetics. --- common/x86/const-a.asm | 3 + common/x86/pixel-32.asm | 152 +++++++++++++++++++++++----------------- common/x86/pixel-a.asm | 100 +++++++++++++++++--------- common/x86/pixel.h | 4 -- common/x86/predict-c.c | 62 +--------------- 5 files changed, 163 insertions(+), 158 deletions(-) diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index 999afc5f..2f2492c3 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -48,6 +48,9 @@ const pw_32_0, times 4 dw 32, const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) +const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 +const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 +const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm index fd7e4c05..316c0fe8 100644 --- a/common/x86/pixel-32.asm +++ b/common/x86/pixel-32.asm @@ -27,6 +27,9 @@ %include "x86inc.asm" %include "x86util.asm" +cextern pw_ppmmppmm +cextern pw_pmpmpmpm + SECTION .text INIT_MMX mmx2 @@ -151,37 +154,71 @@ cglobal pixel_sa8d_8x8_internal %macro LOAD_4x8P 1 ; dx pxor m7, m7 - movd m6, [eax+%1+7*FENC_STRIDE] - movd m0, [eax+%1+0*FENC_STRIDE] - movd m1, [eax+%1+1*FENC_STRIDE] - movd m2, [eax+%1+2*FENC_STRIDE] - movd m3, [eax+%1+3*FENC_STRIDE] - movd m4, [eax+%1+4*FENC_STRIDE] - movd m5, [eax+%1+5*FENC_STRIDE] + movd m6, [r0+%1+7*FENC_STRIDE] + movd m0, [r0+%1+0*FENC_STRIDE] + movd m1, [r0+%1+1*FENC_STRIDE] + movd m2, [r0+%1+2*FENC_STRIDE] + movd m3, [r0+%1+3*FENC_STRIDE] + movd m4, [r0+%1+4*FENC_STRIDE] + movd m5, [r0+%1+5*FENC_STRIDE] punpcklbw m6, m7 punpcklbw m0, m7 punpcklbw m1, m7 movq [spill], m6 punpcklbw m2, m7 punpcklbw m3, m7 - movd m6, [eax+%1+6*FENC_STRIDE] + movd m6, [r0+%1+6*FENC_STRIDE] punpcklbw m4, m7 punpcklbw m5, m7 punpcklbw m6, m7 movq m7, [spill] %endmacro +%macro HSUMSUB2 4 + pshufw m4, %1, %3 + pshufw m5, %2, %3 + pmullw %1, %4 + pmullw m5, %4 + paddw %1, m4 + paddw %2, m5 +%endmacro + ;----------------------------------------------------------------------------- -; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res ) +; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- -cglobal intra_sa8d_x3_8x8_core - mov eax, [esp+4] - mov ecx, [esp+8] - sub esp, 0x70 -%define args esp+0x74 +cglobal intra_sa8d_x3_8x8, 2,3 + SUB esp, 0x94 +%define edge esp+0x70 ; +32 %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 %define sum esp+0 ; +32 + + pxor m7, m7 + movq m0, [r1+7] + movq m2, [r1+16] + movq m1, m0 + movq m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + movq m6, [pw_ppmmppmm] + HSUMSUB2 m0, m2, q1032, m6 + HSUMSUB2 m1, m3, q1032, m6 + movq m6, [pw_pmpmpmpm] + HSUMSUB2 m0, m2, q2301, m6 + HSUMSUB2 m1, m3, q2301, m6 + movq m4, m0 + movq m5, m2 + paddw m0, m1 + paddw m2, m3 + psubw m4, m1 + psubw m3, m5 + movq [edge+0], m0 + movq [edge+8], m4 + movq [edge+16], m2 + movq [edge+24], m3 + LOAD_4x8P 0 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 @@ -231,7 +268,7 @@ cglobal intra_sa8d_x3_8x8_core ABSW m1, m1, m4 paddw m2, m1 ; 7x4 sum movq m7, m0 - movq m1, [ecx+8] ; left bottom + movq m1, [edge+8] ; left bottom psllw m1, 3 psubw m7, m1 ABSW2 
m0, m7, m0, m7, m5, m3 @@ -276,14 +313,14 @@ cglobal intra_sa8d_x3_8x8_core paddw m2, m1 ; 7x4 sum movq m1, m0 - movq m7, [ecx+0] + movq m7, [edge+0] psllw m7, 3 ; left top - movzx edx, word [ecx+0] - add dx, [ecx+16] - lea edx, [4*edx+32] - and edx, -64 - movd m6, edx ; dc + mov r2, [edge+0] + add r2, [edge+16] + lea r2, [4*r2+32] + and r2, 0xffc0 + movd m6, r2 ; dc psubw m1, m7 psubw m0, m6 @@ -297,8 +334,8 @@ cglobal intra_sa8d_x3_8x8_core psrlq m2, 16 paddw m2, m3 - movq m3, [ecx+16] ; top left - movq m4, [ecx+24] ; top right + movq m3, [edge+16] ; top left + movq m4, [edge+24] ; top right psllw m3, 3 psllw m4, 3 psubw m3, [sum+16] @@ -307,24 +344,17 @@ cglobal intra_sa8d_x3_8x8_core paddw m2, m3 paddw m2, m4 ; v - SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd - mov eax, [args+8] - movd ecx, m2 - movd edx, m1 - add ecx, 2 - add edx, 2 - shr ecx, 2 - shr edx, 2 - mov [eax+0], ecx ; i8x8_v satd - mov [eax+4], edx ; i8x8_h satd - movd ecx, m0 - add ecx, 2 - shr ecx, 2 - mov [eax+8], ecx ; i8x8_dc satd - - add esp, 0x70 - ret -%undef args + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw + mov r2, r2m + pxor m7, m7 + punpckldq m2, m1 + pavgw m0, m7 + pavgw m2, m7 + movd [r2+8], m0 ; dc + movq [r2+0], m2 ; v, h + ADD esp, 0x94 + RET +%undef edge %undef spill %undef trans %undef sum @@ -335,25 +365,23 @@ cglobal intra_sa8d_x3_8x8_core ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1, ; const uint8_t *pix2, int stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- -cglobal pixel_ssim_4x4x2_core - push ebx - push edi - mov ebx, [esp+16] - mov edx, [esp+24] - mov edi, 4 +cglobal pixel_ssim_4x4x2_core, 0,5 + mov r1, r1m + mov r3, r3m + mov r4, 4 pxor m0, m0 .loop: - mov eax, [esp+12] - mov ecx, [esp+20] - add eax, edi - add ecx, edi + mov r0, r0m + mov r2, r2m + add r0, r4 + add r2, r4 pxor m1, m1 pxor m2, m2 pxor m3, m3 pxor m4, m4 %rep 4 - movd m5, [eax] - movd m6, [ecx] + movd m5, [r0] + movd m6, [r2] punpcklbw m5, m0 punpcklbw m6, m0 paddw m1, m5 @@ -365,11 +393,11 @@ cglobal pixel_ssim_4x4x2_core paddd m3, m5 paddd m4, m7 paddd m3, m6 - add eax, ebx - add ecx, edx + add r0, r1 + add r2, r3 %endrep - mov eax, [esp+28] - lea eax, [eax+edi*4] + mov r0, r4m + lea r0, [r0+r4*4] pshufw m5, m1, q0032 pshufw m6, m2, q0032 paddusw m1, m5 @@ -383,12 +411,10 @@ cglobal pixel_ssim_4x4x2_core paddd m4, m6 punpcklwd m1, m0 punpckldq m3, m4 - movq [eax+0], m1 - movq [eax+8], m3 - sub edi, 4 + movq [r0+0], m1 + movq [r0+8], m3 + sub r4, 4 jge .loop - pop edi - pop ebx emms - ret + RET diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 0eeac189..61c8c230 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -54,16 +54,22 @@ hmul_8p: times 8 db 1 times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 +pb_pppm: times 4 db 1,1,1,-1 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0 +sw_f0: dq 0xfff0, 0 pd_f0: times 4 dd 0xffff0000 sq_0f: times 1 dq 0xffffffff SECTION .text cextern pw_1 +cextern pw_8 cextern pw_00ff - +cextern pw_ppppmmmm +cextern pw_ppmmppmm +cextern pw_pmpmpmpm cextern hsub_mul ;============================================================================= @@ -1525,12 +1531,21 @@ cglobal pixel_sa8d_16x16, 4,7 ; INTRA SATD ;============================================================================= +%macro HSUMSUB2 8 + pshufd %4, %2, %7 + pshufd %5, %3, %7 + %1 %2, %8 + %1 %6, %8 + paddw %2, %4 + paddw %3, %5 
+%endmacro + %macro INTRA_SA8D_SSE2 0 %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res ) +; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- -cglobal intra_sa8d_x3_8x8_core, 3,3,16 +cglobal intra_sa8d_x3_8x8, 3,3,16 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] @@ -1550,39 +1565,57 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16 punpcklbw m6, m8 punpcklbw m7, m8 - HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 + HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 - ; dc - movzx r0d, word [r1+0] - add r0w, word [r1+16] - add r0d, 8 - and r0d, -16 - shl r0d, 2 - - pxor m15, m15 - movdqa m8, m2 - movdqa m9, m3 - movdqa m10, m4 - movdqa m11, m5 - ABSW2 m8, m9, m8, m9, m12, m13 - ABSW2 m10, m11, m10, m11, m12, m13 + ABSW2 m8, m9, m2, m3, m2, m3 + ABSW2 m10, m11, m4, m5, m4, m5 paddusw m8, m10 paddusw m9, m11 - ABSW2 m10, m11, m6, m7, m6, m7 + ABSW2 m10, m11, m6, m7, m6, m7 ABSW m15, m1, m1 paddusw m10, m11 paddusw m8, m9 paddusw m15, m10 paddusw m15, m8 - movdqa m8, [r1+0] ; left edge - movd m9, r0d - psllw m8, 3 + ; 1D hadamard of edges + movq m8, [r1+7] + movq m9, [r1+16] +%if cpuflag(ssse3) + punpcklwd m8, m8 + pshufb m9, [intrax3_shuf] + pmaddubsw m8, [pb_pppm] + pmaddubsw m9, [pb_pppm] + HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm] + HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm] +%else ; sse2 + pxor m10, m10 + punpcklbw m8, m10 + punpcklbw m9, m10 + HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm] + HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm] + pshuflw m10, m8, q2301 + pshuflw m11, m9, q2301 + pshufhw m10, m10, q2301 + pshufhw m11, m11, q2301 + pmullw m8, [pw_pmpmpmpm] + pmullw m11, [pw_pmpmpmpm] + paddw m8, m10 + paddw m9, m11 +%endif + + ; differences + paddw m10, m8, m9 + paddw m10, [pw_8] + pand m10, [sw_f0] + psllw m10, 2 ; dc + + psllw m8, 3 ; left edge psubw m8, m0 - psubw m9, m0 - ABSW2 m8, m9, m8, m9, m10, m11 ; 1x8 sum - paddusw m14, m15, m8 - paddusw m15, m9 + psubw m10, m0 + ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum + paddusw m14, m8, m15 + paddusw m15, m10 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 @@ -1590,11 +1623,10 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16 punpckldq m0, m2 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose - movdqa m1, [r1+16] ; top edge - psllw m1, 3 - psrldq m2, m15, 2 ; 8x7 sum - psubw m0, m1 ; 8x1 sum - ABSW m0, m0, m1 + psllw m9, 3 ; top edge + psrldq m2, m15, 2 ; 8x7 sum + psubw m0, m9 ; 8x1 sum + ABSW m0, m0, m9 paddusw m2, m0 ; 3x HADDW @@ -2424,8 +2456,8 @@ SA8D INIT_XMM sse2 SA8D SATDS_SSE2 -INTRA_SA8D_SSE2 %ifndef HIGH_BIT_DEPTH +INTRA_SA8D_SSE2 INIT_MMX mmx2 INTRA_SATDS_MMX %endif @@ -2446,9 +2478,11 @@ HADAMARD_AC_SSE2 %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps +%ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 INIT_MMX ssse3 INTRA_SATDS_MMX +%endif %define TRANS TRANS_SSE4 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN @@ -2460,7 +2494,9 @@ HADAMARD_AC_SSE2 INIT_XMM avx SATDS_SSE2 SA8D +%ifndef HIGH_BIT_DEPTH INTRA_SA8D_SSE2 +%endif HADAMARD_AC_SSE2 ;============================================================================= diff --git a/common/x86/pixel.h b/common/x86/pixel.h index c2cc208f..2de815df 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -113,10 +113,6 @@ void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); 
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * ); -void x264_intra_sa8d_x3_8x8_core_mmx2 ( uint8_t *, int16_t [2][8], int * ); -void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); -void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * ); -void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index e37511cb..59cce04b 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -322,8 +322,8 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src ) #endif } #endif -#if !HIGH_BIT_DEPTH -#if ARCH_X86_64 + +#if ARCH_X86_64 && !HIGH_BIT_DEPTH static void x264_predict_8x8c_dc_left( uint8_t *src ) { int y; @@ -350,63 +350,7 @@ static void x264_predict_8x8c_dc_left( uint8_t *src ) } } -#endif - -#define PL(y) \ - UNUSED int l##y = edge[14-y]; -#define PT(x) \ - UNUSED int t##x = edge[16+x]; -#define PREDICT_8x8_LOAD_LEFT \ - PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7) -#define PREDICT_8x8_LOAD_TOP \ - PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7) - -#define SUMSUB(a,b,c,d,e,f,g,h)\ - t=a; a+=b; b-=t;\ - t=c; c+=d; d-=t;\ - t=e; e+=f; f-=t;\ - t=g; g+=h; h-=t; - -#define INTRA_SA8D_X3(cpu)\ -void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[36], int res[3] )\ -{\ - PREDICT_8x8_LOAD_TOP\ - PREDICT_8x8_LOAD_LEFT\ - int t;\ - ALIGNED_16( int16_t sa8d_1d[2][8] );\ - SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\ - SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\ - SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\ - sa8d_1d[0][0] = l0;\ - sa8d_1d[0][1] = l1;\ - sa8d_1d[0][2] = l2;\ - sa8d_1d[0][3] = l3;\ - sa8d_1d[0][4] = l4;\ - sa8d_1d[0][5] = l5;\ - sa8d_1d[0][6] = l6;\ - sa8d_1d[0][7] = l7;\ - SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\ - SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\ - SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\ - sa8d_1d[1][0] = t0;\ - sa8d_1d[1][1] = t1;\ - sa8d_1d[1][2] = t2;\ - sa8d_1d[1][3] = t3;\ - sa8d_1d[1][4] = t4;\ - sa8d_1d[1][5] = t5;\ - sa8d_1d[1][6] = t6;\ - sa8d_1d[1][7] = t7;\ - x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\ -} - -#if ARCH_X86_64 -INTRA_SA8D_X3(sse2) -INTRA_SA8D_X3(ssse3) -INTRA_SA8D_X3(avx) -#else -INTRA_SA8D_X3(mmx2) -#endif -#endif // !HIGH_BIT_DEPTH +#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH /**************************************************************************** * Exported functions: -- 2.40.0
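For context on what the new asm folds in: the INTRA_SA8D_X3 wrapper deleted from predict-c.c above computed a 1D 8-point Hadamard over the left and top prediction edges in C, then handed the coefficients to the old _core asm. A minimal C sketch of that reference logic, kept here for illustration (the function name is hypothetical; the edge[36] layout, edge[14-y] for the left column and edge[16+x] for the top row, is the one the deleted PL/PT macros assumed):

    #include <stdint.h>

    static void intra_sa8d_edge_hadamard( const uint8_t edge[36], int16_t sa8d_1d[2][8] )
    {
        for( int pass = 0; pass < 2; pass++ )
        {
            int16_t v[8];
            for( int i = 0; i < 8; i++ )
                v[i] = pass ? edge[16+i] : edge[14-i]; /* top row : left column */
            /* three butterfly stages = 8-point Hadamard; same pairing and
             * signs as the deleted SUMSUB macro (a' = a+b, b' = b-a) */
            for( int stride = 4; stride; stride >>= 1 )
                for( int i = 0; i < 8; i++ )
                    if( !(i & stride) )
                    {
                        int16_t t = v[i];
                        v[i] = t + v[i+stride];
                        v[i+stride] = v[i+stride] - t;
                    }
            for( int i = 0; i < 8; i++ )
                sa8d_1d[pass][i] = v[i];
        }
    }

The DC cost term is then (4*(sa8d_1d[0][0] + sa8d_1d[1][0]) + 32) & -64, which is what the old lea/and sequence computed and what the new pw_8 / sw_f0 / psllw 2 sequence computes inside a vector register.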
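The HSUMSUB2 macros added in pixel-32.asm and pixel-a.asm vectorize one such butterfly stage by adding a shuffled copy of the register to a sign-flipped original. A sketch of the q1032 stage in SSE2 intrinsics (the helper name is hypothetical; the ±1 constant corresponds to pw_ppppmmmm added to const-a.asm):

    #include <emmintrin.h>

    /* one HSUMSUB2 stage: lanes 0-3 become sums of the two 4-word halves,
     * lanes 4-7 become differences, i.e. x*(+1,+1,+1,+1,-1,-1,-1,-1) + swap64(x) */
    static __m128i hsumsub2_q1032( __m128i x )
    {
        const __m128i pw_ppppmmmm = _mm_setr_epi16( 1, 1, 1, 1, -1, -1, -1, -1 );
        __m128i swapped = _mm_shuffle_epi32( x, _MM_SHUFFLE(1,0,3,2) ); /* q1032 */
        return _mm_add_epi16( _mm_mullo_epi16( x, pw_ppppmmmm ), swapped );
    }

The macro's first parameter selects the multiply: pmullw on SSE2, psignw on SSSE3, since multiplying by ±1 with no zero lanes is just a conditional negate.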