From: Henrik Gramner
Date: Fri, 5 Jul 2013 19:15:49 +0000 (+0200)
Subject: x86: Faster AVX2 pixel_sad_x3 and pixel_sad_x4
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4becc3e9e031c4207698846369aac2bef1480d15;p=libx264

x86: Faster AVX2 pixel_sad_x3 and pixel_sad_x4
---

diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index a287c780..45f4a106 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -32,7 +32,6 @@ SECTION_RODATA 32
 
 pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-deinterleave_sadx4: dd 0,4,2,6
 hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
 
 SECTION .text
@@ -1387,12 +1386,12 @@ SAD_X 4, 4, 4
     vbroadcasti128 m4, [r0]
     vbroadcasti128 m5, [r0+FENC_STRIDE]
     movu          xm0, [r1]
-    movu          xm1, [r3]
+    movu          xm1, [r2]
     movu          xm2, [r1+r5]
-    movu          xm3, [r3+r5]
-    vinserti128    m0, m0, [r2], 1
+    movu          xm3, [r2+r5]
+    vinserti128    m0, m0, [r3], 1
     vinserti128    m1, m1, [r4], 1
-    vinserti128    m2, m2, [r2+r5], 1
+    vinserti128    m2, m2, [r3+r5], 1
     vinserti128    m3, m3, [r4+r5], 1
     psadbw         m0, m4
     psadbw         m1, m4
@@ -1406,12 +1405,12 @@ SAD_X 4, 4, 4
     vbroadcasti128 m6, [r0+%1]
     vbroadcasti128 m7, [r0+%3]
     movu          xm2, [r1+%2]
-    movu          xm3, [r3+%2]
+    movu          xm3, [r2+%2]
     movu          xm4, [r1+%4]
-    movu          xm5, [r3+%4]
-    vinserti128    m2, m2, [r2+%2], 1
+    movu          xm5, [r2+%4]
+    vinserti128    m2, m2, [r3+%2], 1
     vinserti128    m3, m3, [r4+%2], 1
-    vinserti128    m4, m4, [r2+%4], 1
+    vinserti128    m4, m4, [r3+%4], 1
     vinserti128    m5, m5, [r4+%4], 1
     psadbw         m2, m6
     psadbw         m3, m6
@@ -1443,35 +1442,22 @@ SAD_X 4, 4, 4
 %endmacro
 
 %macro SAD_X3_END_AVX2 0
-    vextracti128 xm4, m0, 1
-    vextracti128 xm5, m1, 1
-    vextracti128 xm6, m2, 1
-    paddw        xm0, xm4
-    paddw        xm1, xm5
-    paddw        xm2, xm6
-    movhlps      xm4, xm0
-    movhlps      xm5, xm1
-    movhlps      xm6, xm2
-    paddw        xm0, xm4
-    paddw        xm1, xm5
-    paddw        xm2, xm6
     movifnidn     r5, r5mp
-    movd      [r5+0], xm0
-    movd      [r5+4], xm1
-    movd      [r5+8], xm2
+    packssdw      m0, m1 ; 0 0 1 1 0 0 1 1
+    packssdw      m2, m2 ; 2 2 _ _ 2 2 _ _
+    phaddd        m0, m2 ; 0 1 2 _ 0 1 2 _
+    vextracti128 xm1, m0, 1
+    paddd        xm0, xm1 ; 0 1 2 _
+    mova        [r5], xm0
     RET
 %endmacro
 
 %macro SAD_X4_END_AVX2 0
-    mov           r0, r6mp
-    punpckhqdq    m2, m0, m0
-    punpckhqdq    m3, m1, m1
-    paddw         m0, m2
-    paddw         m1, m3
-    packssdw      m0, m1
-    mova         xm2, [deinterleave_sadx4]
-    vpermd        m0, m2, m0
-    mova        [r0], xm0
+    mov           r0, r6mp
+    packssdw      m0, m1 ; 0 0 1 1 2 2 3 3
+    vextracti128 xm1, m0, 1
+    phaddd       xm0, xm1 ; 0 1 2 3
+    mova        [r0], xm0
     RET
 %endmacro

diff --git a/encoder/me.c b/encoder/me.c
index 0a3fec76..972af474 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -365,14 +365,14 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
         /* hexagon */
         COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );
-        COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+3 );
+        COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
         bcost <<= 3;
         COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
         COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
         COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
-        COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
-        COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
-        COPY1_IF_LT( bcost, (costs[5]<<3)+7 );
+        COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
+        COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
+        COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
         if( bcost&7 )
         {
@@ -671,7 +671,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
                 for( i = 0; i < xn-2; i += 3 )
                 {
                     pixel *ref = p_fref_w+min_x+my*stride;
-                    int sads[3];
+                    ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
                     h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
                     for( int j = 0; j < 3; j++ )
                     {
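
The speedup comes from the rewritten reductions above, so it may help to restate
what they compute. The following standalone C sketch uses AVX2 intrinsics instead
of yasm; the function names and the accumulator layout are inferred from the asm
comments above, not x264 API. psadbw leaves each 128-bit lane holding two 64-bit
partial sums ({a, 0, b, 0} viewed as dwords), and for these block sizes every
partial fits in 16 bits (at most 8*255*16 = 32640), so the signed-saturating
packssdw is lossless.

    #include <immintrin.h>

    /* Sketch of the new SAD_X3_END_AVX2: sum0/sum1/sum2 hold the psadbw
     * partials for refs 0/1/2, and both 128-bit lanes of each register
     * contribute to the same score. */
    static void sad_x3_end( __m256i sum0, __m256i sum1, __m256i sum2,
                            int *scores ) /* 16-byte aligned, 4 ints */
    {
        __m256i p01 = _mm256_packs_epi32( sum0, sum1 ); /* 0 0 1 1 | 0 0 1 1 */
        __m256i p22 = _mm256_packs_epi32( sum2, sum2 ); /* 2 2 _ _ | 2 2 _ _ */
        __m256i h   = _mm256_hadd_epi32( p01, p22 );    /* 0 1 2 _ | 0 1 2 _ */
        __m128i res = _mm_add_epi32( _mm256_castsi256_si128( h ),
                                     _mm256_extracti128_si256( h, 1 ) );
        _mm_store_si128( (__m128i *)scores, res );      /* scores[3] is junk */
    }

    /* Sketch of the new SAD_X4_END_AVX2: the reordered loads above put
     * refs 0/2 in the lanes of sum0 and refs 1/3 in the lanes of sum1,
     * so one pack plus one phaddd deinterleaves everything and the
     * deinterleave_sadx4 vpermd constant can be dropped. */
    static void sad_x4_end( __m256i sum0, __m256i sum1, int *scores )
    {
        __m256i p   = _mm256_packs_epi32( sum0, sum1 ); /* 0 0 1 1 | 2 2 3 3 */
        __m128i res = _mm_hadd_epi32( _mm256_castsi256_si128( p ),
                                      _mm256_extracti128_si256( p, 1 ) ); /* 0 1 2 3 */
        _mm_store_si128( (__m128i *)scores, res );
    }

Both paths now end in a single aligned 16-byte store, which is why sad_x3
callers must pass a 16-byte-aligned, 4-entry scores array; the me.c hunks
above update the two callers accordingly.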
diff --git a/tools/checkasm.c b/tools/checkasm.c
index e731bf64..8cfebeb9 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -406,7 +406,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
         } \
         else \
             call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
-        if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
+        if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
         { \
             ok = 0; \
             fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
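
The checkasm change follows from the new stores: sad_x3 now writes four dwords,
so res_asm[3] receives a junk value and comparing the full sizeof(res_c) would
spuriously fail for N == 3; only N results are meaningful. A hedged sketch of
the caller-side contract this imposes (the wrapper name score_three_refs and
the 8-bit pixel type are illustrative assumptions, not x264 code):

    #include <stdalign.h>
    #include <stdint.h>
    #include <string.h>

    typedef void (*sad_x3_fn)( const uint8_t *fenc, const uint8_t *ref0,
                               const uint8_t *ref1, const uint8_t *ref2,
                               intptr_t stride, int *scores );

    static void score_three_refs( sad_x3_fn sad_x3, const uint8_t *fenc,
                                  const uint8_t *ref, intptr_t stride, int out[3] )
    {
        /* Padded to 4 entries and 16-byte aligned so the asm's single
         * aligned 16-byte store (mova) is legal; entry [3] is garbage. */
        alignas(16) int sads[4];
        sad_x3( fenc, ref, ref + 1, ref + 2, stride, sads );
        memcpy( out, sads, 3 * sizeof(int) ); /* ignore the junk lane */
    }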