From f114746df6ce6a1bcacf46c62b696cc309ab4527 Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Sun, 28 Apr 2013 11:11:03 +0200
Subject: [PATCH] x86: AVX2 high bit-depth intra_sad_x3_8x8

43->24 cycles
---
 common/pixel.c         |  1 +
 common/x86/pixel.h     |  1 +
 common/x86/sad16-a.asm | 50 ++++++++++++++++++++++++++++++++++++++++++
 encoder/analyse.c      |  2 +-
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/common/pixel.c b/common/pixel.c
index f5db938b..5c8c974c 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1000,6 +1000,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x4, _avx2 );
         pixf->vsad = x264_pixel_vsad_avx2;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
+        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
     }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index aca996ad..7383f512 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -127,6 +127,7 @@ void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
 void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
 void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
 void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * );
 int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
 int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
 int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 62da7cd9..68fa06ae 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -644,12 +644,21 @@ cglobal intra_sad_x3_8x8, 3,3,8
     INTRA_SAD_HVDC_ITER 5, q2222
     INTRA_SAD_HVDC_ITER 6, q1111
     INTRA_SAD_HVDC_ITER 7, q0000
+%if cpuflag(ssse3)
+    phaddw      m2, m3         ; 2 2 2 2 3 3 3 3
+    movhlps     m3, m1         ; low qword of m3 = high qword of m1
+    paddw       m1, m3         ; 1 1 1 1 _ _ _ _
+    phaddw      m2, m1         ; 2 2 3 3 1 1 _ _
+    pmaddwd     m2, [pw_1]     ; 2 3 1 _
+    mova      [r2], m2         ; single aligned 16-byte store; the 4th dword is not a result
+%else
     HADDW       m2, m4
     HADDW       m3, m4
     HADDW       m1, m4
     movd    [r2+0], m2
     movd    [r2+4], m3
     movd    [r2+8], m1
+%endif
     RET
 %endmacro
 
@@ -657,3 +666,44 @@ INIT_XMM sse2
 INTRA_SAD_X3_8x8
 INIT_XMM ssse3
 INTRA_SAD_X3_8x8
+
+%macro INTRA_SAD_HVDC_ITER_YMM 2 ; %1 = iteration (handles rows %1 and %1+4), %2 = pshufd selector into m7
+    mova           xm4, [r0+(%1-4)*FENC_STRIDEB] ; low lane: row %1 (caller has advanced r0 by 4 rows)
+    vinserti128     m4, m4, [r0+%1*FENC_STRIDEB], 1 ; high lane: row %1+4
+    pshufd          m5, m7, %2 ; per lane, broadcast the dword of m7 picked by %2 (H predictor)
+    psubw           m5, m4 ; H predictor - source
+    pabsw           m5, m5 ; absolute difference
+    ACCUM        paddw, 2, 5, %1 ; H
+    psubw           m5, m4, m6 ; source - V predictor (m6)
+    psubw           m4, m0 ; source - DC predictor (m0)
+    pabsw           m5, m5 ; |source - V|
+    pabsw           m4, m4 ; |source - DC|
+    ACCUM        paddw, 1, 5, %1 ; V
+    ACCUM        paddw, 3, 4, %1 ; DC
+%endmacro
+
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8, 3,3,8 ; r0 = source rows, r1 = edge pixels, r2 = int results (V, H, DC); r2 must be 16-byte aligned
+    add             r0, 4*FENC_STRIDEB ; address rows n and n+4 as [r0+(n-4)*stride] and [r0+n*stride]
+    movu           xm0, [r1+7*SIZEOF_PIXEL] ; edge pixels 7..14, source of the H predictors (presumably the left column - confirm against the edge layout)
+    vbroadcasti128  m6, [r1+16*SIZEOF_PIXEL] ; V prediction
+    vpermq          m7, m0, q0011 ; low lane = upper 4 loaded pixels, high lane = lower 4 loaded pixels
+    paddw          xm0, xm6 ; edge pixels 7..14 + edge pixels 16..23, word-wise
+    paddw          xm0, [pw_1] ; equal to +8 after HADDW
+    HADDW          xm0, xm4 ; horizontal sum of the words (xm4 is scratch)
+    psrld          xm0, 4 ; (sum + 8) >> 4 = DC value
+    vpbroadcastw    m0, xm0 ; DC predictor in every word
+    punpcklwd       m7, m7 ; duplicate each word so that each dword holds one edge pixel twice
+    INTRA_SAD_HVDC_ITER_YMM 0, q3333 ; rows 0 and 4
+    INTRA_SAD_HVDC_ITER_YMM 1, q2222 ; rows 1 and 5
+    INTRA_SAD_HVDC_ITER_YMM 2, q1111 ; rows 2 and 6
+    INTRA_SAD_HVDC_ITER_YMM 3, q0000 ; rows 3 and 7
+    phaddw          m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
+    punpckhqdq      m2, m3, m3 ; m2 = high qword of each lane of m3
+    paddw           m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
+    phaddw          m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
+    vextracti128   xm2, m1, 1 ; high lane (rows 4-7) partial sums
+    paddw          xm1, xm2 ; 1 1 2 2 3 3 _ _
+    pmaddwd        xm1, [pw_1] ; 1 2 3 _
+    mova          [r2], xm1 ; aligned 16-byte store: V, H, DC plus one dword that is not a result
+    RET
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 8ba435ea..01485383 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -888,7 +888,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         {
             if( !h->mb.b_lossless && predict_mode[5] >= 0 )
             {
-                int satd[9];
+                ALIGNED_ARRAY_16( int32_t, satd,[9] );
                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                 satd[i_pred_mode] -= 3 * lambda;
-- 
2.40.0