From 97ad171ae33c51f48e6214abdf7c978e4dd5d2d1 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 16 Apr 2013 23:27:22 +0200 Subject: [PATCH] x86: AVX2 predict_8x8c_p/predict_8x16c_p --- common/x86/predict-a.asm | 95 +++++++++++++++++++++++----------------- common/x86/predict-c.c | 24 ++++++---- common/x86/predict.h | 6 ++- 3 files changed, 74 insertions(+), 51 deletions(-) diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 3265e28e..2337e893 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -32,9 +32,9 @@ SECTION_RODATA 32 pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_m7: times 8 dw -7 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -1122,17 +1122,12 @@ PREDICT_CHROMA_P_MMX 8 PREDICT_CHROMA_P_MMX 16 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH -%macro PREDICT_CHROMA_P_XMM 1 +%macro PREDICT_CHROMA_P 1 %if HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2,7 - movd m0, r1m - movd m2, r2m - movd m4, r3m + LOAD_PLANE_ARGS mova m3, [pw_pixel_max] pxor m1, m1 - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 pmullw m2, [pw_43210123] ; b %if %1 == 16 pmullw m5, m4, [pw_m7] ; c @@ -1140,59 +1135,77 @@ cglobal predict_8x%1c_p_core, 1,2,7 pmullw m5, m4, [pw_m3] %endif paddw m5, [pw_16] - mov r1d, %1 +%if mmsize == 32 + mova xm6, xm4 + paddw m4, m4 + paddw m5, m6 +%endif + mov r1d, %1/(mmsize/16) .loop: paddsw m6, m2, m5 paddsw m6, m0 psraw m6, 5 CLIPW m6, m1, m3 - mova [r0], m6 paddw m5, m4 +%if mmsize == 32 + vextracti128 [r0], m6, 1 + mova [r0+FDEC_STRIDEB], xm6 + add r0, 2*FDEC_STRIDEB +%else + mova [r0], m6 add r0, FDEC_STRIDEB +%endif dec r1d jg .loop RET %else ; !HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2 - movd m0, r1m - movd m2, r2m - movd m4, r3m - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 + LOAD_PLANE_ARGS +%if mmsize == 32 + vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + pmullw m2, m1 + mova xm1, xm4 ; zero upper half + paddsw m4, m4 + paddsw m0, m1 +%else pmullw m2, [pw_0to15] - paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} - paddsw m3, m0, m4 +%endif + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} + paddsw m1, m0, m4 paddsw m4, m4 - mov r1d, %1/4 + mov r1d, %1/(mmsize/8) .loop: - paddsw m1, m3, m4 - paddsw m5, m0, m4 - psraw m3, 5 - psraw m0, 5 - packuswb m0, m3 - movq [r0+FDEC_STRIDE*0], m0 - movhps [r0+FDEC_STRIDE*1], m0 - paddsw m0, m5, m4 - paddsw m3, m1, m4 - psraw m5, 5 - psraw m1, 5 - packuswb m5, m1 - movq [r0+FDEC_STRIDE*2], m5 - movhps [r0+FDEC_STRIDE*3], m5 - add r0, FDEC_STRIDE*4 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 +%if mmsize == 32 + movq [r0+FDEC_STRIDE*1], xm2 + movhps [r0+FDEC_STRIDE*3], xm2 + vextracti128 xm2, m2, 1 + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*2], xm2 +%else + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*1], xm2 +%endif + add r0, FDEC_STRIDE*mmsize/8 dec r1d jg .loop RET %endif ; HIGH_BIT_DEPTH -%endmacro ; PREDICT_CHROMA_P_XMM +%endmacro ; PREDICT_CHROMA_P INIT_XMM sse2 -PREDICT_CHROMA_P_XMM 8 -PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 INIT_XMM avx -PREDICT_CHROMA_P_XMM 8 -PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 +INIT_YMM avx2 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 2319ebfe..7d1e0622 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -115,9 +115,6 @@ static void x264_predict_8x16c_p_##name( uint16_t *src )\ PREDICT_8x16C_P_CORE \ x264_predict_8x16c_p_core_##name( src, a, b, c );\ } - -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) #else #define PREDICT_8x16_P(name)\ static void x264_predict_8x16c_p_##name( uint8_t *src )\ @@ -129,9 +126,10 @@ static void x264_predict_8x16c_p_##name( uint8_t *src )\ #ifndef ARCH_X86_64 PREDICT_8x16_P(mmx2) #endif +#endif PREDICT_8x16_P(sse2) PREDICT_8x16_P(avx) -#endif +PREDICT_8x16_P(avx2) #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH @@ -225,9 +223,9 @@ static void x264_predict_8x8c_p_##name( uint8_t *src )\ x264_predict_8x8c_p_core_##name( src, i00, b, c );\ } #ifndef ARCH_X86_64 -PREDICT_8x8_P( mmx2 ) +PREDICT_8x8_P(mmx2) #endif -PREDICT_8x8_P( sse2 ) +PREDICT_8x8_P(sse2) #endif //!HIGH_BIT_DEPTH @@ -264,7 +262,6 @@ static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ } PREDICT_8x8_P2(sse2, sse2) -PREDICT_8x8_P2( avx, avx) #else //!HIGH_BIT_DEPTH #define PREDICT_8x8_P2(cpu1, cpu2)\ @@ -289,8 +286,9 @@ static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ } PREDICT_8x8_P2(ssse3, sse2) -PREDICT_8x8_P2( avx, avx) #endif +PREDICT_8x8_P2( avx, avx) +PREDICT_8x8_P2( avx2, avx2) #endif #if ARCH_X86_64 && !HIGH_BIT_DEPTH @@ -439,6 +437,11 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx; #endif #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx2; + } } void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -485,6 +488,11 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx2; + } } void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) diff --git a/common/x86/predict.h b/common/x86/predict.h index c1964972..7691c095 100644 --- a/common/x86/predict.h +++ b/common/x86/predict.h @@ -65,10 +65,12 @@ void x264_predict_8x16c_h_ssse3( uint8_t *src ); void x264_predict_8x16c_h_avx2( uint16_t *src ); void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_dc_mmx2( pixel *src ); void x264_predict_8x8c_dc_sse2( uint16_t *src ); void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); -- 2.40.0