From 8ecdeb2709b4b7095237330e68e9a76ea8060a2f Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 16 Apr 2013 23:27:18 +0200 Subject: [PATCH] x86: AVX2 predict_16x16_p Also fix the AVX implementation to correctly use the SSSE3 inline asm instead of SSE2. --- common/x86/const-a.asm | 2 +- common/x86/predict-a.asm | 102 ++++++++++++++++++++++++------- common/x86/predict-c.c | 127 +++++++++++++++++++++------------------ common/x86/predict.h | 1 + common/x86/x86util.asm | 8 ++- 5 files changed, 156 insertions(+), 84 deletions(-) diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index 9382fd4d..b5637647 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -34,6 +34,7 @@ const pw_1, times 16 dw 1 const pw_16, times 16 dw 16 const pw_32, times 16 dw 32 const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) const pd_1, times 8 dd 1 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 @@ -52,7 +53,6 @@ const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff -const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 7038c0aa..3265e28e 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -6,6 +6,7 @@ ;* Authors: Loren Merritt ;* Holger Lubitz ;* Fiona Glaser +;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,10 +29,9 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 +pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 8 dw -3 pw_m7: times 8 dw -7 @@ -1069,17 +1069,21 @@ PREDICT_8x8_VR b %endif %macro LOAD_PLANE_ARGS 0 -%if ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +%if cpuflag(avx2) && ARCH_X86_64 == 0 + vpbroadcastw m0, r1m + vpbroadcastw m2, r2m + vpbroadcastw m4, r3m +%elif mmsize == 8 ; MMX is only used on x86_32 + SPLATW m0, r1m + SPLATW m2, r2m + SPLATW m4, r3m %else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 + movd xm0, r1m + movd xm2, r2m + movd xm4, r3m + SPLATW m0, xm0 + SPLATW m2, xm2 + SPLATW m4, xm4 %endif %endmacro @@ -1091,7 +1095,7 @@ PREDICT_8x8_VR b cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS movq m1, m2 - pmullw m2, [pw_3210] + pmullw m2, [pw_0to15] psllw m1, 2 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} @@ -1156,7 +1160,7 @@ cglobal predict_8x%1c_p_core, 1,2 SPLATW m0, m0, 0 SPLATW m2, m2, 0 SPLATW m4, m4, 0 - pmullw m2, [pw_76543210] + pmullw m2, [pw_0to15] paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} paddsw m3, m0, m4 paddsw m4, m4 @@ -1193,13 +1197,13 @@ PREDICT_CHROMA_P_XMM 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -%if ARCH_X86_64 == 0 +%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 INIT_MMX mmx2 cglobal predict_16x16_p_core, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210] + pmullw mm5, [pw_0to15] 
psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -1233,7 +1237,7 @@ ALIGN 4 dec r1d jg .loop RET -%endif ; !ARCH_X86_64 +%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 %macro PREDICT_16x16_P 0 cglobal predict_16x16_p_core, 1,2,8 @@ -1243,7 +1247,7 @@ cglobal predict_16x16_p_core, 1,2,8 SPLATW m0, m0, 0 SPLATW m1, m1, 0 SPLATW m2, m2, 0 - pmullw m3, m1, [pw_76543210] + pmullw m3, m1, [pw_0to15] psllw m1, 3 %if HIGH_BIT_DEPTH pxor m6, m6 @@ -1264,8 +1268,6 @@ cglobal predict_16x16_p_core, 1,2,8 mova [r0+16], m5 add r0, FDEC_STRIDEB paddw m6, m2 - dec r1d - jg .loop %else ; !HIGH_BIT_DEPTH paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} @@ -1286,9 +1288,9 @@ ALIGN 4 paddsw m0, m7 paddsw m1, m7 add r0, FDEC_STRIDE*2 - dec r1d - jg .loop %endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop RET %endmacro ; PREDICT_16x16_P @@ -1299,6 +1301,60 @@ INIT_XMM avx PREDICT_16x16_P %endif +INIT_YMM avx2 +cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH + LOAD_PLANE_ARGS +%if HIGH_BIT_DEPTH + pmullw m2, [pw_0to15] + pxor m5, m5 + pxor m6, m6 + mova m7, [pw_pixel_max] + mov r1d, 8 +.loop: + paddsw m1, m2, m5 + paddw m5, m4 + paddsw m1, m0 + paddsw m3, m2, m5 + psraw m1, 5 + paddsw m3, m0 + psraw m3, 5 + CLIPW m1, m6, m7 + mova [r0+0*FDEC_STRIDEB], m1 + CLIPW m3, m6, m7 + mova [r0+1*FDEC_STRIDEB], m3 + paddw m5, m4 + add r0, 2*FDEC_STRIDEB +%else ; !HIGH_BIT_DEPTH + vbroadcasti128 m1, [pw_0to15] + mova xm3, xm4 ; zero high bits + pmullw m1, m2 + psllw m2, 3 + paddsw m0, m3 + paddsw m0, m1 ; X+1*C X+0*C + paddsw m1, m0, m2 ; Y+1*C Y+0*C + paddsw m4, m4 + mov r1d, 4 +.loop: + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C + vextracti128 [r0+0*FDEC_STRIDE], m2, 1 + mova [r0+1*FDEC_STRIDE], xm2 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C + vextracti128 [r0+2*FDEC_STRIDE], m2, 1 + mova [r0+3*FDEC_STRIDE], xm2 + add r0, FDEC_STRIDE*4 +%endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop + RET + %if HIGH_BIT_DEPTH == 0 %macro PREDICT_8x8 0 ;----------------------------------------------------------------------------- diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 75a74ab6..2319ebfe 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -93,10 +93,9 @@ static void x264_predict_16x16_p_##name( pixel *src )\ x264_predict_16x16_p_core_##name( src, i00, b, c );\ } #ifndef ARCH_X86_64 -PREDICT_16x16_P( mmx2 ) +PREDICT_16x16_P(mmx2) #endif -PREDICT_16x16_P( sse2 ) -PREDICT_16x16_P( avx ) +PREDICT_16x16_P(sse2) #endif //!HIGH_BIT_DEPTH #define PREDICT_8x16C_P_CORE \ @@ -136,67 +135,74 @@ PREDICT_8x16_P(avx) #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH -static void x264_predict_16x16_p_sse2( uint16_t *src ) -#else -static void x264_predict_16x16_p_ssse3( uint8_t *src ) -#endif -{ - int a, b, c, i00; - int H, V; -#if HIGH_BIT_DEPTH - asm ( - "movdqu %1, %%xmm1 \n" - "movdqa %2, %%xmm0 \n" - "pmaddwd %3, %%xmm0 \n" - "pmaddwd %4, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "pshuflw $14, %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movd %%xmm0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]), - "m"(*pw_12345678), "m"(*pw_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movdqu %1, %%xmm1 \n"\ + "movdqa %2, %%xmm0 \n"\ + "pmaddwd %3, %%xmm0 \n"\ + "pmaddwd %4, %%xmm1 \n"\ + "paddd 
%%xmm1, %%xmm0 \n"\ + "movhlps %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "pshuflw $14, %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(*pw_12345678), "m"(*pw_m87654321)\ ); #else - asm ( - "movq %1, %%mm1 \n" - "movq %2, %%mm0 \n" - "palignr $7, %3, %%mm1 \n" - "pmaddubsw %4, %%mm0 \n" - "pmaddubsw %5, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $14, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $1, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "movd %%mm0, %0 \n" - "movswl %w0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]), - "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movq %1, %%mm1 \n"\ + "movq %2, %%mm0 \n"\ + "palignr $7, %3, %%mm1 \n"\ + "pmaddubsw %4, %%mm0 \n"\ + "pmaddubsw %5, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $14, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $1, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "movd %%mm0, %0 \n"\ + "movswl %w0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\ ); #endif - V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) - + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) - + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) - + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) - + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) - + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) - + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) - + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] ); - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] ); - b = ( 5 * H + 32 ) >> 6; - c = ( 5 * V + 32 ) >> 6; - i00 = a - b * 7 - c * 7 + 16; +#define PREDICT_16x16_P_INLINE(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ +{\ + int a, b, c, i00;\ + int H, V;\ + PREDICT_16x16_P_ASM\ + V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\ + + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\ + + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\ + + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\ + + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\ + + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\ + + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\ + + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );\ + a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ + b = ( 5 * H + 32 ) >> 6;\ + c = ( 5 * V + 32 ) >> 6;\ + i00 = a - b * 7 - c * 7 + 16;\ /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case - * than to try to consider it in the asm. */ - if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) ) - x264_predict_16x16_p_c( src ); - else - x264_predict_16x16_p_core_sse2( src, i00, b, c ); + * than to try to consider it in the asm. 
*/\ + if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ + x264_predict_16x16_p_c( src );\ + else\ + x264_predict_16x16_p_core_##name2( src, i00, b, c );\ } +#if HIGH_BIT_DEPTH +PREDICT_16x16_P_INLINE( sse2, sse2 ) +#else +PREDICT_16x16_P_INLINE( ssse3, sse2 ) +PREDICT_16x16_P_INLINE( avx, avx ) +#endif +PREDICT_16x16_P_INLINE( avx2, avx2 ) #endif #if !HIGH_BIT_DEPTH @@ -373,6 +379,11 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] ) return; pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx; #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2; + } } void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) diff --git a/common/x86/predict.h b/common/x86/predict.h index b6f8b42e..c1964972 100644 --- a/common/x86/predict.h +++ b/common/x86/predict.h @@ -52,6 +52,7 @@ void x264_predict_16x16_dc_top_ssse3( uint16_t *src ); void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x16c_dc_mmx2( pixel *src ); void x264_predict_8x16c_dc_sse2( uint16_t *src ); void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index e3c91837..a77016cf 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -267,9 +267,13 @@ %endmacro %imacro SPLATW 2-3 0 - PSHUFLW %1, %2, (%3)*q1111 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 - punpcklqdq %1, %1 + punpcklqdq %1, %1 +%endif %endif %endmacro -- 2.40.0
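
Review note (not part of the patch): all of the predict_16x16_p_core kernels touched here (MMX2/SSE2/AVX/AVX2) evaluate the same H.264 plane prediction once the inline-asm wrapper has derived i00, b and c from H and V. For reference while reading the asm, here is a minimal plain-C sketch of that inner loop for the 8-bit case. The names predict_16x16_p_core_ref, STRIDE and clip_u8 are placeholders invented for illustration; x264's actual C reference is x264_predict_16x16_p_c in common/predict.c, using FDEC_STRIDE (32) and x264_clip_pixel.

/* Illustrative sketch only: models what the 8-bit predict_16x16_p_core
 * kernels compute.  STRIDE stands in for x264's FDEC_STRIDE (32). */
#include <stdint.h>

#define STRIDE 32

static inline uint8_t clip_u8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

static void predict_16x16_p_core_ref( uint8_t *src, int i00, int b, int c )
{
    /* i00 = a - 7*b - 7*c + 16 is the value for pixel (0,0); each step in x
     * adds b, each step in y adds c, then the sum is shifted down by 5 bits
     * and clipped to the pixel range (the asm uses paddsw + packuswb). */
    for( int y = 0; y < 16; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 16; x++ )
        {
            src[x] = clip_u8( pix >> 5 );
            pix += b;
        }
        src += STRIDE;
        i00 += c;
    }
}

The AVX2 path simply processes two rows per 256-bit register (the low and high 128-bit lanes), which is why it writes with mova xm2 plus vextracti128 and advances by FDEC_STRIDE*4 per loop iteration.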