const pw_16, times 16 dw 16
const pw_32, times 16 dw 32
const pw_00ff, times 16 dw 0x00ff
+const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
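+; widened to 16 words so the AVX2 high bit-depth code can load pw_pixel_max as a full ymm register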
const pd_1, times 8 dd 1
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
times 4 dw 0
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
-const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Fiona Glaser <fiona@x264.com>
+;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
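+; 32-byte alignment so the 256-bit constants below can be used with aligned AVX2 loads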
-pw_76543210:
-pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
+pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pw_m7: times 8 dw -7
%endif
%macro LOAD_PLANE_ARGS 0
-%if ARCH_X86_64
- movd mm0, r1d
- movd mm2, r2d
- movd mm4, r3d
- pshufw mm0, mm0, 0
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
+%if cpuflag(avx2) && ARCH_X86_64 == 0
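+    ; vpbroadcastw can't broadcast from a GPR; on x86_32 the arguments are stack memory
+    ; operands, so they can be splatted directly (other cases fall through to movd+SPLATW below)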
+ vpbroadcastw m0, r1m
+ vpbroadcastw m2, r2m
+ vpbroadcastw m4, r3m
+%elif mmsize == 8 ; MMX is only used on x86_32
+ SPLATW m0, r1m
+ SPLATW m2, r2m
+ SPLATW m4, r3m
%else
- pshufw mm0, r1m, 0
- pshufw mm2, r2m, 0
- pshufw mm4, r3m, 0
+ movd xm0, r1m
+ movd xm2, r2m
+ movd xm4, r3m
+ SPLATW m0, xm0
+ SPLATW m2, xm2
+ SPLATW m4, xm4
%endif
%endmacro
cglobal predict_8x%1c_p_core, 1,2
LOAD_PLANE_ARGS
movq m1, m2
- pmullw m2, [pw_3210]
+ pmullw m2, [pw_0to15]
psllw m1, 2
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
SPLATW m0, m0, 0
SPLATW m2, m2, 0
SPLATW m4, m4, 0
- pmullw m2, [pw_76543210]
+ pmullw m2, [pw_0to15]
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
paddsw m3, m0, m4
paddsw m4, m4
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-%if ARCH_X86_64 == 0
+%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
- pmullw mm5, [pw_3210]
+ pmullw mm5, [pw_0to15]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
dec r1d
jg .loop
RET
-%endif ; !ARCH_X86_64
+%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
SPLATW m0, m0, 0
SPLATW m1, m1, 0
SPLATW m2, m2, 0
- pmullw m3, m1, [pw_76543210]
+ pmullw m3, m1, [pw_0to15]
psllw m1, 3
%if HIGH_BIT_DEPTH
pxor m6, m6
mova [r0+16], m5
add r0, FDEC_STRIDEB
paddw m6, m2
- dec r1d
- jg .loop
%else ; !HIGH_BIT_DEPTH
paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
paddsw m0, m7
paddsw m1, m7
add r0, FDEC_STRIDE*2
- dec r1d
- jg .loop
%endif ; !HIGH_BIT_DEPTH
+ dec r1d
+ jg .loop
RET
%endmacro ; PREDICT_16x16_P
PREDICT_16x16_P
%endif
+INIT_YMM avx2
+cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
+ LOAD_PLANE_ARGS
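+    ; m0 = i00, m2 = b, m4 = c (each broadcast to all words)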
+%if HIGH_BIT_DEPTH
+ pmullw m2, [pw_0to15]
+ pxor m5, m5
+ pxor m6, m6
+ mova m7, [pw_pixel_max]
+ mov r1d, 8
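+    ; 8 iterations x 2 rows: dst[y][x] = clip( (i00 + b*x + c*y) >> 5 );
+    ; m2 = b*{0..15}, m5 accumulates c*y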
+.loop:
+ paddsw m1, m2, m5
+ paddw m5, m4
+ paddsw m1, m0
+ paddsw m3, m2, m5
+ psraw m1, 5
+ paddsw m3, m0
+ psraw m3, 5
+ CLIPW m1, m6, m7
+ mova [r0+0*FDEC_STRIDEB], m1
+ CLIPW m3, m6, m7
+ mova [r0+1*FDEC_STRIDEB], m3
+ paddw m5, m4
+ add r0, 2*FDEC_STRIDEB
+%else ; !HIGH_BIT_DEPTH
+ vbroadcasti128 m1, [pw_0to15]
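+    ; words 0..7 replicated into both 128-bit lanes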
+ mova xm3, xm4 ; zero high bits
+ pmullw m1, m2
+ psllw m2, 3
+ paddsw m0, m3
+ paddsw m0, m1 ; X+1*C X+0*C
+ paddsw m1, m0, m2 ; Y+1*C Y+0*C
+ paddsw m4, m4
+ mov r1d, 4
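+    ; 4 iterations x 4 rows; m4 holds 2*c, so each paddsw with m4 advances two rows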
+.loop:
+ psraw m2, m0, 5
+ psraw m3, m1, 5
+ paddsw m0, m4
+ paddsw m1, m4
+ packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C
+ vextracti128 [r0+0*FDEC_STRIDE], m2, 1
+ mova [r0+1*FDEC_STRIDE], xm2
+ psraw m2, m0, 5
+ psraw m3, m1, 5
+ paddsw m0, m4
+ paddsw m1, m4
+ packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C
+ vextracti128 [r0+2*FDEC_STRIDE], m2, 1
+ mova [r0+3*FDEC_STRIDE], xm2
+ add r0, FDEC_STRIDE*4
+%endif ; !HIGH_BIT_DEPTH
+ dec r1d
+ jg .loop
+ RET
+
%if HIGH_BIT_DEPTH == 0
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
#if !ARCH_X86_64
-PREDICT_16x16_P( mmx2 )
+PREDICT_16x16_P(mmx2)
#endif
-PREDICT_16x16_P( sse2 )
-PREDICT_16x16_P( avx )
+PREDICT_16x16_P(sse2)
#endif //!HIGH_BIT_DEPTH
#define PREDICT_8x16C_P_CORE \
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
-static void x264_predict_16x16_p_sse2( uint16_t *src )
-#else
-static void x264_predict_16x16_p_ssse3( uint8_t *src )
-#endif
-{
- int a, b, c, i00;
- int H, V;
-#if HIGH_BIT_DEPTH
- asm (
- "movdqu %1, %%xmm1 \n"
- "movdqa %2, %%xmm0 \n"
- "pmaddwd %3, %%xmm0 \n"
- "pmaddwd %4, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "pshuflw $14, %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "movd %%xmm0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),
- "m"(*pw_12345678), "m"(*pw_m87654321)
+#define PREDICT_16x16_P_ASM\
+ asm (\
+ "movdqu %1, %%xmm1 \n"\
+ "movdqa %2, %%xmm0 \n"\
+ "pmaddwd %3, %%xmm0 \n"\
+ "pmaddwd %4, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "movhlps %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "pshuflw $14, %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\
+ "m"(*pw_12345678), "m"(*pw_m87654321)\
);
#else
- asm (
- "movq %1, %%mm1 \n"
- "movq %2, %%mm0 \n"
- "palignr $7, %3, %%mm1 \n"
- "pmaddubsw %4, %%mm0 \n"
- "pmaddubsw %5, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "pshufw $14, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "pshufw $1, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "movd %%mm0, %0 \n"
- "movswl %w0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),
- "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)
+#define PREDICT_16x16_P_ASM\
+ asm (\
+ "movq %1, %%mm1 \n"\
+ "movq %2, %%mm0 \n"\
+ "palignr $7, %3, %%mm1 \n"\
+ "pmaddubsw %4, %%mm0 \n"\
+ "pmaddubsw %5, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "pshufw $14, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "pshufw $1, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "movd %%mm0, %0 \n"\
+ "movswl %w0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\
+ "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\
);
#endif
- V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
- + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
- + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
- + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )
- + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )
- + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )
- + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )
- + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
- a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
- b = ( 5 * H + 32 ) >> 6;
- c = ( 5 * V + 32 ) >> 6;
- i00 = a - b * 7 - c * 7 + 16;
+#define PREDICT_16x16_P_INLINE(name, name2)\
+static void x264_predict_16x16_p_##name( pixel *src )\
+{\
+ int a, b, c, i00;\
+ int H, V;\
+ PREDICT_16x16_P_ASM\
+ V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\
+ + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\
+ + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\
+ + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\
+ + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\
+ + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\
+ + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\
+ + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );\
+ a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
+ b = ( 5 * H + 32 ) >> 6;\
+ c = ( 5 * V + 32 ) >> 6;\
+ i00 = a - b * 7 - c * 7 + 16;\
/* b*15 + c*15 can overflow: it's easier to just branch away in this rare case
- * than to try to consider it in the asm. */
- if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )
- x264_predict_16x16_p_c( src );
- else
- x264_predict_16x16_p_core_sse2( src, i00, b, c );
+ * than to try to consider it in the asm. */\
+ if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\
+ x264_predict_16x16_p_c( src );\
+ else\
+ x264_predict_16x16_p_core_##name2( src, i00, b, c );\
}
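+/* The second argument selects the predict_16x16_p_core asm implementation to call;
+ * the 8-bit inline asm needs SSSE3 (pmaddubsw/palignr), so the SSE2 core gets an
+ * _ssse3 wrapper. */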
+#if HIGH_BIT_DEPTH
+PREDICT_16x16_P_INLINE( sse2, sse2 )
+#else
+PREDICT_16x16_P_INLINE( ssse3, sse2 )
+PREDICT_16x16_P_INLINE( avx, avx )
+#endif
+PREDICT_16x16_P_INLINE( avx2, avx2 )
#endif
#if !HIGH_BIT_DEPTH
return;
pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx;
#endif // HIGH_BIT_DEPTH
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2;
+ }
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c );
void x264_predict_8x16c_dc_mmx2( pixel *src );
void x264_predict_8x16c_dc_sse2( uint16_t *src );
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
%endmacro
%imacro SPLATW 2-3 0
- PSHUFLW %1, %2, (%3)*q1111
+%if cpuflag(avx2) && %3 == 0
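+    ; a single vpbroadcastw replaces PSHUFLW+punpcklqdq when splatting word 0 with AVX2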
+ vpbroadcastw %1, %2
+%else
+    PSHUFLW      %1, %2, (%3)*q1111
%if mmsize == 16
-    punpcklqdq %1, %1
+    punpcklqdq   %1, %1
+%endif
%endif
%endmacro