Add high bit depth support to the SSE2 planar prediction asm.

Converts predict_16x16_p_core and predict_8x8c_p_core to handle both pixel
depths, falls back to the C functions when the plane coefficients could
overflow 16-bit arithmetic, and adds extremal-case planar tests to checkasm.
Patch from Google Code-In.
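
The cores fill the block with H.264's clipped linear ramp: the C wrappers
derive the gradients b and c from the border pixels and fold the rounding
term into i00 = a - (size/2-1)*(b+c) + 16. A minimal scalar sketch of what
the cores compute (assuming FDEC_STRIDE and PIXEL_MAX from x264's common
headers; the helper name is illustrative, not the committed code):

    static void predict_plane_core_ref( pixel *src, int i00, int b, int c, int size )
    {
        for( int y = 0; y < size; y++ )
        {
            for( int x = 0; x < size; x++ )
            {
                int v = (i00 + b*x + c*y) >> 5;
                src[x] = v < 0 ? 0 : v > PIXEL_MAX ? PIXEL_MAX : v;
            }
            src += FDEC_STRIDE;
        }
    }

At 8-bit depth packuswb performs this clip for free; at high bit depth the
asm clips explicitly with CLIPW against pw_pixel_max.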
src += FDEC_STRIDE;
}
}
-static void x264_predict_16x16_p_c( pixel *src )
+void x264_predict_16x16_p_c( pixel *src )
{
int H = 0, V = 0;
src += FDEC_STRIDE;
}
}
-static void x264_predict_8x8c_p_c( pixel *src )
+void x264_predict_8x8c_p_c( pixel *src )
{
int H = 0, V = 0;
void x264_predict_16x16_dc_c( pixel *src );
void x264_predict_16x16_h_c ( pixel *src );
void x264_predict_16x16_v_c ( pixel *src );
+void x264_predict_16x16_p_c ( pixel *src );
void x264_predict_8x8c_dc_c ( pixel *src );
void x264_predict_8x8c_h_c ( pixel *src );
void x264_predict_8x8c_v_c ( pixel *src );
+void x264_predict_8x8c_p_c ( pixel *src );
void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] );
cextern pw_8
cextern pw_ff00
cextern pb_reverse
+cextern pw_pixel_max
%macro STORE8x8 2
add r0, 4*FDEC_STRIDEB
PREDICT_8x8_VR mmxext, b, q , 8
%endif
-%ifndef ARCH_X86_64
-INIT_MMX
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
dec r1d
jg .loop
REP_RET
+%endif ; !ARCH_X86_64
+
+INIT_XMM
+cglobal predict_8x8c_p_core_sse2, 1,1
+ movd m0, r1m
+ movd m2, r2m
+ movd m4, r3m
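+ ; m0 = i00, m2 = b, m4 = c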
+%ifdef HIGH_BIT_DEPTH
+ mova m3, [pw_pixel_max]
+ pxor m1, m1
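+ ; m1 = 0, m3 = pixel_max: CLIPW bounds (packuswb only saturates to 8 bits)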
+%endif
+ SPLATW m0, m0, 0
+ SPLATW m2, m2, 0
+ SPLATW m4, m4, 0
+ pmullw m2, [pw_76543210]
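+ ; m2 = b*{0,1,...,7}: the horizontal ramp for one row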
+%ifdef HIGH_BIT_DEPTH
+ mov r1d, 8
+.loop:
+ mova m5, m0
+ paddsw m5, m2
+ psraw m5, 5
+ CLIPW m5, m1, m3
+ mova [r0], m5
+ paddw m2, m4
+ add r0, FDEC_STRIDEB
+ dec r1d
+ jg .loop
+%else ;!HIGH_BIT_DEPTH
+ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
+ mova m3, m0
+ paddsw m3, m4
+ paddsw m4, m4
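+ ; call .loop renders rows 0-3; its RET returns here, then execution falls
+ ; through into .loop again for rows 4-7 and the same RET exits the function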
+ call .loop
+ add r0, FDEC_STRIDE*4
+.loop:
+ mova m5, m0
+ mova m1, m3
+ psraw m0, 5
+ psraw m3, 5
+ packuswb m0, m3
+ movq [r0+FDEC_STRIDE*0], m0
+ movhps [r0+FDEC_STRIDE*1], m0
+ paddsw m5, m4
+ paddsw m1, m4
+ mova m0, m5
+ mova m3, m1
+ psraw m5, 5
+ psraw m1, 5
+ packuswb m5, m1
+ movq [r0+FDEC_STRIDE*2], m5
+ movhps [r0+FDEC_STRIDE*3], m5
+ paddsw m0, m4
+ paddsw m3, m4
+%endif ;!HIGH_BIT_DEPTH
+ RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
dec r1d
jg .loop
REP_RET
-
%endif ; !ARCH_X86_64
+INIT_XMM
+cglobal predict_16x16_p_core_sse2, 1,2,8
+ movd m0, r1m
+ movd m1, r2m
+ movd m2, r3m
+%ifdef HIGH_BIT_DEPTH
+ pxor m6, m6
+ pxor m7, m7
+%endif
+ SPLATW m0, m0, 0
+ SPLATW m1, m1, 0
+ SPLATW m2, m2, 0
+ mova m3, m1
+ pmullw m3, [pw_76543210]
+ psllw m1, 3
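+ ; m3 = b*{0..7}; m1 = 8*b, the offset from the left to the right half-row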
+%ifdef HIGH_BIT_DEPTH
+ mov r1d, 16
+.loop:
+ mova m4, m0
+ mova m5, m0
+ mova m7, m3
+ paddsw m7, m6
+ paddsw m4, m7
+ paddsw m7, m1
+ paddsw m5, m7
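+ ; m4 = i00 + b*x + c*y for x=0..7; m5 = the same for x=8..15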
+ psraw m4, 5
+ psraw m5, 5
+ CLIPW m4, [pb_0], [pw_pixel_max]
+ CLIPW m5, [pb_0], [pw_pixel_max]
+ mova [r0], m4
+ mova [r0+16], m5
+ add r0, FDEC_STRIDEB
+ paddw m6, m2
+ dec r1d
+ jg .loop
+%else ;!HIGH_BIT_DEPTH
+ paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
+ paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
+ mova m7, m2
+ paddsw m7, m7
+ mov r1d, 8
+ALIGN 4
+.loop:
+ mova m3, m0
+ mova m4, m1
+ mova m5, m0
+ mova m6, m1
+ psraw m3, 5
+ psraw m4, 5
+ paddsw m5, m2
+ paddsw m6, m2
+ psraw m5, 5
+ psraw m6, 5
+ packuswb m3, m4
+ packuswb m5, m6
+ mova [r0+FDEC_STRIDE*0], m3
+ mova [r0+FDEC_STRIDE*1], m5
+ paddsw m0, m7
+ paddsw m1, m7
+ add r0, FDEC_STRIDE*2
+ dec r1d
+ jg .loop
+%endif ;!HIGH_BIT_DEPTH
+ REP_RET
+
INIT_XMM
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
RET
%endif
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-
-cglobal predict_8x8c_p_core_sse2, 1,1
- movd xmm0, r1m
- movd xmm2, r2m
- movd xmm4, r3m
- pshuflw xmm0, xmm0, 0
- pshuflw xmm2, xmm2, 0
- pshuflw xmm4, xmm4, 0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm4, xmm4
- pmullw xmm2, [pw_76543210]
- paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- movdqa xmm3, xmm0
- paddsw xmm3, xmm4
- paddsw xmm4, xmm4
-call .loop
- add r0, FDEC_STRIDE*4
-.loop:
- movdqa xmm5, xmm0
- movdqa xmm1, xmm3
- psraw xmm0, 5
- psraw xmm3, 5
- packuswb xmm0, xmm3
- movq [r0+FDEC_STRIDE*0], xmm0
- movhps [r0+FDEC_STRIDE*1], xmm0
- paddsw xmm5, xmm4
- paddsw xmm1, xmm4
- movdqa xmm0, xmm5
- movdqa xmm3, xmm1
- psraw xmm5, 5
- psraw xmm1, 5
- packuswb xmm5, xmm1
- movq [r0+FDEC_STRIDE*2], xmm5
- movhps [r0+FDEC_STRIDE*3], xmm5
- paddsw xmm0, xmm4
- paddsw xmm3, xmm4
- RET
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2,8
- movd xmm0, r1m
- movd xmm1, r2m
- movd xmm2, r3m
- pshuflw xmm0, xmm0, 0
- pshuflw xmm1, xmm1, 0
- pshuflw xmm2, xmm2, 0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- movdqa xmm3, xmm1
- pmullw xmm3, [pw_76543210]
- psllw xmm1, 3
- paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- movdqa xmm7, xmm2
- paddsw xmm7, xmm7
- mov r1d, 8
-ALIGN 4
-.loop:
- movdqa xmm3, xmm0
- movdqa xmm4, xmm1
- movdqa xmm5, xmm0
- movdqa xmm6, xmm1
- psraw xmm3, 5
- psraw xmm4, 5
- paddsw xmm5, xmm2
- paddsw xmm6, xmm2
- psraw xmm5, 5
- psraw xmm6, 5
- packuswb xmm3, xmm4
- packuswb xmm5, xmm6
- movdqa [r0+FDEC_STRIDE*0], xmm3
- movdqa [r0+FDEC_STRIDE*1], xmm5
- paddsw xmm0, xmm7
- paddsw xmm1, xmm7
- add r0, FDEC_STRIDE*2
- dec r1d
- jg .loop
- REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmxext( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
PREDICT_16x16_DC_LEFT( mmxext )
PREDICT_16x16_DC_LEFT( sse2 )
-#if !HIGH_BIT_DEPTH
-ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
-
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
+ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+
+#if !HIGH_BIT_DEPTH
#define PREDICT_16x16_P(name)\
-static void x264_predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( pixel *src )\
{\
int a, b, c;\
int H = 0;\
i00 = a - b * 7 - c * 7 + 16;\
x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
-
#ifndef ARCH_X86_64
PREDICT_16x16_P( mmxext )
#endif
PREDICT_16x16_P( sse2 )
+#endif //!HIGH_BIT_DEPTH
#ifdef __GNUC__
+#if HIGH_BIT_DEPTH
+static void x264_predict_16x16_p_sse2( uint16_t *src )
+#else
static void x264_predict_16x16_p_ssse3( uint8_t *src )
+#endif
{
int a, b, c, i00;
int H, V;
+#if HIGH_BIT_DEPTH
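+ /* H = sum_{i=1..8} i*(top[7+i] - top[7-i]): two pmaddwd dot products over
+ * the row above, then a horizontal add of the dword sums */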
+ asm (
+ "movdqu -2+%1, %%xmm1 \n"
+ "movdqa 16+%1, %%xmm0 \n"
+ "pmaddwd %2, %%xmm0 \n"
+ "pmaddwd %3, %%xmm1 \n"
+ "paddd %%xmm1, %%xmm0 \n"
+ "movhlps %%xmm0, %%xmm1 \n"
+ "paddd %%xmm1, %%xmm0 \n"
+ "pshuflw $14, %%xmm0, %%xmm1 \n"
+ "paddd %%xmm1, %%xmm0 \n"
+ "movd %%xmm0, %0 \n"
+ :"=r"(H)
+ :"m"(src[-FDEC_STRIDE]), "m"(*pw_12345678), "m"(*pw_m87654321)
+ );
+#else
asm (
"movq %1, %%mm1 \n"
"movq 8+%1, %%mm0 \n"
:"=r"(H)
:"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
);
+#endif
V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
+ 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
+ 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
b = ( 5 * H + 32 ) >> 6;
c = ( 5 * V + 32 ) >> 6;
i00 = a - b * 7 - c * 7 + 16;
- x264_predict_16x16_p_core_sse2( src, i00, b, c );
+ /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case
+ * than to try to consider it in the asm (1092 = 32767/30). */
+ if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )
+ x264_predict_16x16_p_c( src );
+ else
+ x264_predict_16x16_p_core_sse2( src, i00, b, c );
}
#endif
+#if !HIGH_BIT_DEPTH
+
#define PREDICT_8x8_P(name)\
static void x264_predict_8x8c_p_##name( uint8_t *src )\
{\
i00 = a -3*b -3*c + 16;\
x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
-
#ifndef ARCH_X86_64
PREDICT_8x8_P( mmxext )
#endif
PREDICT_8x8_P( sse2 )
+#endif //!HIGH_BIT_DEPTH
+
#ifdef __GNUC__
+#if HIGH_BIT_DEPTH
+static void x264_predict_8x8c_p_sse2( uint16_t *src )
+#else
static void x264_predict_8x8c_p_ssse3( uint8_t *src )
+#endif
{
int a, b, c, i00;
int H, V;
+#if HIGH_BIT_DEPTH
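+ /* dot product of the eight pixels above with {-3,-2,-1,0,1,2,3,4} via
+ * pmaddwd, then a horizontal add of the dword sums */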
+ asm (
+ "movdqa %1, %%xmm0 \n"
+ "pmaddwd %2, %%xmm0 \n"
+ "movhlps %%xmm0, %%xmm1 \n"
+ "paddd %%xmm1, %%xmm0 \n"
+ "pshuflw $14, %%xmm0, %%xmm1 \n"
+ "paddd %%xmm1, %%xmm0 \n"
+ "movd %%xmm0, %0 \n"
+ :"=r"(H)
+ :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)
+ );
+#else
asm (
"movq %1, %%mm0 \n"
"pmaddubsw %2, %%mm0 \n"
:"=r"(H)
:"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
);
+#endif
V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
+ 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
+ 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
- x264_predict_8x8c_p_core_sse2( src, i00, b, c );
+ /* b*7 + c*7 can overflow: it's easier to just branch away in this rare case
+ * than to try to consider it in the asm (2340 = 32767/14). */
+ if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 2340 || abs(c) > 2340) )
+ x264_predict_8x8c_p_c( src );
+ else
+ x264_predict_8x8c_p_core_sse2( src, i00, b, c );
}
#endif
-
+#if !HIGH_BIT_DEPTH
#if ARCH_X86_64
static void x264_predict_8x8c_dc_left( uint8_t *src )
{
pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
#else
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
#else
#if ARCH_X86_64
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
int ret = 0, ok = 1, used_asm = 0;
ALIGNED_16( pixel edge[33] );
ALIGNED_16( pixel edge2[33] );
+ ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
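+ /* private fdec-sized buffer: the extremal planar tests below overwrite its borders */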
struct
{
x264_predict_t predict_16x16[4+3];
x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
- ip_c.predict_8x8_filter( pbuf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+ memcpy( fdec, pbuf1, 32*20 * sizeof(pixel) );
+
+ ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-#define INTRA_TEST( name, dir, w, ... )\
+#define INTRA_TEST( name, dir, w, bench, ... )\
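+ /* bench: empty expands to call_c/call_a (also benchmarked); 1 gives call_c1/call_a1 */\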
if( ip_a.name[dir] != ip_ref.name[dir] )\
{\
set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
used_asm = 1;\
- memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\
- memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\
- call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
- call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
- if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\
+ memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
+ memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
+ call_c##bench( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
+ call_a##bench( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
+ if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
{\
fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\
ok = 0;\
{\
printf( "%2x ", edge[14-j] );\
for( int k = 0; k < w; k++ )\
- printf( "%2x ", pbuf4[48+k+j*32] );\
+ printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
printf( "\n" );\
}\
printf( "\n" );\
{\
printf( " " );\
for( int k = 0; k < w; k++ )\
- printf( "%2x ", pbuf3[48+k+j*32] );\
+ printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
printf( "\n" );\
}\
}\
}
for( int i = 0; i < 12; i++ )
- INTRA_TEST( predict_4x4, i, 4 );
+ INTRA_TEST( predict_4x4, i, 4, );
for( int i = 0; i < 7; i++ )
- INTRA_TEST( predict_8x8c, i, 8 );
+ INTRA_TEST( predict_8x8c, i, 8, );
for( int i = 0; i < 7; i++ )
- INTRA_TEST( predict_16x16, i, 16 );
+ INTRA_TEST( predict_16x16, i, 16, );
for( int i = 0; i < 12; i++ )
- INTRA_TEST( predict_8x8, i, 8, edge );
+ INTRA_TEST( predict_8x8, i, 8, , edge );
set_func_name("intra_predict_8x8_filter");
if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
}
}
+#define EXTREMAL_PLANE(size) \
+ { \
+ int max[7]; \
+ for( int j = 0; j < 7; j++ ) \
+ max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
+ fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
+ for( int j = 0; j < size/2; j++ ) \
+ fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
+ for( int j = size/2; j < size-1; j++ ) \
+ fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
+ fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
+ for( int j = 0; j < size/2; j++ ) \
+ fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
+ for( int j = size/2; j < size-1; j++ ) \
+ fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
+ fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
+ }
+ /* Extremal test case for planar prediction: force each border segment to 0 or
+ * the maximum pixel value to make |b| and |c| as large as possible. */
+ for( int test = 0; test < 100 && ok; test++ )
+ for( int i = 0; i < 128 && ok; i++ )
+ {
+ EXTREMAL_PLANE( 8 );
+ INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 1 );
+ EXTREMAL_PLANE( 16 );
+ INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 1 );
+ }
report( "intra pred :" );
return ret;
}