From: Fiona Glaser
Date: Thu, 20 Mar 2008 06:31:42 +0000 (-0600)
Subject: faster intra predict 8x8 hu/hd
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5442dafdab1f18eb5d6f27813dbd0d8b1f37a300;p=libx264

faster intra predict 8x8 hu/hd
---

diff --git a/common/macroblock.h b/common/macroblock.h
index 60d87ed0..442ce2a7 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -317,7 +317,7 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int
         for( dx = 0; dx < width; dx++ )
             ((uint32_t*)dst)[dx+8*dy] = val;
 }
-static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
+static ALWAYS_INLINE uint32_t pack16to32_clip( int a, int b )
 {
 #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + (a<<16);
@@ -332,11 +332,11 @@ static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, in
 }
 static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32(mvx,mvy) );
+    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mvx,mvy) );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32(mdx,mdy) );
+    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mdx,mdy) );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
 {
diff --git a/common/predict.c b/common/predict.c
index d6cb2a56..252b20af 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -37,6 +37,24 @@
 # include "ppc/predict.h"
 #endif
 
+static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+    return b + (a<<16);
+#else
+    return a + (b<<16);
+#endif
+}
+
+static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+    return b + (a<<8);
+#else
+    return a + (b<<8);
+#endif
+}
+
 /****************************************************************************
  * 16x16 prediction for intra luma block
  ****************************************************************************/
@@ -573,6 +591,7 @@ static void predict_4x4_hu( uint8_t *src )
  ****************************************************************************/
 
 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
+#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
 #define PL(y) \
     edge[14-y] = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
 #define PT(x) \
@@ -645,6 +664,7 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in
         src += FDEC_STRIDE; \
     }
 
+/* SIMD is much faster than C for all of these except HU and HD. */
 static void predict_8x8_dc_128( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_DC(0x80808080);
@@ -759,28 +779,27 @@ static void predict_8x8_hd( uint8_t *src, uint8_t edge[33] )
     PREDICT_8x8_LOAD_TOP
     PREDICT_8x8_LOAD_LEFT
     PREDICT_8x8_LOAD_TOPLEFT
-    SRC(0,7)= (l6 + l7 + 1) >> 1;
-    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
-    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
-    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
-    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
-    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
-    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
-    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
-    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
-    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
-    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
-    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
-    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
-    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
-    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
-    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+    int p1 = pack8to16((l6 + l7 + 1) >> 1, (l5 + 2*l6 + l7 + 2) >> 2);
+    int p2 = pack8to16((l5 + l6 + 1) >> 1, (l4 + 2*l5 + l6 + 2) >> 2);
+    int p3 = pack8to16((l4 + l5 + 1) >> 1, (l3 + 2*l4 + l5 + 2) >> 2);
+    int p4 = pack8to16((l3 + l4 + 1) >> 1, (l2 + 2*l3 + l4 + 2) >> 2);
+    int p5 = pack8to16((l2 + l3 + 1) >> 1, (l1 + 2*l2 + l3 + 2) >> 2);
+    int p6 = pack8to16((l1 + l2 + 1) >> 1, (l0 + 2*l1 + l2 + 2) >> 2);
+    int p7 = pack8to16((l0 + l1 + 1) >> 1, (lt + 2*l0 + l1 + 2) >> 2);
+    int p8 = pack8to16((lt + l0 + 1) >> 1, (l0 + 2*lt + t0 + 2) >> 2);
+    int p9 = pack8to16((t1 + 2*t0 + lt + 2) >> 2, (t2 + 2*t1 + t0 + 2) >> 2);
+    int p10 = pack8to16((t3 + 2*t2 + t1 + 2) >> 2, (t4 + 2*t3 + t2 + 2) >> 2);
+    int p11 = pack8to16((t5 + 2*t4 + t3 + 2) >> 2, (t6 + 2*t5 + t4 + 2) >> 2);
+    SRC32(0,7)= pack16to32(p1,p2);
+    SRC32(0,6)= pack16to32(p2,p3);
+    SRC32(4,7)=SRC32(0,5)= pack16to32(p3,p4);
+    SRC32(4,6)=SRC32(0,4)= pack16to32(p4,p5);
+    SRC32(4,5)=SRC32(0,3)= pack16to32(p5,p6);
+    SRC32(4,4)=SRC32(0,2)= pack16to32(p6,p7);
+    SRC32(4,3)=SRC32(0,1)= pack16to32(p7,p8);
+    SRC32(4,2)=SRC32(0,0)= pack16to32(p8,p9);
+    SRC32(4,1)= pack16to32(p9,p10);
+    SRC32(4,0)= pack16to32(p10,p11);
 }
 static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
 {
@@ -812,24 +831,22 @@ static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
 static void predict_8x8_hu( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_LOAD_LEFT
-    SRC(0,0)= (l0 + l1 + 1) >> 1;
-    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
-    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
-    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
-    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
-    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
-    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
-    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
-    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
-    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
-    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
-    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
-    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
-    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
-    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
-    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
-    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
-    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
+    int p1 = pack8to16((l0 + l1 + 1) >> 1, (l0 + 2*l1 + l2 + 2) >> 2);
+    int p2 = pack8to16((l1 + l2 + 1) >> 1, (l1 + 2*l2 + l3 + 2) >> 2);
+    int p3 = pack8to16((l2 + l3 + 1) >> 1, (l2 + 2*l3 + l4 + 2) >> 2);
+    int p4 = pack8to16((l3 + l4 + 1) >> 1, (l3 + 2*l4 + l5 + 2) >> 2);
+    int p5 = pack8to16((l4 + l5 + 1) >> 1, (l4 + 2*l5 + l6 + 2) >> 2);
+    int p6 = pack8to16((l5 + l6 + 1) >> 1, (l5 + 2*l6 + l7 + 2) >> 2);
+    int p7 = pack8to16((l6 + l7 + 1) >> 1, (l6 + 3*l7 + 2) >> 2);
+    int p8 = pack8to16(l7,l7);
+    SRC32(0,0)= pack16to32(p1,p2);
+    SRC32(0,1)= pack16to32(p2,p3);
+    SRC32(4,0)=SRC32(0,2)= pack16to32(p3,p4);
+    SRC32(4,1)=SRC32(0,3)= pack16to32(p4,p5);
+    SRC32(4,2)=SRC32(0,4)= pack16to32(p5,p6);
+    SRC32(4,3)=SRC32(0,5)= pack16to32(p6,p7);
+    SRC32(4,4)=SRC32(0,6)= pack16to32(p7,p8);
+    SRC32(4,5)=SRC32(4,6)= SRC32(0,7) = SRC32(4,7) = pack16to32(p8,p8);
 }
 
 /****************************************************************************