From: Loren Merritt
Date: Tue, 12 Sep 2006 22:18:29 +0000 (+0000)
Subject: faster ESA
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f8652aab3dda281aa446ead0674d7e1f1c6d6e74;p=libx264

faster ESA

git-svn-id: svn://svn.videolan.org/x264/trunk@561 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/frame.c b/common/frame.c
index b64cbb58..8afbc9a4 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -92,7 +92,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.analyse.i_me_method == X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[11],
-                        frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
+                        2 * frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
         frame->integral = (uint16_t*)frame->buffer[11] + frame->i_stride[0] * 32 + 32;
     }
diff --git a/common/mc.c b/common/mc.c
index ed9a5cfd..68802690 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -417,27 +417,31 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
     }
 
     /* generate integral image:
-     * each entry in frame->integral is the sum of all luma samples above and
-     * to the left of its location (inclusive).
-     * this allows us to calculate the DC of any rectangle by looking only
-     * at the corner entries.
-     * individual entries will overflow 16 bits, but that's ok:
-     * we only need the differences between entries, and those will be correct
-     * as long as we don't try to evaluate a rectangle bigger than 16x16.
-     * likewise, we don't really have to init the edges to 0, leaving garbage
-     * there wouldn't affect the results.*/
+     * frame->integral contains 2 planes. in the upper plane, each element is
+     * the sum of an 8x8 pixel region with top-left corner on that point.
+     * in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
 
     if( frame->integral )
     {
         memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
-        for( y = -31; y < frame->i_lines[0] + 32; y++ )
+        for( y = -32; y < frame->i_lines[0] + 31; y++ )
         {
             uint8_t  *ref  = frame->plane[0] + y * stride - 32;
-            uint16_t *line = frame->integral + y * stride - 32;
+            uint16_t *line = frame->integral + (y+1) * stride - 31;
             uint16_t v = line[0] = 0;
-            for( x = 1; x < stride; x++ )
+            for( x = 0; x < stride-1; x++ )
                 line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
         }
+        for( y = -31; y < frame->i_lines[0] + 24; y++ )
+        {
+            uint16_t *line = frame->integral + y * stride - 31;
+            uint16_t *sum4 = line + frame->i_stride[0] * (frame->i_lines[0] + 64);
+            for( x = -31; x < stride - 40; x++, line++, sum4++ )
+            {
+                sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
+                line[0] += line[8+8*stride] - line[8] - line[8*stride];
+            }
+        }
     }
 }
diff --git a/encoder/me.c b/encoder/me.c
index e0e8e24d..d113e0c7 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -104,10 +104,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
               p_fref + (m2x) + (m2y)*m->i_stride[0],\
               p_fref + (m3x) + (m3y)*m->i_stride[0],\
               m->i_stride[0], costs );\
-    costs[0] += BITS_MVD( m0x, m0y );\
-    costs[1] += BITS_MVD( m1x, m1y );\
-    costs[2] += BITS_MVD( m2x, m2y );\
-    costs[3] += BITS_MVD( m3x, m3y );\
+    costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
+    costs[1] += p_cost_mvx[m1x<<2];\
+    costs[2] += p_cost_mvx[m2x<<2];\
+    costs[3] += p_cost_mvx[m3x<<2];\
     COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
     COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
     COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
@@ -462,35 +462,64 @@ me_hex2:
         /* successive elimination by comparing DC before a full SAD,
          * because sum(abs(diff)) >= abs(diff(sum)). */
         const int stride = m->i_stride[0];
-        const int dw = x264_pixel_size[i_pixel].w;
-        const int dh = x264_pixel_size[i_pixel].h * stride;
+        const uint16_t *integral_base = m->integral;
         static uint8_t zero[16*16] = {0,};
-        const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, zero, 16 );
-        const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
+        int enc_dc[4];
+        int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
+        int sad_w = x264_pixel_size[sad_size].w;
+        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
+            m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
+            FENC_STRIDE, enc_dc );
+        if( sad_w == 4 )
+            integral_base += stride * (h->fenc->i_lines[0] + 64);
+
+#define ESA(ADS) \
+        for( my = min_y; my <= max_y; my++ )\
+        {\
+            int mvs[3], i_mvs=0;\
+            bcost -= p_cost_mvy[my<<2];\
+            for( mx = min_x; mx <= max_x; mx++ )\
+            {\
+                const uint16_t *integral = &integral_base[ mx + my * stride ];\
+                if( ADS < bcost - p_cost_mvx[mx<<2] )\
+                {\
+                    if( i_mvs == 3 )\
+                    {\
+                        COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
+                        i_mvs = 0;\
+                    }\
+                    else\
+                        mvs[i_mvs++] = mx;\
+                }\
+            }\
+            bcost += p_cost_mvy[my<<2];\
+            for( i=0; i
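
For context on the common/mc.c change: the old frame->integral was a conventional summed-area table, combined at its four corners per lookup; the patched layout instead stores, at every pixel position, the already-collapsed 8x8 block sum (plus a second plane of 4x4 sums for p4x4 analysis), so the motion search reads a single value per candidate. The following is a minimal, hypothetical sketch of that relationship, not x264 code; the helper name build_sum8_plane and the scratch SAT buffer are assumptions made for illustration.

    #include <stdint.h>

    /* Illustrative helper (not part of x264): derive the per-position 8x8
     * block sums -- the layout of the patched frame->integral upper plane --
     * from a conventional summed-area table.  16-bit wraparound in the SAT
     * is harmless because an 8x8 sum of 8-bit samples fits in 16 bits, so
     * the truncated four-corner difference is exact. */
    static void build_sum8_plane( const uint8_t *pix, int stride, int w, int h,
                                  uint16_t *sat, uint16_t *sum8 )
    {
        /* sat[y*stride+x] = sum of pix over rows 0..y, columns 0..x */
        for( int y = 0; y < h; y++ )
        {
            uint16_t row = 0;
            for( int x = 0; x < w; x++ )
            {
                row += pix[y*stride + x];
                sat[y*stride + x] = row + (y ? sat[(y-1)*stride + x] : 0);
            }
        }
        /* 8x8 sum with top-left corner at (x,y): four SAT lookups collapse
         * into the single value the search will read per candidate. */
        for( int y = 0; y + 8 <= h; y++ )
            for( int x = 0; x + 8 <= w; x++ )
                sum8[y*stride + x] = sat[(y+7)*stride + x+7]
                                   - (x ? sat[(y+7)*stride + x-1] : 0)
                                   - (y ? sat[(y-1)*stride + x+7] : 0)
                                   + (x && y ? sat[(y-1)*stride + x-1] : 0);
    }

The patched loop in x264_frame_filter reaches the same result incrementally: it builds one summed-area row at a time and immediately folds it into 8x8 (and 4x4) sums in place, avoiding a separate scratch plane.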
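The elimination test in encoder/me.c rests on the triangle inequality quoted in the comment: for any candidate block, sum|enc - ref| >= |sum(enc) - sum(ref)|, so the absolute difference of block sums (enc_dc from a SAD against a zero block, the reference sum from one integral-plane read) is a lower bound on the true SAD. A candidate whose bound plus MV cost already reaches bcost cannot win and is skipped. Below is a minimal sketch of just that test, with hypothetical names (enc_sum and ref_sum stand for the 8x8 or 4x4 block sums described above); it is an illustration, not the patch's macro.

    #include <stdlib.h>

    /* Successive-elimination filter: keep a candidate only if its SAD lower
     * bound could still beat the best cost found so far. */
    static inline int candidate_survives( int enc_sum, int ref_sum,
                                          int mv_cost, int bcost )
    {
        int lower_bound = abs( enc_sum - ref_sum );  /* <= true SAD */
        return lower_bound + mv_cost < bcost;
    }

In the patch itself, surviving mx values are buffered in mvs[] and flushed four at a time through COST_MV_X4_ABS, matching the sad_x4 primitive used earlier to fill enc_dc, so the remaining full SAD evaluations stay batched.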