}
/* generate integral image:
- * each entry in frame->integral is the sum of all luma samples above and
- * to the left of its location (inclusive).
- * this allows us to calculate the DC of any rectangle by looking only
- * at the corner entries.
- * individual entries will overflow 16 bits, but that's ok:
- * we only need the differences between entries, and those will be correct
- * as long as we don't try to evaluate a rectangle bigger than 16x16.
- * likewise, we don't really have to init the edges to 0, leaving garbage
- * there wouldn't affect the results.*/
+ * frame->integral contains 2 planes. In the upper plane, each element is
+ * the sum of the 8x8 pixel region whose top-left corner is at that point.
+ * The lower plane holds the equivalent 4x4 sums (needed only with --analyse p4x4). */
if( frame->integral )
{
memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
- for( y = -31; y < frame->i_lines[0] + 32; y++ )
+ for( y = -32; y < frame->i_lines[0] + 31; y++ )
{
uint8_t *ref = frame->plane[0] + y * stride - 32;
- uint16_t *line = frame->integral + y * stride - 32;
+ uint16_t *line = frame->integral + (y+1) * stride - 31;
uint16_t v = line[0] = 0;
- for( x = 1; x < stride; x++ )
+ for( x = 0; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
}
+ for( y = -31; y < frame->i_lines[0] + 24; y++ )
+ {
+ uint16_t *line = frame->integral + y * stride - 31;
+ uint16_t *sum4 = line + frame->i_stride[0] * (frame->i_lines[0] + 64);
+ for( x = -31; x < stride - 40; x++, line++, sum4++ )
+ {
+ sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
+ line[0] += line[8+8*stride] - line[8] - line[8*stride];
+ }
+ }
}
}
p_fref + (m2x) + (m2y)*m->i_stride[0],\
p_fref + (m3x) + (m3y)*m->i_stride[0],\
m->i_stride[0], costs );\
- costs[0] += BITS_MVD( m0x, m0y );\
- costs[1] += BITS_MVD( m1x, m1y );\
- costs[2] += BITS_MVD( m2x, m2y );\
- costs[3] += BITS_MVD( m3x, m3y );\
+ costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
+ costs[1] += p_cost_mvx[m1x<<2];\
+ costs[2] += p_cost_mvx[m2x<<2];\
+ costs[3] += p_cost_mvx[m3x<<2];\
COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
const int stride = m->i_stride[0];
- const int dw = x264_pixel_size[i_pixel].w;
- const int dh = x264_pixel_size[i_pixel].h * stride;
+ const uint16_t *integral_base = m->integral;
static uint8_t zero[16*16] = {0,};
- const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, zero, 16 );
- const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
+ int enc_dc[4];
+ int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
+ int sad_w = x264_pixel_size[sad_size].w;
+ h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
+ m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
+ FENC_STRIDE, enc_dc );
+ if( sad_w == 4 )
+ integral_base += stride * (h->fenc->i_lines[0] + 64);
+
+#define ESA(ADS) /* Scan every (mx,my) in [min_x,max_x] x [min_y,max_y].
+ * ADS is a cheap pre-check expression evaluated once per candidate; it may
+ * read the local `integral`, which points at integral_base[mx + my*stride].
+ * Only candidates with ADS < bcost - p_cost_mvx[mx<<2] receive a full cost.
+ * Relies on mx, my, i, min_x, max_x, min_y, max_y, bcost, stride,
+ * integral_base, p_cost_mvx and p_cost_mvy from the enclosing scope, and on
+ * the COST_MV / COST_MV_X4_ABS macros. */ \
+ for( my = min_y; my <= max_y; my++ )\
+ {\
+ int mvs[3], i_mvs=0; /* x positions in this row still awaiting a full cost */\
+ bcost -= p_cost_mvy[my<<2]; /* take the row-constant y cost out of bcost; added back after the row */\
+ for( mx = min_x; mx <= max_x; mx++ )\
+ {\
+ const uint16_t *integral = &integral_base[ mx + my * stride ];\
+ if( ADS < bcost - p_cost_mvx[mx<<2] ) /* pre-check passed: candidate needs a full cost */\
+ {\
+ if( i_mvs == 3 ) /* three queued plus the current mx: cost all four in one call */\
+ {\
+ COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
+ i_mvs = 0;\
+ }\
+ else\
+ mvs[i_mvs++] = mx; /* queue until a group of four is available */\
+ }\
+ }\
+ bcost += p_cost_mvy[my<<2];\
+ for( i=0; i<i_mvs; i++ ) /* cost the candidates left in the queue one at a time */\
+ COST_MV( mvs[i], my );\
+ }
- for( my = min_y; my <= max_y; my++ )
+ if( i_pixel == PIXEL_16x16 )
{
- int mvs[3], i_mvs=0;
- for( mx = min_x; mx <= max_x; mx++ )
+ ESA( abs( enc_dc[0] - integral[0] )
+ + abs( enc_dc[1] - integral[8] )
+ + abs( enc_dc[2] - integral[8*stride] )
+ + abs( enc_dc[3] - integral[8*stride+8] ) );
+ }
+ else if( i_pixel == PIXEL_8x8 || i_pixel == PIXEL_4x4 )
+ {
+ ESA( abs( enc_dc[0] - integral[0] ) );
+ }
+ else
+ {
+ int dw = i_pixel < PIXEL_8x8 ? 8 : 4;
+ if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
{
- const uint16_t *integral = &integral_base[ mx + my * stride ];
- const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- - integral[ dw ] - integral[ dh ];
- const int bsad = bcost - BITS_MVD(mx,my);
- if( abs( ref_dc - enc_dc ) < bsad )
- {
- if( i_mvs == 3 )
- {
- COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
- i_mvs = 0;
- }
- else
- mvs[i_mvs++] = mx;
- }
+ dw *= stride;
+ enc_dc[1] = enc_dc[2];
}
- for( i=0; i<i_mvs; i++ )
- COST_MV( mvs[i], my );
+ ESA( abs( enc_dc[0] - integral[0] )
+ + abs( enc_dc[1] - integral[dw] ) );
}
+#undef ESA
#endif
}
break;