Modify pixel_var to return both the sum and the sum of squared pixels (packed in a uint64_t), and use it for weight analysis instead of separate sad/ssd calls.
Various minor cosmetics.
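
A sketch of the new convention (illustrative only, not part of the patch): pixel_var now packs the sum of pixels into the low 32 bits of the returned uint64_t and the sum of squared pixels into the high 32 bits, and callers derive variance or AC energy from those two values. The helper name below is hypothetical; ac_energy_plane in ratecontrol.c does exactly this inline, while weights_plane_analyse in slicetype.c accumulates the two sums over a whole plane first.

    #include <stdint.h>

    /* hypothetical helper: unpack a pixel_var result and compute the block's AC energy */
    static inline uint32_t var_to_ac_energy( uint64_t res, int shift )
    {
        uint32_t sum = (uint32_t)res;         /* sum of pixels */
        uint32_t sqr = (uint32_t)(res >> 32); /* sum of squared pixels */
        /* shift = log2(pixel count): 8 for 16x16, 6 for 8x8 */
        return sqr - (sum * sum >> shift);
    }
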
VAR_SQR_SUM q1, q9, q14, d24
vld1.64 {d26}, [r0,:64], r1
VAR_SQR_SUM q2, q10, q15, d26
-
- mov r2, #6
b x264_var_end
.endfunc
VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16
mov ip, #7
- mov r2, #8
var16_loop:
subs ip, ip, #1
vld1.64 {d16-d17}, [r0,:128], r1
vpadd.u32 d0, d0, d2
vmov r0, r1, d0
- mul r0, r0, r0
- sub r0, r1, r0, lsr r2
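+ @ r0 = sum, r1 = sqr: the packed uint64_t return value ((sqr << 32) + sum) in r0/r1 per the AAPCS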
bx lr
.endfunc
int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
-int x264_pixel_var_8x8_neon( uint8_t *, int );
-int x264_pixel_var_16x16_neon( uint8_t *, int );
+uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
-#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride ) \
+#define PIXEL_VAR_C( name, w ) \
+static uint64_t name( uint8_t *pix, int i_stride ) \
{ \
- uint32_t var = 0, sum = 0, sqr = 0; \
+ uint32_t sum = 0, sqr = 0; \
int x, y; \
for( y = 0; y < w; y++ ) \
{ \
} \
pix += i_stride; \
} \
- var = sqr - (sum * sum >> shift); \
- return var; \
+ return sum + ((uint64_t)sqr << 32); \
}
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
-PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8 )
/****************************************************************************
* pixel_var2_wxh
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
- int (*var[4])( uint8_t *pix, int stride );
+ uint64_t (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
/****************************************************************************
* variance
****************************************************************************/
-static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 8);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
-static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 6);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
%endif
%endmacro
-%macro VAR_END 1
+%macro VAR_END 0
HADDW m5, m7
- movd r1d, m5
- imul r1d, r1d
+ movd eax, m5
HADDD m6, m1
- shr r1d, %1
- movd eax, m6
- sub eax, r1d ; sqr - (sum * sum >> shift)
+ movd edx, m6
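+ ; sum is in eax, sqr in edx: on x86_32 this is already the uint64_t return in edx:eax; on x86_64 pack it into rax below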
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
RET
%endmacro
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
- VAR_END 6
+ VAR_END
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_START 1
VAR_CORE
dec r2d
jg .loop
- VAR_END 6
+ VAR_END
%macro VAR2_END 0
HADDW m5, m7
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
+ rce->misc_bits;
}
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
+{
+ int w = i ? 8 : 16;
+ int shift = i ? 6 : 8;
+ int stride = frame->i_stride[i];
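+ /* interlaced: start at the MB pair, offset one row for the bottom-field MB; stride is doubled below so var reads a single field */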
+ int offset = h->mb.b_interlaced
+ ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+ : w * (mb_x + mb_y * stride);
+ int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ stride <<= h->mb.b_interlaced;
+ uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
+ uint32_t sum = (uint32_t)res;
+ uint32_t sqr = res >> 32;
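+ /* AC energy = sum of squares minus the DC term (sum^2 >> log2(pixel count)) */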
+ return sqr - (sum * sum >> shift);
+}
+
// Find the total AC energy of the block in all planes.
static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that it's always called before the float math. Noinline makes
* sure no reordering goes on. */
- uint32_t var = 0, i;
- for( i = 0; i < 3; i++ )
- {
- int w = i ? 8 : 16;
- int stride = frame->i_stride[i];
- int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
- stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
- }
+ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
x264_emms();
return var;
}
}
w->i_scale = X264_MIN( w->i_scale, 127 );
}
-/* due to a GCC bug on some platforms (win32), flat[16] may not actually be aligned. */
-ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var )
{
int x,y;
- unsigned int sad = 0;
+ uint64_t sad = 0;
uint64_t ssd = 0;
uint8_t *p = plane;
for( y = 0; y < height>>4; y++, p += stride*16 )
for( x = 0; x < width; x+=16 )
{
- sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 );
- ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 );
+ uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
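+ /* low 32 bits: sum of pixels; high 32 bits: sum of squared pixels */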
+ sad += (uint32_t)res;
+ ssd += res >> 32;
}
*sum = sad;
- *var = ssd - (uint64_t) sad * sad / (width * height);
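+ /* "sad" now holds the pixel sum; variance = ssd - sum^2 / num_pixels, rounded to nearest */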
+ *var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
x264_emms();
}
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
uint8_t *fenc_plane = fenc->lowres[0];
- ALIGNED_ARRAY_16( uint8_t, buf, [8*8] );
+ ALIGNED_8( uint8_t buf[8*8] );
int pixoff = 0;
int i_mb = 0;
if( w )
+ {
for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
{
w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
}
- else
- for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
- for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
- cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
-
- if( w )
- {
+ /* Add cost of weights in the slice header. */
int numslices;
if( h->param.i_slice_count )
numslices = h->param.i_slice_count;
else if( h->param.i_slice_max_mbs )
numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
else
numslices = 1;
- // FIXME still need to calculate for --slice-max-size
- // Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
- // Since using lowres frames, assume lambda = 1.
+ /* FIXME: find a way to account for --slice-max-size?
+ * Multiply by 2 as there will be a duplicate; 10 bits are added because, if there is a weighted frame, an additional duplicate is used.
+ * Since using lowres frames, assume lambda = 1. */
cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
}
+ else
+ for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+ cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
x264_emms();
return cost;
}
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
const float epsilon = 1.0/128.0;
-
float guess_scale;
int found;
x264_weight_t *weights = fenc->weight[0];
weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var );
- weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
+ weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
fenc_var = round( sqrt( fenc_var ) );
- ref_var = round( sqrt( ref_var ) );
+ ref_var = round( sqrt( ref_var ) );
fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]);
- ref_mean = (float)ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+ ref_mean = (float) ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
//early termination
if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
x264_emms();
/* FIXME: More analysis can be done here on SAD vs. SATD termination. */
- /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
{
SET_WEIGHT( weights[0], 0, 1, 0, 0 );
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16 ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+ call_c1( pixel_c.var[i], buf1, 16 ); \
+ call_a1( pixel_asm.var[i], buf1, 16 ); \
+ uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
+ uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+ fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
+ call_c2( pixel_c.var[i], buf1, 16 ); \
+ call_a2( pixel_asm.var[i], buf1, 16 ); \
}
ok = 1; used_asm = 0;
for( j=0; j<32; j++ )
{
uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
+ call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
+ call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
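+ /* like var above: the checked wrappers can't return a uint64_t, so the results come from direct calls */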
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )