Remove the SAD argument from var; it is not needed anymore.
Speed up the var asm a bit by eliminating psadbw and instead doing a HADDW at the end.
Eliminate all remaining warnings with gcc 3.4 on Cygwin.
Port another minor optimization from lavc (pskip).
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
if( i_refa == -2 || i_refb == -2 ||
- ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
- ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
+ !( i_refa | *(uint32_t*)mv_a ) ||
+ !( i_refb | *(uint32_t*)mv_b ) )
{
*(uint32_t*)mv = 0;
}
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
int x, y; \
pix += i_stride; \
} \
var = sqr - (sum * sum >> shift); \
- *sad = sum; \
return var; \
}
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
- int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+ int (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
%endmacro
%macro VAR_END 1
-%if mmsize == 16
- movhlps m0, m5
- paddw m5, m0
-%endif
- movifnidn r2d, r2m
+ HADDW m5, m7
movd r1d, m5
- movd [r2], m5 ; return sum
imul r1d, r1d
HADDD m6, m1
shr r1d, %1
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
- mova m2, m0
- punpcklbw m0, m7
mova m4, m3
+ punpcklbw m0, m7
punpckhbw m1, m7
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
- punpckhbw m4, m7
- psadbw m2, m7
- paddw m5, m2
- mova m2, m3
punpcklbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m0
dec t3d
- psadbw m2, m7
pmaddwd m0, m0
- paddw m5, m2
+ paddw m5, m1
pmaddwd m1, m1
+ paddw m5, m3
paddd m6, m0
pmaddwd m3, m3
+ paddw m5, m4
paddd m6, m1
pmaddwd m4, m4
paddd m6, m3
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
lea r0, [r0+r1*2]
mova m1, m0
punpcklbw m0, m7
- mova m2, m1
punpckhbw m1, m7
dec t3d
+ paddw m5, m0
+ paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
- psadbw m2, m7
- paddw m5, m2
paddd m6, m0
paddd m6, m1
jnz .loop
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that it's always called before the float math. Noinline makes
* sure no reordering goes on. */
- unsigned int var=0, sad, i;
- for( i=0; i<3; i++ )
+ unsigned int var = 0, i;
+ for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
int stride = frame->i_stride[i];
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
+ var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
}
var = X264_MAX(var,1);
x264_emms();
header[slen] = 0;
if (strncmp(header, Y4M_FRAME_MAGIC, slen))
{
- fprintf(stderr, "Bad header magic (%08X <=> %s)\n",
+ fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
*((uint32_t*)header), header);
return -1;
}
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- uint32_t res_c, res_asm; \
- uint32_t sad_c, sad_asm; \
+ int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
- if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ res_c = call_c( pixel_c.var[i], buf1, 16 ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
} \
}