On modern CPUs movdqu isn't slower than movdqa when used on aligned data, and using the same code in both cases saves cache.
This was already done for the high bit-depth AVX2 implementation, but the aligned version still exists as dead code, so remove it.
}
if( cpu&X264_CPU_AVX )
{
+ INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs don't benefit from an aligned version */
INIT_ADS( _avx );
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( cpu&X264_CPU_AVX )
{
+ INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs don't benefit from an aligned version */
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X1( sad, avx2 )
-DECL_X1( sad, avx2_aligned )
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
INIT_YMM avx2
SAD 16, 16
SAD 16, 8
-INIT_YMM avx2, aligned
-SAD 16, 16
-SAD 16, 8
;=============================================================================
; SAD x3/x4