From 8e3212863cd22b2c6f71acd61d575b7b25a7f1c1 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Tue, 4 Jan 2011 15:27:38 -0500 Subject: [PATCH] SSE2 high bit depth SSIM functions Patch from Google Code-In. --- common/pixel.c | 2 ++ common/x86/pixel-a.asm | 41 +++++++++++++++++++++++++++++++++++++++-- common/x86/pixel.h | 4 ++-- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/common/pixel.c b/common/pixel.c index 8c3c1963..19aa9712 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -793,6 +793,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; + pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 1b811eb0..2efbd8ca 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -34,8 +34,18 @@ SECTION_RODATA 32 mask_ff: times 16 db 0xff times 16 db 0 -ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 -ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +%if BIT_DEPTH == 10 +ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 +ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 +pf_64: times 4 dd 64.0 +pf_128: times 4 dd 128.0 +%elif BIT_DEPTH == 9 +ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 +ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 +%else ; 8-bit +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +%endif mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 @@ -2461,10 +2471,15 @@ HADAMARD_AC_SSE2 sse4 ;----------------------------------------------------------------------------- %macro SSIM_ITER 1 +%ifdef HIGH_BIT_DEPTH + movdqu m5, [r0+(%1&1)*r1] + movdqu m6, [r2+(%1&1)*r3] +%else movq m5, [r0+(%1&1)*r1] movq m6, [r2+(%1&1)*r3] punpcklbw m5, m0 punpcklbw m6, m0 +%endif %if %1==1 lea r0, [r0+r1*2] lea r2, [r2+r3*2] @@ -2491,6 +2506,7 @@ HADAMARD_AC_SSE2 sse4 %endmacro cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8 + FIX_STRIDES r1, r3 pxor m0, m0 SSIM_ITER 0 SSIM_ITER 1 @@ -2548,6 +2564,26 @@ cglobal pixel_ssim_end4_sse2, 3,3,7 TRANSPOSE4x4D 0, 1, 2, 3, 4 ; s1=m0, s2=m1, ss=m2, s12=m3 +%if BIT_DEPTH == 10 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m2, [pf_64] ; ss*64 + mulps m3, [pf_128] ; s12*128 + movdqa m4, m1 + mulps m4, m0 ; s1*s2 + mulps m1, m1 ; s2*s2 + mulps m0, m0 ; s1*s1 + addps m4, m4 ; s1*s2*2 + addps m0, m1 ; s1*s1 + s2*s2 + subps m2, m0 ; vars + subps m3, m4 ; covar*2 + addps m4, m5 ; s1*s2*2 + ssim_c1 + addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1 + addps m2, m6 ; vars + ssim_c2 + addps m3, m6 ; covar*2 + ssim_c2 +%else movdqa m4, m1 pslld m1, 16 pmaddwd m4, m0 ; s1*s2 @@ -2566,6 +2602,7 @@ cglobal pixel_ssim_end4_sse2, 3,3,7 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) +%endif mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 50c9faf1..cfd00685 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -117,8 +117,8 @@ void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1, int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); -void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, - const uint8_t *pix2, int stride2, int sums[2][4] ); +void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1, + const pixel *pix2, int stride2, int sums[2][4] ); float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * ); int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * ); -- 2.40.0