From: Loren Merritt
Date: Thu, 12 Jun 2008 09:00:23 +0000 (-0600)
Subject: mc_chroma_sse2/ssse3
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d97bcbcbebcbe37d9e36b414a3eb371fdc0f4450;p=libx264

mc_chroma_sse2/ssse3
---

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 21c7b0d8..35d2d76f 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -26,8 +26,8 @@ SECTION_RODATA
-pw_4: times 4 dw 4
-pw_8: times 4 dw 8
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
 pw_32: times 8 dw 32
 pw_64: times 8 dw 64
 sw_64: dd 64
@@ -645,173 +645,293 @@ cglobal x264_prefetch_ref_mmxext, 3,3
 ; chroma MC
 ;=============================================================================

-;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
-;                             uint8_t *src, int i_src_stride,
-;                             int dx, int dy,
-;                             int i_width, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_chroma_mmxext, 0,6,1
+    %define t0d eax
+    %define t0  rax
 %ifdef ARCH_X86_64
-    %define t0 r10d
+    %define t1d r10d
 %else
-    %define t0 r1d
+    %define t1d r1d
 %endif
+
+%macro MC_CHROMA_START 0
     movifnidn r2d, r2m
     movifnidn r3d, r3m
     movifnidn r4d, r4m
     movifnidn r5d, r5m
-    mov  eax, r5d
-    mov  t0,  r4d
-    sar  eax, 3
-    sar  t0,  3
-    imul eax, r3d
-    pxor mm3, mm3
-    add  eax, t0
-    movsxdifnidn rax, eax
-    add  r2, rax            ; src += (dx>>3) + (dy>>3) * src_stride
-    and  r4d, 7             ; dx &= 7
-    je   .mc1d
-    and  r5d, 7             ; dy &= 7
-    je   .mc1d
-
-    movd   mm0, r4d
-    movd   mm1, r5d
-    pshufw mm5, mm0, 0      ; mm5 = dx
-    pshufw mm6, mm1, 0      ; mm6 = dy
-
-    movq  mm4, [pw_8 GLOBAL]
-    movq  mm0, mm4
-    psubw mm4, mm5          ; mm4 = 8-dx
-    psubw mm0, mm6          ; mm0 = 8-dy
-
-    movq   mm7, mm5
-    pmullw mm5, mm0         ; mm5 = dx*(8-dy) =     cB
-    pmullw mm7, mm6         ; mm7 = dx*dy =         cD
-    pmullw mm6, mm4         ; mm6 = (8-dx)*dy =     cC
-    pmullw mm4, mm0         ; mm4 = (8-dx)*(8-dy) = cA
-
-    mov  r4d, r7m
+    mov  t0d, r5d
+    mov  t1d, r4d
+    sar  t0d, 3
+    sar  t1d, 3
+    imul t0d, r3d
+    add  t0d, t1d
+    movsxdifnidn t0, t0d
+    add  r2, t0             ; src += (dx>>3) + (dy>>3) * src_stride
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
+;                             uint8_t *src, int src_stride,
+;                             int dx, int dy,
+;                             int width, int height )
+;-----------------------------------------------------------------------------
+%macro MC_CHROMA 1
+cglobal x264_mc_chroma_%1, 0,6,1
+%if regsize == 16
+    cmp dword r6m, 4
+    jle x264_mc_chroma_mmxext %+ .skip_prologue
+%endif
+.skip_prologue:
+    MC_CHROMA_START
+    pxor m3, m3
+    and  r4d, 7             ; dx &= 7
+    jz   .mc1dy
+    and  r5d, 7             ; dy &= 7
+    jz   .mc1dx
+
+    movd   m5, r4d
+    movd   m6, r5d
+    SPLATW m5, m5           ; m5 = dx
+    SPLATW m6, m6           ; m6 = dy
+
+    mova  m4, [pw_8 GLOBAL]
+    mova  m0, m4
+    psubw m4, m5            ; m4 = 8-dx
+    psubw m0, m6            ; m0 = 8-dy
+
+    mova   m7, m5
+    pmullw m5, m0           ; m5 = dx*(8-dy) =     cB
+    pmullw m7, m6           ; m7 = dx*dy =         cD
+    pmullw m6, m4           ; m6 = (8-dx)*dy =     cC
+    pmullw m4, m0           ; m4 = (8-dx)*(8-dy) = cA
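; The four weights are the standard bilinear ones: with dx,dy in 0..7,
; cA+cB+cC+cD = (8-dx)*(8-dy) + dx*(8-dy) + (8-dx)*dy + dx*dy = 64,
; which is why the 2D loops below add pw_32 (half of 64) before shifting
; right by 6 to get a rounded 8-bit result. The 1D paths use weights
; summing to 8, hence pw_4 and a shift by 3 there.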
+
+    mov  r4d, r7m
 %ifdef ARCH_X86_64
     mov  r10, r0
     mov  r11, r2
 %else
     mov  r0, r0m
     mov  r1, r1m
     mov  r5, r2
 %endif
-ALIGN 4
-.height_loop:
-
-    movd mm1, [r2+r3]
-    movd mm0, [r2]
-    punpcklbw mm1, mm3      ; 00 px1 | 00 px2 | 00 px3 | 00 px4
-    punpcklbw mm0, mm3
-    pmullw mm1, mm6         ; 2nd line * cC
-    pmullw mm0, mm4         ; 1st line * cA
-    paddw  mm0, mm1         ; mm0 <- result
-
-    movd mm2, [r2+1]
-    movd mm1, [r2+r3+1]
-    punpcklbw mm2, mm3
-    punpcklbw mm1, mm3
-
-    paddw mm0, [pw_32 GLOBAL]
-
-    pmullw mm2, mm5         ; line * cB
-    pmullw mm1, mm7         ; line * cD
-    paddw  mm0, mm2
-    paddw  mm0, mm1
-    psrlw  mm0, 6
-
-    packuswb mm0, mm3       ; 00 00 00 00 px1 px2 px3 px4
-    movd [r0], mm0
-
-    add  r2, r3
-    add  r0, r1             ; i_dst_stride
-    dec  r4d
-    jnz  .height_loop
-
+.loop2d:
+    movh m1, [r2+r3]
+    movh m0, [r2]
+    punpcklbw m1, m3        ; 00 px1 | 00 px2 | 00 px3 | 00 px4
+    punpcklbw m0, m3
+    pmullw m1, m6           ; 2nd line * cC
+    pmullw m0, m4           ; 1st line * cA
+    paddw  m0, m1           ; m0 <- result
+
+    movh m2, [r2+1]
+    movh m1, [r2+r3+1]
+    punpcklbw m2, m3
+    punpcklbw m1, m3
+
+    paddw m0, [pw_32 GLOBAL]
+
+    pmullw m2, m5           ; line * cB
+    pmullw m1, m7           ; line * cD
+    paddw  m0, m2
+    paddw  m0, m1
+    psrlw  m0, 6
+
+    packuswb m0, m3         ; 00 00 00 00 px1 px2 px3 px4
+    movh [r0], m0
+
+    add  r2, r3
+    add  r0, r1             ; dst_stride
+    dec  r4d
+    jnz  .loop2d
+
+%if regsize == 8
     sub  dword r6m, 8
     jnz  .finish            ; width != 8 so assume 4
-
 %ifdef ARCH_X86_64
     lea  r0, [r10+4]        ; dst
     lea  r2, [r11+4]        ; src
 %else
     mov  r0, r0m
     lea  r2, [r5+4]
     add  r0, 4
 %endif
-    mov  r4d, r7m           ; i_height
-    jmp  .height_loop
-
-ALIGN 4
-.mc1d:
-    mov   eax, r4d
-    or    eax, r5d
-    and   eax, 7
-    cmp   r4d, 0
+    mov  r4d, r7m           ; height
+    jmp  .loop2d
+%else
+    REP_RET
+%endif ; regsize
+
+.mc1dy:
+    and  r5d, 7
+    movd m6, r5d
+    mov  r5, r3             ; pel_offset = dx ? 1 : src_stride
+    jmp  .mc1d
+.mc1dx:
+    movd m6, r4d
     mov   r5d, 1
-    cmove r5, r3            ; pel_offset = dx ? 1 : src_stride
-    movd   mm6, eax
-    movq   mm5, [pw_8 GLOBAL]
-    pshufw mm6, mm6, 0
-    movq   mm7, [pw_4 GLOBAL]
-    psubw  mm5, mm6
-
-    cmp dword r6m, 8
+.mc1d:
+    mova   m5, [pw_8 GLOBAL]
+    SPLATW m6, m6
+    mova   m7, [pw_4 GLOBAL]
+    psubw  m5, m6
     movifnidn r0d, r0m
     movifnidn r1d, r1m
     mov  r4d, r7m
-    je   .height_loop1_w8
-
-ALIGN 4
-.height_loop1_w4:
-    movd mm0, [r2+r5]
-    movd mm1, [r2]
-    punpcklbw mm0, mm3
-    punpcklbw mm1, mm3
-    pmullw mm0, mm6
-    pmullw mm1, mm5
-    paddw  mm0, mm7
-    paddw  mm0, mm1
-    psrlw  mm0, 3
-    packuswb mm0, mm3
-    movd [r0], mm0
-    add  r2, r3
-    add  r0, r1
-    dec  r4d
-    jnz  .height_loop1_w4
+%if regsize == 8
+    cmp  dword r6m, 8
+    je   .loop1d_w8
+%endif
+
+.loop1d_w4:
+    movh m0, [r2+r5]
+    movh m1, [r2]
+    punpcklbw m0, m3
+    punpcklbw m1, m3
+    pmullw m0, m6
+    pmullw m1, m5
+    paddw  m0, m7
+    paddw  m0, m1
+    psrlw  m0, 3
+    packuswb m0, m3
+    movh [r0], m0
+    add  r2, r3
+    add  r0, r1
+    dec  r4d
+    jnz  .loop1d_w4
 .finish:
     REP_RET
-ALIGN 4
-.height_loop1_w8:
-    movq mm0, [r2+r5]
-    movq mm1, [r2]
-    movq mm2, mm0
-    movq mm4, mm1
-    punpcklbw mm0, mm3
-    punpcklbw mm1, mm3
-    punpckhbw mm2, mm3
-    punpckhbw mm4, mm3
-    pmullw mm0, mm6
-    pmullw mm1, mm5
-    pmullw mm2, mm6
-    pmullw mm4, mm5
-    paddw  mm0, mm7
-    paddw  mm2, mm7
-    paddw  mm0, mm1
-    paddw  mm2, mm4
-    psrlw  mm0, 3
-    psrlw  mm2, 3
-    packuswb mm0, mm2
-    movq [r0], mm0
-    add  r2, r3
-    add  r0, r1
-    dec  r4d
-    jnz  .height_loop1_w8
+%if regsize == 8
+.loop1d_w8:
+    movu m0, [r2+r5]
+    mova m1, [r2]
+    mova m2, m0
+    mova m4, m1
+    punpcklbw m0, m3
+    punpcklbw m1, m3
+    punpckhbw m2, m3
+    punpckhbw m4, m3
+    pmullw m0, m6
+    pmullw m1, m5
+    pmullw m2, m6
+    pmullw m4, m5
+    paddw  m0, m7
+    paddw  m2, m7
+    paddw  m0, m1
+    paddw  m2, m4
+    psrlw  m0, 3
+    psrlw  m2, 3
+    packuswb m0, m2
+    mova [r0], m0
+    add  r2, r3
+    add  r0, r1
+    dec  r4d
+    jnz  .loop1d_w8
+    REP_RET
+%endif ; regsize
+%endmacro ; MC_CHROMA
+
+INIT_MMX
+MC_CHROMA mmxext
+INIT_XMM
+MC_CHROMA sse2
+
+INIT_MMX
+cglobal x264_mc_chroma_ssse3, 0,6,1
+    MC_CHROMA_START
+    and  r4d, 7
+    and  r5d, 7
+    mov  t0d, r4d
+    shl  t0d, 8
+    sub  t0d, r4d
+    mov  r4d, 8
+    add  t0d, 8
+    sub  r4d, r5d
+    imul r5d, t0d           ; (x*255+8)*y
+    imul r4d, t0d           ; (x*255+8)*(8-y)
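; t0d = x*255+8 is the byte pair (8-x, x) read as a little-endian word:
; (8-x) + (x<<8) = x*255+8. Scaling it by y and by 8-y yields all four
; bilinear weights packed two per word (each product fits in a byte),
; so each pmaddubsw below multiplies interleaved pixel pairs by a
; (left, right) weight pair and sums them in one instruction, replacing
; the separate pmullw/paddw chains of the mmxext/sse2 versions.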
+    cmp  dword r6m, 4
+    jg   .width8
+    mova m5, [pw_32 GLOBAL]
+    movd m6, r5d
+    movd m7, r4d
+    movifnidn r0d, r0m
+    movifnidn r1d, r1m
+    movifnidn r4d, r7m
+    SPLATW m6, m6
+    SPLATW m7, m7
+    movh m0, [r2]
+    punpcklbw m0, [r2+1]
+    add  r2, r3
+.loop4:
+    movh m1, [r2]
+    movh m3, [r2+r3]
+    punpcklbw m1, [r2+1]
+    punpcklbw m3, [r2+r3+1]
+    lea  r2, [r2+2*r3]
+    mova m2, m1
+    mova m4, m3
+    pmaddubsw m0, m7
+    pmaddubsw m1, m6
+    pmaddubsw m2, m7
+    pmaddubsw m3, m6
+    paddw m0, m5
+    paddw m2, m5
+    paddw m1, m0
+    paddw m3, m2
+    mova  m0, m4
+    psrlw m1, 6
+    psrlw m3, 6
+    packuswb m1, m1
+    packuswb m3, m3
+    movh [r0], m1
+    movh [r0+r1], m3
+    sub  r4d, 2
+    lea  r0, [r0+2*r1]
+    jg   .loop4
+    REP_RET
+
+INIT_XMM
+.width8:
+    mova m5, [pw_32 GLOBAL]
+    movd m6, r5d
+    movd m7, r4d
+    movifnidn r0d, r0m
+    movifnidn r1d, r1m
+    movifnidn r4d, r7m
+    SPLATW m6, m6
+    SPLATW m7, m7
+    movh m0, [r2]
+    movh m1, [r2+1]
+    punpcklbw m0, m1
+    add  r2, r3
+.loop8:
+    movh m1, [r2]
+    movh m2, [r2+1]
+    movh m3, [r2+r3]
+    movh m4, [r2+r3+1]
+    punpcklbw m1, m2
+    punpcklbw m3, m4
+    lea  r2, [r2+2*r3]
+    mova m2, m1
+    mova m4, m3
+    pmaddubsw m0, m7
+    pmaddubsw m1, m6
+    pmaddubsw m2, m7
+    pmaddubsw m3, m6
+    paddw m0, m5
+    paddw m2, m5
+    paddw m1, m0
+    paddw m3, m2
+    mova  m0, m4
+    psrlw m1, 6
+    psrlw m3, 6
+    packuswb m1, m3
+    movh   [r0], m1
+    movhps [r0+r1], m1
+    sub  r4d, 2
+    lea  r0, [r0+2*r1]
+    jg   .loop8
     REP_RET
+; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size

diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 45516065..658cccfb 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -54,6 +54,12 @@ extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
 extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
                                    uint8_t *dst, int i_dst_stride,
                                    int dx, int dy, int i_width, int i_height );
+extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
+                                 uint8_t *dst, int i_dst_stride,
+                                 int dx, int dy, int i_width, int i_height );
+extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
+                                  uint8_t *dst, int i_dst_stride,
+                                  int dx, int dy, int i_width, int i_height );
 extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
 extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
 extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
@@ -299,6 +305,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
         pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
         pf->hpel_filter = x264_hpel_filter_sse2;
+        pf->mc_chroma = x264_mc_chroma_sse2;

         if( cpu&X264_CPU_SSE2_IS_FAST )
         {
@@ -315,4 +322,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;

     pf->hpel_filter = x264_hpel_filter_ssse3;
+    pf->mc_chroma = x264_mc_chroma_ssse3;
 }
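For reference, all three routines compute the same bilinear chroma
interpolation; the sketch below is a plain-C model of it under the same
argument convention as the asm (dx/dy in 1/8-pel units, integer part
selecting the source block). The function name mc_chroma_ref is
illustrative only, not an x264 symbol; x264's own scalar fallback lives
in common/mc.c.

    #include <stdint.h>

    /* Illustrative scalar model of mc_chroma_{mmxext,sse2,ssse3}. */
    static void mc_chroma_ref( uint8_t *dst, int dst_stride,
                               uint8_t *src, int src_stride,
                               int dx, int dy, int width, int height )
    {
        src += (dy >> 3) * src_stride + (dx >> 3);
        dx &= 7;
        dy &= 7;
        int cA = (8-dx)*(8-dy);   /* the four weights sum to 64 */
        int cB =    dx *(8-dy);
        int cC = (8-dx)*   dy;
        int cD =    dx *   dy;
        for( int y = 0; y < height; y++ )
        {
            for( int x = 0; x < width; x++ )
                dst[x] = ( cA*src[x]            + cB*src[x+1] +
                           cC*src[x+src_stride] + cD*src[x+src_stride+1]
                           + 32 ) >> 6;          /* +32 rounds the >>6 */
            dst += dst_stride;
            src += src_stride;
        }
    }

When dx or dy is 0 the expression degenerates to a two-tap filter with
weights summing to 8, which is what the .mc1dy/.mc1dx paths exploit
(+4, >>3); the ssse3 version instead folds the weights into packed
bytes so pmaddubsw can do the multiply-accumulate.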