From 78798908acafad0ec536bcf6a81a95f50f5461a4 Mon Sep 17 00:00:00 2001 From: Holger Lubitz Date: Sat, 20 Sep 2008 02:36:55 -0700 Subject: [PATCH] Merging Holger's GSOC branch part 1: hpel_filter speedups --- common/x86/mc-a2.asm | 281 +++++++++++++++++++++++++++++++++++++------ common/x86/mc-c.c | 7 +- 2 files changed, 248 insertions(+), 40 deletions(-) diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index d2e88354..845806d1 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -32,12 +32,25 @@ pw_32: times 8 dw 32 SECTION .text -%macro LOAD_ADD 3 +%macro LOAD_ADD 4 + movh %4, %3 movh %1, %2 - movh m7, %3 + punpcklbw %4, m0 punpcklbw %1, m0 - punpcklbw m7, m0 - paddw %1, m7 + paddw %1, %4 +%endmacro + +%macro LOAD_ADD_2 6 + mova %5, %3 + mova %1, %4 + mova %6, %5 + mova %2, %1 + punpcklbw %5, m0 + punpcklbw %1, m0 + punpckhbw %6, m0 + punpckhbw %2, m0 + paddw %1, %5 + paddw %2, %6 %endmacro %macro FILT_V2 0 @@ -64,27 +77,27 @@ SECTION .text paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 %endmacro -%macro FILT_H2 0 - psubw m1, m2 - psubw m4, m5 - psraw m1, 2 - psraw m4, 2 - psubw m1, m2 - psubw m4, m5 - paddw m1, m3 - paddw m4, m6 - psraw m1, 2 - psraw m4, 2 - paddw m1, m3 - paddw m4, m6 +%macro FILT_H2 6 + psubw %1, %2 + psubw %4, %5 + psraw %1, 2 + psraw %4, 2 + psubw %1, %2 + psubw %4, %5 + paddw %1, %3 + paddw %4, %6 + psraw %1, 2 + psraw %4, 2 + paddw %1, %3 + paddw %4, %6 %endmacro -%macro FILT_PACK 1 - paddw m1, m7 - paddw m4, m7 - psraw m1, %1 - psraw m4, %1 - packuswb m1, m4 +%macro FILT_PACK 3 + paddw %1, m7 + paddw %2, m7 + psraw %1, %3 + psraw %2, %3 + packuswb %1, %2 %endmacro %macro PALIGNR_MMX 4 @@ -120,13 +133,10 @@ cglobal x264_hpel_filter_v_%1, 5,6 neg r4 pxor m0, m0 .loop: - prefetcht0 [r5+r3*2+64] - LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0 - LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0 - LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0 - LOAD_ADD m4, [r1 +mmsize/2], [r5+r3*2+mmsize/2] ; a1 - LOAD_ADD m5, [r1+r3 +mmsize/2], [r5+r3 +mmsize/2] ; b1 - LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5 +mmsize/2] ; c1 + LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 + LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 + LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 + LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 mova m7, [pw_16 GLOBAL] mova [r2+r4*2], m1 @@ -136,7 +146,7 @@ cglobal x264_hpel_filter_v_%1, 5,6 psraw m1, 5 psraw m4, 5 packuswb m1, m4 - movnt [r0+r4], m1 + mova [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize @@ -167,8 +177,8 @@ cglobal x264_hpel_filter_c_mmxext, 3,3 paddw m4, [src+14] ; a1 paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 - FILT_H2 - FILT_PACK 6 + FILT_H2 m1, m2, m3, m4, m5, m6 + FILT_PACK m1, m4, 6 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -211,8 +221,8 @@ cglobal x264_hpel_filter_h_mmxext, 3,3 punpcklbw m6, m0 paddw m6, m7 ; a1 movq m7, [pw_1 GLOBAL] - FILT_H2 - FILT_PACK 1 + FILT_H2 m1, m2, m3, m4, m5, m6 + FILT_PACK m1, m4, 1 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -305,24 +315,217 @@ cglobal x264_hpel_filter_h_sse2, 3,3 punpcklbw m7, m0 paddw m6, m7 ; c1 mova m7, [pw_1 GLOBAL] ; FIXME xmm8 - FILT_H2 - FILT_PACK 1 + FILT_H2 m1, m2, m3, m4, m5, m6 + FILT_PACK m1, m4, 1 movntdq [r0+r2], m1 add r2, 16 jl .loop REP_RET +;----------------------------------------------------------------------------- +; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width ); +;----------------------------------------------------------------------------- +cglobal x264_hpel_filter_h_ssse3, 3,3 + add r0, r2 + add r1, r2 + neg r2 + %define src 
r1+r2 + pxor m0, m0 + movh m1, [src-8] + punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8 + movh m2, [src] + punpcklbw m2, m0 + mova m7, [pw_1 GLOBAL] +.loop: + movh m3, [src+8] + punpcklbw m3, m0 + + mova m4, m2 + palignr m2, m1, 14 + mova m5, m3 + palignr m3, m4, 4 + paddw m3, m2 + + mova m2, m4 + palignr m4, m1, 12 + mova m1, m5 + palignr m5, m2, 6 + paddw m5, m4 + + mova m4, m1 + palignr m1, m2, 2 + paddw m1, m2 + + FILT_H m5, m3, m1 + + movh m1, [src+16] + punpcklbw m1, m0 + + mova m3, m4 + palignr m4, m2, 14 + mova m6, m1 + palignr m1, m3, 4 + paddw m1, m4 + + mova m4, m3 + palignr m3, m2, 12 + mova m2, m6 + palignr m6, m4, 6 + paddw m6, m3 + + mova m3, m2 + palignr m2, m4, 2 + paddw m2, m4 + + FILT_H m6, m1, m2 + FILT_PACK m5, m6, 1 + movdqa [r0+r2], m5 + + add r2, 16 + mova m2, m3 + mova m1, m4 + + jl .loop + REP_RET + + %define PALIGNR PALIGNR_MMX HPEL_V sse2 HPEL_C sse2 %define PALIGNR PALIGNR_SSSE3 HPEL_C ssse3 -cglobal x264_sfence +%ifdef ARCH_X86_64 + +%macro DO_FILT_V 5 + LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1 + LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1 + LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1 + FILT_V2 + mova %1, m1 + mova %2, m4 + paddw m1, m15 + paddw m4, m15 + add r3, 16 + add r1, 16 + psraw m1, 5 + psraw m4, 5 + packuswb m1, m4 + movntps [r11+r4+%5], m1 +%endmacro + +%macro DO_FILT_H 4 + mova m1, %2 + PALIGNR m1, %1, 12, m4 + mova m2, %2 + PALIGNR m2, %1, 14, m4 + mova %1, %3 + PALIGNR %3, %2, 6, m4 + mova m3, %1 + PALIGNR m3, %2, 4, m4 + mova m4, %1 + paddw %3, m1 + PALIGNR m4, %2, 2, m1 + paddw m3, m2 + paddw m4, %2 + FILT_H %3, m3, m4 + paddw %3, m15 + psraw %3, %4 +%endmacro + +%macro DO_FILT_CC 4 + DO_FILT_H %1, %2, %3, 6 + DO_FILT_H %2, %1, %4, 6 + packuswb %3, %4 + movntps [r5+r4], %3 +%endmacro + +%macro DO_FILT_HH 4 + DO_FILT_H %1, %2, %3, 1 + DO_FILT_H %2, %1, %4, 1 + packuswb %3, %4 + movntps [r0+r4], %3 +%endmacro + +%macro DO_FILT_H2 6 + DO_FILT_H %1, %2, %3, 6 + psrlw m15, 5 + DO_FILT_H %4, %5, %6, 1 + packuswb %6, %3 +%endmacro + +%macro HPEL 1 +;----------------------------------------------------------------------------- +; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, +; uint8_t *src, int stride, int width, int height) +;----------------------------------------------------------------------------- +cglobal x264_hpel_filter_%1, 7,7 + mov r10, r3 + sub r5, 16 + mov r11, r1 + and r10, 15 + sub r3, r10 + add r0, r5 + add r11, r5 + add r10, r5 + add r5, r2 + mov r2, r4 + neg r10 + lea r1, [r3+r2] + sub r3, r2 + sub r3, r2 + mov r4, r10 + pxor m0, m0 + pcmpeqw m15, m15 + psrlw m15, 15 ; pw_1 + psllw m15, 4 +;ALIGN 16 +.loopy: +; first filter_v +; prefetching does not help here! 
lots of variants tested, all slower + DO_FILT_V m8, m7, m13, m12, 0 +;ALIGN 16 +.loopx: + DO_FILT_V m6, m5, m11, m10, 16 +.lastx: + paddw m15, m15 + DO_FILT_CC m9, m8, m7, m6 + movdqa m7, m12 ; not really necessary, but seems free and + movdqa m6, m11 ; gives far shorter code + psrlw m15, 5 + DO_FILT_HH m14, m13, m7, m6 + psllw m15, 4 ; pw_16 + movdqa m7, m5 + movdqa m12, m10 + add r4, 16 + jl .loopx + cmp r4, 16 + jl .lastx +; setup regs for next y + sub r4, r10 + sub r4, r2 + sub r1, r4 + sub r3, r4 + add r0, r2 + add r11, r2 + add r5, r2 + mov r4, r10 + sub r6d, 1 + jg .loopy sfence - ret + RET +%endmacro + +%define PALIGNR PALIGNR_MMX +HPEL sse2 +%define PALIGNR PALIGNR_SSSE3 +HPEL ssse3 +%endif +cglobal x264_sfence + sfence + ret ;----------------------------------------------------------------------------- ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 29dd888e..ec32ad8f 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -219,8 +219,13 @@ static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, HPEL(8, mmxext, mmxext, mmxext, mmxext) HPEL(16, sse2_amd, mmxext, mmxext, sse2) +#ifdef ARCH_X86_64 +void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height ); +void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height ); +#else HPEL(16, sse2, sse2, sse2, sse2) -HPEL(16, ssse3, sse2, ssse3, sse2) +HPEL(16, ssse3, sse2, ssse3, ssse3) +#endif void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { -- 2.40.0
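
The parameterized FILT_H/FILT_H2 macros in this patch evaluate the H.264 6-tap half-pel kernel (1, -5, 20, 20, -5, 1) without any multiplies: per 16-bit lane they compute ((a-b)>>2 - b + c)>>2 + c, which equals (a - 5*b + 20*c)/16 up to the truncation of the two arithmetic shifts. A minimal C sketch of that per-lane computation (filt_h and the sample values are illustrative only, not part of the patch):

#include <stdio.h>

/* Shift-only model of FILT_H, approximating (a - 5*b + 20*c)/16. */
static int filt_h( int a, int b, int c )
{
    int t = a - b;      /* a-b */
    t >>= 2;            /* (a-b)/4 */
    t = t - b + c;      /* (a-b)/4 - b + c */
    t >>= 2;            /* ((a-b)/4 - b + c)/4 */
    return t + c;       /* ~ (a - 5*b + 20*c)/16 */
}

int main( void )
{
    int a = 40, b = 70, c = 200; /* arbitrary sample taps */
    printf( "shift form: %d  exact: %d\n", filt_h( a, b, c ), (a - 5*b + 20*c)/16 );
    return 0;
}

FILT_PACK then adds the rounding constant held in m7 (pw_1, pw_16 or pw_32 depending on the caller), applies the remaining right shift and packs two such word vectors back to bytes with unsigned saturation.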
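
x264_hpel_filter_h_ssse3 and the DO_FILT_H macro build the shifted neighbour vectors for the horizontal taps with palignr rather than with unaligned loads: two adjacent aligned registers are concatenated and a 16-byte window is extracted at a byte offset. A scalar model of the operation (palignr_model is an illustrative name, not from the patch):

#include <stdint.h>
#include <string.h>

/* Model of "palignr hi, lo, n": concatenate lo (bytes 0-15) and hi
 * (bytes 16-31), then copy the 16 bytes starting at offset n. */
static void palignr_model( uint8_t dst[16], const uint8_t hi[16],
                           const uint8_t lo[16], int n )
{
    uint8_t tmp[32];
    memcpy( tmp,      lo, 16 );
    memcpy( tmp + 16, hi, 16 );
    memcpy( dst, tmp + n, 16 );
}

With the pixels already widened to 16-bit words, byte offsets 12, 14, 2, 4 and 6 against the previous/current/next blocks yield the x-2, x-1, x+1, x+2 and x+3 tap vectors, so the inner loop never issues an unaligned load.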
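
The new x86_64-only x264_hpel_filter_sse2/ssse3 entry points produce all three half-pel planes in a single pass over each row: the 16-bit vertical sums are kept in the extra xmm registers (m8-m15), packed with rounding for dstv, re-filtered horizontally for dstc, while dsth comes from a horizontal pass over the unpacked source pixels. On 32-bit builds the HPEL() wrapper in mc-c.c still composes the separate _v, _c and _h kernels instead. A rough plain-C model of the decomposition (hpel_filter_ref, TAP and clip255 are illustrative names; edge handling and the staged truncation of the SIMD shifts are ignored):

#include <stdint.h>

#define TAP( p, d ) ( (p)[-2*(d)] - 5*(p)[-(d)] + 20*(p)[0] + 20*(p)[(d)] - 5*(p)[2*(d)] + (p)[3*(d)] )

static uint8_t clip255( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

static void hpel_filter_ref( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             uint8_t *src, int stride, int width, int height,
                             int16_t *buf )
{
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < width; x++ )
        {
            int v = TAP( &src[x], stride );       /* vertical 6-tap */
            buf[x]  = v;                          /* 16-bit intermediate */
            dstv[x] = clip255( (v + 16) >> 5 );
        }
        for( int x = 0; x < width; x++ )          /* horizontal 6-tap on src */
            dsth[x] = clip255( (TAP( &src[x], 1 ) + 16) >> 5 );
        for( int x = 0; x < width; x++ )          /* horizontal 6-tap on the vertical sums */
            dstc[x] = clip255( (TAP( &buf[x], 1 ) + 512) >> 10 );
        dsth += stride; dstv += stride; dstc += stride; src += stride;
    }
}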