From: Fiona Glaser
Date: Wed, 22 Jul 2009 02:56:21 +0000 (-0700)
Subject: SSSE3 cachesplit workaround for avg2_w16
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d68f3b076acb1674c7cce95aaa2dc62372bbf7f4;p=libx264

SSSE3 cachesplit workaround for avg2_w16

Palignr-based solution for the most commonly used qpel function.
1-1.5% faster overall on Core 2 chips.
---

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 69033b4f..16aa7e73 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -511,6 +511,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
 AVG_CACHELINE_CHECK 16, 64, sse2
 AVG_CACHELINE_CHECK 20, 64, sse2
 
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+    movdqa  xmm1, [r2+16]
+    palignr xmm1, [r2], %1
+    pavgb   xmm1, [r2+r4]
+%else
+    movdqa  xmm1, [r2+16]
+    movdqa  xmm2, [r2+r4+16]
+    palignr xmm1, [r2], %1
+    palignr xmm2, [r2+r4], %2
+    pavgb   xmm1, xmm2
+%endif
+    movdqa  [r0], xmm1
+    add     r2, r3
+    add     r0, r1
+    dec     r5d
+    jg      avg_w16_align%1_%2_ssse3
+    rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+    mov     eax, r2m
+    and     eax, 0x3f
+    cmp     eax, 0x30
+    jle     x264_pixel_avg2_w16_sse2
+    PROLOGUE 6,7
+    lea     r6, [r4+r2]
+    and     r4, ~0xf
+    and     r6, 0x1f
+    and     r2, ~0xf
+    lea     r6, [r6*3]    ;(offset + align*2)*3
+    sub     r4, r2
+    shl     r6, 4         ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+    lea     r11, [avg_w16_addr GLOBAL]
+    add     r6, r11
+%else
+    lea     r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+    jmp     r6
+%else
+    call    r6
+    RET
+%endif
+
+
 ;=============================================================================
 ; pixel copy
 ;=============================================================================
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 1241a232..78bc9638 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -94,6 +94,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
 PIXEL_AVG_WALL(cache64_sse2)
 PIXEL_AVG_WALL(sse2)
 PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
 
 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
 static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -119,6 +120,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
 PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -166,6 +168,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
 
 #define GET_REF(name)\
 static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -199,6 +202,7 @@ GET_REF(cache64_mmxext)
 GET_REF(sse2)
 GET_REF(sse2_misalign)
 GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -344,7 +348,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
     if( cpu&X264_CPU_CACHELINE_64 )
+    {
         pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+        pf->mc_luma = mc_luma_cache64_ssse3;
+        pf->get_ref = get_ref_cache64_ssse3;
+    }
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->integral_init4v = x264_integral_init4v_ssse3;
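
Note (illustration, not part of the patch): palignr takes its shift count as an immediate, so the asm above generates 30 specialized 48-byte loop bodies, one per pair of source misalignments, and the computed jump dispatches into the right one at runtime. Below is a minimal C intrinsics sketch of what a single generated loop variant does for one fixed alignment pair; the function name and the ALIGN1/ALIGN2 macros are hypothetical stand-ins for the %1/%2 macro parameters.

/* Sketch of one avg_w16_align loop variant for a fixed alignment pair.
 * Assumes dst is 16-byte aligned and src1/src2 point to the 16-byte
 * aligned addresses just below the real (misaligned) sources, as set
 * up by the "and rX, ~0xf" in the asm above. */
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

#define ALIGN1 1   /* byte offset of source 1 within its aligned block */
#define ALIGN2 2   /* byte offset of source 2 within its aligned block */

static void avg2_w16_split_sketch( uint8_t *dst, intptr_t i_dst,
                                   const uint8_t *src1, const uint8_t *src2,
                                   intptr_t i_src, int height )
{
    for( int y = 0; y < height; y++ )
    {
        /* Two aligned 16-byte loads per source, recombined with palignr,
         * so no single load ever straddles a 64-byte cache line. */
        __m128i a = _mm_alignr_epi8( _mm_load_si128( (const __m128i*)(src1+16) ),
                                     _mm_load_si128( (const __m128i*)src1 ), ALIGN1 );
        __m128i b = _mm_alignr_epi8( _mm_load_si128( (const __m128i*)(src2+16) ),
                                     _mm_load_si128( (const __m128i*)src2 ), ALIGN2 );
        _mm_store_si128( (__m128i*)dst, _mm_avg_epu8( a, b ) );   /* pavgb */
        src1 += i_src;
        src2 += i_src;
        dst  += i_dst;
    }
}

The function is only reached when the source offset within the 64-byte line is greater than 0x30 (see the cmp/jle fallback to x264_pixel_avg2_w16_sse2), i.e. exactly when a plain 16-byte load would split a cache line; such split loads are notoriously slow on Core 2, which is where the reported 1-1.5% overall gain comes from.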