AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+ movdqa xmm1, [r2+16]
+ palignr xmm1, [r2], %1
+ pavgb xmm1, [r2+r4]
+%else
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r4+16]
+ palignr xmm1, [r2], %1
+ palignr xmm2, [r2+r4], %2
+ pavgb xmm1, xmm2
+%endif
+ movdqa [r0], xmm1
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg avg_w16_align%1_%2_ssse3
+ rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+ mov eax, r2m
+ and eax, 0x3f
+ cmp eax, 0x30
+ jle x264_pixel_avg2_w16_sse2
+ PROLOGUE 6,7
+ lea r6, [r4+r2]
+ and r4, ~0xf
+ and r6, 0x1f
+ and r2, ~0xf
+ lea r6, [r6*3] ;(offset + align*2)*3
+ sub r4, r2
+ shl r6, 4 ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+ lea r11, [avg_w16_addr GLOBAL]
+ add r6, r11
+%else
+ lea r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+ jmp r6
+%else
+ call r6
+ RET
+%endif
+
+
;=============================================================================
; pixel copy
;=============================================================================
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
#define GET_REF(name)\
static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
+ {
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+ pf->mc_luma = mc_luma_cache64_ssse3;
+ pf->get_ref = get_ref_cache64_ssse3;
+ }
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->integral_init4v = x264_integral_init4v_ssse3;