%undef sfence
%endif ; !HIGH_BIT_DEPTH
+%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
+ %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
+ %rep (%2+63) / 64 ; assume 64-byte cache lines
+ prefetchnta [%1+%%i]
+ %assign %%i %%i + 64
+ %endrep
+%endmacro
+
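For readers who don't speak NASM macros, here is roughly what one PREFETCHNT_ITER invocation does, expressed with SSE intrinsics (a sketch under the same assumptions; prefetchnt_iter is a hypothetical name, not part of x264):

    #include <xmmintrin.h>

    /* Sketch of one PREFETCHNT_ITER expansion: one non-temporal prefetch
     * per 64-byte cache line, targeting the data the copy loop will read
     * 4 iterations from now. */
    static inline void prefetchnt_iter( const char *src, int bytes_per_iter )
    {
        const char *ahead = src + 4*bytes_per_iter; /* 4 iterations ahead */
        for( int i = 0; i < bytes_per_iter; i += 64 )
            _mm_prefetch( ahead + i, _MM_HINT_NTA );
    }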
;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
-; assumes i_dst and w are multiples of 16, and i_dst>w
-INIT_MMX
-cglobal plane_copy_core_mmx2, 6,7
+; assumes i_dst and w are multiples of mmsize, and i_dst>w
+%macro PLANE_COPY_CORE 0
+cglobal plane_copy_core, 6,7
FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0
movsxdifnidn r4, r4d
%endif
- sub r1, r4
- sub r3, r4
+ add r0, r4
+ add r2, r4
+ neg r4
.loopy:
- lea r6d, [r4-63]
+ lea r6, [r4+4*mmsize]
.loopx:
- prefetchnta [r2+256]
- movq m0, [r2 ]
- movq m1, [r2+ 8]
- movntq [r0 ], m0
- movntq [r0+ 8], m1
- movq m2, [r2+16]
- movq m3, [r2+24]
- movntq [r0+16], m2
- movntq [r0+24], m3
- movq m4, [r2+32]
- movq m5, [r2+40]
- movntq [r0+32], m4
- movntq [r0+40], m5
- movq m6, [r2+48]
- movq m7, [r2+56]
- movntq [r0+48], m6
- movntq [r0+56], m7
- add r2, 64
- add r0, 64
- sub r6d, 64
- jg .loopx
- prefetchnta [r2+256]
- add r6d, 63
- jle .end16
-.loop16:
- movq m0, [r2 ]
- movq m1, [r2+8]
- movntq [r0 ], m0
- movntq [r0+8], m1
- add r2, 16
- add r0, 16
- sub r6d, 16
- jg .loop16
-.end16:
+ PREFETCHNT_ITER r2+r6, 4*mmsize
+ movu m0, [r2+r6-4*mmsize]
+ movu m1, [r2+r6-3*mmsize]
+ movu m2, [r2+r6-2*mmsize]
+ movu m3, [r2+r6-1*mmsize]
+ movnta [r0+r6-4*mmsize], m0
+ movnta [r0+r6-3*mmsize], m1
+ movnta [r0+r6-2*mmsize], m2
+ movnta [r0+r6-1*mmsize], m3
+ add r6, 4*mmsize
+ jle .loopx
+ PREFETCHNT_ITER r2+r6, 4*mmsize
+ sub r6, 4*mmsize
+ jz .end
+.loop_end:
+ movu m0, [r2+r6]
+ movnta [r0+r6], m0
+ add r6, mmsize
+ jl .loop_end
+.end:
add r0, r1
add r2, r3
- dec r5d
+ dec r5d
jg .loopy
sfence
- emms
RET
+%endmacro
+INIT_XMM sse
+PLANE_COPY_CORE
+INIT_YMM avx
+PLANE_COPY_CORE
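The rewritten loop replaces the unrolled MMX version with the usual negative-offset idiom: r0 and r2 are advanced to the end of each row, r6 counts up from -width toward zero, and the loop condition falls out of the add for free. A C analog of that control flow (illustrative only; plane_copy_row is a made-up name, and w is assumed to be a multiple of mmsize, as the comment above requires):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative analog of the loop structure above: index from the end
     * of the row with a negative offset that rises to zero. Each 16-byte
     * memcpy stands in for one movu/movnta pair (mmsize == 16 for SSE). */
    static void plane_copy_row( uint8_t *dst, const uint8_t *src, intptr_t w )
    {
        dst += w;
        src += w;
        for( intptr_t x = -w; x < 0; x += 16 )
            memcpy( dst + x, src + x, 16 );
    }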
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%if HIGH_BIT_DEPTH
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
-void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
#endif
#endif // HIGH_BIT_DEPTH
-static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
-{
- int c_w = 16/sizeof(pixel) - 1;
- if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
- x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
- } else if( !(w&c_w) ) {
- x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h );
- } else if( i_src > 0 ) {
- // have to use plain memcpy on the last line (in memory order) to avoid overreading src
- x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
- memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
- } else {
- memcpy( dst, src, w*sizeof(pixel) );
- x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
- }
+#define PLANE_COPY(align, cpu)\
+static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
+{\
+ int c_w = (align) / sizeof(pixel) - 1;\
+ if( w < 256 ) /* tiny resolutions don't want non-temporal hints; the exact threshold is unknown. */\
+ x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
+ else if( !(w&c_w) )\
+ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
+ else\
+ {\
+ if( --h > 0 )\
+ {\
+ if( i_src > 0 )\
+ {\
+ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
+ dst += i_dst * h;\
+ src += i_src * h;\
+ }\
+ else\
+ x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
+ }\
+ /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
+ memcpy( dst, src, w*sizeof(pixel) );\
+ }\
}
+PLANE_COPY(16, sse)
+PLANE_COPY(32, avx)
+
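Concrete numbers make the tail logic easier to follow. With 8-bit pixels and the SSE variant (align 16), c_w is 15 and (w+c_w)&~c_w rounds the width up to the next multiple of 16, so the SIMD core may overread up to 15 bytes of src per row; that is harmless while another row follows it in memory, which is why only the last line (in memory order) falls back to memcpy. A tiny standalone check of the rounding (hypothetical widths):

    #include <stdio.h>

    int main( void )
    {
        int c_w = 16 - 1; /* align 16, sizeof(pixel) == 1 */
        for( int w = 96; w <= 112; w += 4 )
            printf( "w=%3d -> core width %3d (overread %2d bytes)\n",
                    w, (w+c_w)&~c_w, ((w+c_w)&~c_w) - w );
        return 0;
    }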
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
pf->prefetch_ref = x264_prefetch_ref_mmx2;
- pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
{
pf->memcpy_aligned = x264_memcpy_aligned_sse;
pf->memzero_aligned = x264_memzero_aligned_sse;
+ pf->plane_copy = x264_plane_copy_sse;
}
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
return;
pf->memzero_aligned = x264_memzero_aligned_avx;
+ pf->plane_copy = x264_plane_copy_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;