rep ret
%endmacro
-%assign j 1
-%assign k 2
-%rep 15
-AVG16_CACHELINE_LOOP_SSSE3 j, j
-AVG16_CACHELINE_LOOP_SSSE3 j, k
-%assign j j+1
-%assign k k+1
-%endrep
-
cglobal x264_pixel_avg2_w16_cache64_ssse3
mov eax, r2m
and eax, 0x3f
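; the low 6 bits of the source pointer (r2m) give its offset within a
; 64-byte cacheline, used to pick the matching cacheline-split loop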
RET
%endif
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
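+; one pair of loop bodies per possible cacheline-split offset of the source,
+; so the cache64 entry point can dispatch directly on that offset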
;=============================================================================
; pixel copy
;=============================================================================
; in: r0 = fenc
; out: m0..m3 = hadamard coefs
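; loads a 4x4 block of fenc pixels, zero-extends them to 16-bit and applies
; a 4x4 2D Hadamard transform, leaving the coefficients in m0..m3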
INIT_MMX
-ALIGN 16
-load_hadamard:
+cglobal x264_hadamard_load
+; not really a global, but otherwise cycles get attributed to the wrong function in profiling
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
movd m2, [r0+2*FENC_STRIDE]
movd m3, [r0+3*FENC_STRIDE]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_2D 0, 1, 2, 3, 4
- SAVE_MM_PERMUTATION load_hadamard
+ SAVE_MM_PERMUTATION x264_hadamard_load
ret
%macro SCALAR_SUMSUB 4
%define t0 r2
%endif
- call load_hadamard
+ call x264_hadamard_load
SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
mov t0d, r0d
SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
.loop_y:
xor r4d, r4d
.loop_x:
- call load_hadamard
+ call x264_hadamard_load
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
.loop_y:
xor r4d, r4d
.loop_x:
- call load_hadamard
+ call x264_hadamard_load
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
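; each 4x4 block is Hadamard-transformed via x264_hadamard_load and summed
; against the 1-D Hadamards of the left/top neighbours (left_1d/top_1d) to
; produce the intra SATD costs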