Also add an ACCUM macro to express the common add-or-swap accumulator pattern more concisely.
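
For example (illustrative snippet only; it mirrors the conversions in the diff below), the open-coded pattern

    %if %1
        paddw m0, m1
    %else
        SWAP 0, 1
    %endif

collapses to

    ACCUM paddw, 0, 1, %1

where a zero final argument swaps the input register into the accumulator and a nonzero one adds (or ORs) it in.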
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmx2, _c )
-INTRA_MBCMP_8x8( sad, _sse2, _sse2 )
-INTRA_MBCMP_8x8( sad, _ssse3, _sse2 )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
#if HAVE_MMX
#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
-INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
-INTRA_MBCMP(satd, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
-INTRA_MBCMP(satd, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
mova [r0+(%1+64)*SIZEOF_PIXEL], m2
mova [r0+(%1+96)*SIZEOF_PIXEL], m3
packsswb m0, m1
-%if %1
- por m6, m2
- por m7, m3
- por m5, m0
-%else
- SWAP 5, 0
- SWAP 6, 2
- SWAP 7, 3
-%endif
+ ACCUM por, 6, 2, %1
+ ACCUM por, 7, 3, %1
+ ACCUM por, 5, 0, %1
%endmacro
%macro ZIGZAG_8x8_CAVLC 1
cextern pw_1
cextern pw_8
cextern pw_16
-cextern pw_64
+cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
+%ifdef HIGH_BIT_DEPTH
+ mova m0, [r0+0*FENC_STRIDEB]
+ mova m1, [r0+1*FENC_STRIDEB]
+ mova m2, [r0+2*FENC_STRIDEB]
+ mova m3, [r0+3*FENC_STRIDEB]
+%else
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
movd m2, [r0+2*FENC_STRIDE]
movd m3, [r0+3*FENC_STRIDE]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
+%endif
HADAMARD4_2D 0, 1, 2, 3, 4
SAVE_MM_PERMUTATION
ret
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
- movd %3, [r1+%2-FDEC_STRIDE]
+%ifdef HIGH_BIT_DEPTH
+ mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
+%else
+ movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
pxor %5, %5
punpcklbw %3, %5
+%endif
%else ; left
%ifnidn %2, 0
- shl %2d, 5 ; log(FDEC_STRIDE)
+ shl %2d, 5 ; log(FDEC_STRIDEB)
%endif
- movd %3, [r1+%2-4+1*FDEC_STRIDE]
- pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0
- pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2
- pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3
+ movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
+%ifndef HIGH_BIT_DEPTH
psrlw %3, 8
+%endif
%ifnidn %2, 0
shr %2d, 5
%endif
%8 %3, %6
%endmacro
-%macro CLEAR_SUMS 0
-%ifdef ARCH_X86_64
- mov qword [sums+0], 0
- mov qword [sums+8], 0
- mov qword [sums+16], 0
-%else
- pxor m7, m7
- movq [sums+0], m7
- movq [sums+8], m7
- movq [sums+16], m7
-%endif
-%endmacro
-
; in: m1..m3
; out: m7
; clobber: m4..m6
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
- %assign stack_pad 88 + ((stack_offset+88+gprsize)&15)
+ %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
+%define sums rsp+64 ; size 56
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
movifnidn r1, r1mp
- CLEAR_SUMS
+
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%ifdef HIGH_BIT_DEPTH
+ mova [sums+24], m7
+ mova [sums+32], m7
+ mova [sums+40], m7
+ mova [sums+48], m7
+%endif
; 1D hadamards
- mov t0d, 12
- movd m6, [pw_64]
+ mov t0d, 12
+ movd m6, [pw_32]
.loop_edge:
SCALAR_HADAMARD left, t0, m0, m1
SCALAR_HADAMARD top, t0, m1, m2, m3
- paddw m6, m0
- paddw m6, m1
- sub t0d, 4
+ pavgw m0, m1
+ paddw m6, m0
+ sub t0d, 4
jge .loop_edge
- psrlw m6, 3
- pand m6, [sw_f0] ; dc
+ psrlw m6, 2
+ pand m6, [sw_f0] ; dc
; 2D hadamards
- movifnidn r0, r0mp
- mov r3, -4
+ movifnidn r0, r0mp
+ mov r3, -4
.loop_y:
- mov r4, -4
+ mov r4, -4
.loop_x:
call hadamard_load
SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
pavgw m4, m7
pavgw m5, m7
- paddw m0, [sums+0] ; i16x16_v satd
- paddw m4, [sums+8] ; i16x16_h satd
+ paddw m0, [sums+ 0] ; i16x16_v satd
+ paddw m4, [sums+ 8] ; i16x16_h satd
paddw m5, [sums+16] ; i16x16_dc satd
- movq [sums+0], m0
- movq [sums+8], m4
- movq [sums+16], m5
+ mova [sums+ 0], m0
+ mova [sums+ 8], m4
+ mova [sums+16], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-16
+%ifdef HIGH_BIT_DEPTH
+ mova m7, [pw_1]
+ pmaddwd m4, m7
+ pmaddwd m0, m7
+ paddd m4, [sums+32]
+ paddd m0, [sums+24]
+ mova [sums+32], m4
+ mova [sums+24], m0
+ pxor m7, m7
+ punpckhwd m3, m5, m7
+ punpcklwd m5, m7
+ paddd m3, [sums+48]
+ paddd m5, [sums+40]
+ mova [sums+48], m3
+ mova [sums+40], m5
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%endif
+ add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
inc r3
jl .loop_y
; horizontal sum
movifnidn r2, r2mp
- movq m2, [sums+16]
- movq m1, [sums+8]
- movq m0, [sums+0]
- movq m7, m2
- SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+%ifdef HIGH_BIT_DEPTH
+ mova m1, m5
+ paddd m5, m3
+ HADDD m5, m7 ; DC satd
+ HADDD m4, m7 ; H satd
+ HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
+ psrld m0, 1
+ psrlq m1, 32 ; DC[1]
+ paddd m0, m3 ; DC[2]
+ psrlq m3, 32 ; DC[3]
+ paddd m0, m1
+ paddd m0, m3
+%else
+ mova m7, m5
+ SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
psrld m0, 1
pslld m7, 16
psrld m7, 16
- paddd m0, m2
+ paddd m0, m5
psubd m0, m7
- movd [r2+8], m2 ; i16x16_dc satd
- movd [r2+4], m1 ; i16x16_h satd
- movd [r2+0], m0 ; i16x16_v satd
- ADD rsp, stack_pad
+%endif
+ movd [r2+8], m5 ; i16x16_dc satd
+ movd [r2+4], m4 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
+ ADD rsp, stack_pad
RET
;-----------------------------------------------------------------------------
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
movifnidn r1, r1mp
- CLEAR_SUMS
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
; 1D hadamards
mov t0d, 4
movq [sums+8], m4
movq [sums+0], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-8
+ add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
add r5, 8
inc r3
jl .loop_y
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
+%ifdef HIGH_BIT_DEPTH
+ psrlq m7, 16
+ HADDW m7, m3
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ paddd m2, m7
+%else
psrlq m7, 15
paddw m2, m7
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
+%endif
movd [r2+0], m0 ; i8x8c_dc satd
movd [r2+4], m1 ; i8x8c_h satd
movd [r2+8], m2 ; i8x8c_v satd
SATDS_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
+%endif
INIT_MMX mmx2
INTRA_X3_MMX
-%endif
INIT_XMM sse2
HADAMARD_AC_SSE2
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
-%if %1==0
- SWAP 3, 5
- SWAP 4, 7
-%else
- paddd m3, m5
- paddd m4, m7
-%endif
+ ACCUM paddd, 3, 5, %1
+ ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%else ; !sse4
mova m0, [%1]
ABSD m1, m0
psrld m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%endif ; cpuflag
%endmacro
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
-%if %4
- por m5, m2
-%else
- SWAP 5, 2
-%endif
+ ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_DC %1, %2, %3, %4
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
-%if %4
- por m5, m1
-%else
- SWAP 5, 1
-%endif
+ ACCUM por, 5, 1, %4
%endmacro
%macro QUANT_TWO_AC 4
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
-%if %4
- por m5, m2
-%else
- SWAP 5, 2
-%endif
+ ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_AC_MMX %1, %2, %3, %4
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
-%if %4
- por m5, m0
-%else
- SWAP 5, 0
-%endif
+ ACCUM por, 5, 0, %4
%endmacro
%macro QUANT_TWO 7
PSIGNW m2, m3
mova %1, m0
mova %2, m2
-%if %7
- por m5, m0
+ ACCUM por, 5, 0, %7
por m5, m2
-%else
- SWAP 5, 0
- por m5, m2
-%endif
%endmacro
;-----------------------------------------------------------------------------
psadbw m1, m3
psadbw m2, m4
lea r2, [r2+2*r3]
-%if %1
- paddw m0, m1
-%else
- SWAP 0, 1
-%endif
+ ACCUM paddw, 0, 1, %1
paddw m0, m2
%endmacro
movq m5, [r0+FENC_STRIDE*%1]
movq m4, m5
psadbw m4, m0
-%if %1
- paddw m1, m4
-%else
- SWAP 1, 4
-%endif
+ ACCUM paddw, 1, 4, %1
movq m4, m5
psadbw m4, m6
-%if %1
- paddw m2, m4
-%else
- SWAP 2, 4
-%endif
+ ACCUM paddw, 2, 4, %1
pshufw m4, m7, %2
psadbw m5, m4
-%if %1
- paddw m3, m5
-%else
- SWAP 3, 5
-%endif
+ ACCUM paddw, 3, 5, %1
%endmacro
INIT_MMX
psadbw m5, m6
paddw m1, m3
paddw m4, m5
-%if %1
- paddw m0, m1
- paddw m2, m4
-%else
- SWAP 0,1
- SWAP 2,4
-%endif
+ ACCUM paddw, 0, 1, %1
+ ACCUM paddw, 2, 4, %1
%endmacro
%macro INTRA_SAD_8x8C 0
SECTION .text
cextern pw_1
+cextern pw_8
;=============================================================================
; SAD MMX
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3]);
+;-----------------------------------------------------------------------------
+
+;m0 = DC
+;m6 = V
+;m7 = H
+;m1 = DC score
+;m2 = V score
+;m3 = H score
+;m5 = temp
+;m4 = pixel row
+
+%macro INTRA_SAD_HVDC_ITER 2
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m0
+ ABSW m4, m4, m5
+ ACCUM paddw, 1, 4, %1
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m6
+ ABSW m4, m4, m5
+ ACCUM paddw, 2, 4, %1
+ pshufd m5, m7, %2
+ psubw m5, [r0+(%1-4)*FENC_STRIDEB]
+ ABSW m5, m5, m4
+ ACCUM paddw, 3, 5, %1
+%endmacro
+
+%macro INTRA_SAD_X3_8x8 0
+cglobal intra_sad_x3_8x8, 3,3,8
+ add r0, 4*FENC_STRIDEB
+ movu m0, [r1+7*SIZEOF_PIXEL]
+ mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
+ mova m7, m0
+ paddw m0, m6
+ punpckhwd m7, m7
+ HADDW m0, m4
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ SPLATW m0, m0
+ INTRA_SAD_HVDC_ITER 0, q3333
+ INTRA_SAD_HVDC_ITER 1, q2222
+ INTRA_SAD_HVDC_ITER 2, q1111
+ INTRA_SAD_HVDC_ITER 3, q0000
+ movq m7, [r1+7*SIZEOF_PIXEL]
+ punpcklwd m7, m7
+ INTRA_SAD_HVDC_ITER 4, q3333
+ INTRA_SAD_HVDC_ITER 5, q2222
+ INTRA_SAD_HVDC_ITER 6, q1111
+ INTRA_SAD_HVDC_ITER 7, q0000
+ HADDW m2, m4
+ HADDW m3, m4
+ HADDW m1, m4
+ movd [r2+0], m2
+ movd [r2+4], m3
+ movd [r2+8], m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+INTRA_SAD_X3_8x8
+INIT_XMM ssse3
+INTRA_SAD_X3_8x8
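
For reference, a rough C sketch of what the new intra_sad_x3_8x8 computes (illustrative only, not the actual x264 C fallback; it assumes x264's pixel type and FENC_STRIDE, abs() from <stdlib.h>, the usual 8x8 edge layout where edge[16+x] is the top row and edge[14-y] the left column, and a made-up helper name intra_sad_x3_8x8_ref):

    static void intra_sad_x3_8x8_ref( pixel *fenc, pixel edge[36], int res[3] )
    {
        int dc = 8; /* +8 then >>4 matches the paddw [pw_8] / psrlw 4 rounding */
        for( int i = 0; i < 8; i++ )
            dc += edge[16+i] + edge[14-i];
        dc >>= 4;
        res[0] = res[1] = res[2] = 0;
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int p = fenc[y*FENC_STRIDE + x];
                res[0] += abs( p - edge[16+x] ); /* V  */
                res[1] += abs( p - edge[14-y] ); /* H  */
                res[2] += abs( p - dc );         /* DC */
            }
    }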
paddd %1, %2
%endmacro
-%macro HADDW 2
+%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && mmsize == 16
vphaddwq %1, %1
movhlps %2, %1
%rotate 1
%endrep
%endmacro
+
+; instruction, accum, input, iteration (zero to swap, nonzero to add)
+%macro ACCUM 4
+%if %4
+ %1 m%2, m%3
+%else
+ SWAP %2, %3
+%endif
+%endmacro