Move the intra_sa8d_x3_8x8 edge hadamard from C into asm

~40% faster.
Also some other minor asm cosmetics.
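
The 1D hadamard of the left and top prediction edges was previously computed
by the C wrapper INTRA_SA8D_X3 (removed below), which filled an
int16_t sa8d_1d[2][8] array and passed it to the *_core asm functions; that
transform is now done inside the asm itself and the wrapper is dropped. For
reference, what the removed C code computed is roughly the following sketch
(helper names are illustrative, not part of the patch):

    #include <stdint.h>

    /* 8-point hadamard, same butterfly order and signs as the removed
     * SUMSUB macro */
    static void hadamard8_1d( int16_t v[8] )
    {
        for( int stride = 4; stride > 0; stride >>= 1 )
            for( int i = 0; i < 8; i += 2*stride )
                for( int j = i; j < i+stride; j++ )
                {
                    int16_t a = v[j], b = v[j+stride];
                    v[j]        = a + b;
                    v[j+stride] = b - a;
                }
    }

    static void edge_hadamard( const uint8_t edge[36], int16_t sa8d_1d[2][8] )
    {
        for( int i = 0; i < 8; i++ )
        {
            sa8d_1d[0][i] = edge[14-i]; /* left column: edge[14-y] == left[y] */
            sa8d_1d[1][i] = edge[16+i]; /* top row */
        }
        hadamard8_1d( sa8d_1d[0] );
        hadamard8_1d( sa8d_1d[1] );
    }
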
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
+const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
+const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
+const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
%include "x86inc.asm"
%include "x86util.asm"
+cextern pw_ppmmppmm
+cextern pw_pmpmpmpm
+
SECTION .text
INIT_MMX mmx2
%macro LOAD_4x8P 1 ; dx
pxor m7, m7
- movd m6, [eax+%1+7*FENC_STRIDE]
- movd m0, [eax+%1+0*FENC_STRIDE]
- movd m1, [eax+%1+1*FENC_STRIDE]
- movd m2, [eax+%1+2*FENC_STRIDE]
- movd m3, [eax+%1+3*FENC_STRIDE]
- movd m4, [eax+%1+4*FENC_STRIDE]
- movd m5, [eax+%1+5*FENC_STRIDE]
+ movd m6, [r0+%1+7*FENC_STRIDE]
+ movd m0, [r0+%1+0*FENC_STRIDE]
+ movd m1, [r0+%1+1*FENC_STRIDE]
+ movd m2, [r0+%1+2*FENC_STRIDE]
+ movd m3, [r0+%1+3*FENC_STRIDE]
+ movd m4, [r0+%1+4*FENC_STRIDE]
+ movd m5, [r0+%1+5*FENC_STRIDE]
punpcklbw m6, m7
punpcklbw m0, m7
punpcklbw m1, m7
movq [spill], m6
punpcklbw m2, m7
punpcklbw m3, m7
- movd m6, [eax+%1+6*FENC_STRIDE]
+ movd m6, [r0+%1+6*FENC_STRIDE]
punpcklbw m4, m7
punpcklbw m5, m7
punpcklbw m6, m7
movq m7, [spill]
%endmacro
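+; HSUMSUB2 a, b, shuf, signs: one butterfly stage of a 1D hadamard.
+; Each register is summed with a pshufw'd copy of itself, with the +/-1
+; word constant in %4 supplying the subtraction half of the butterfly.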
+%macro HSUMSUB2 4
+ pshufw m4, %1, %3
+ pshufw m5, %2, %3
+ pmullw %1, %4
+ pmullw m5, %4
+ paddw %1, m4
+ paddw %2, m5
+%endmacro
+
;-----------------------------------------------------------------------------
-; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8_core
- mov eax, [esp+4]
- mov ecx, [esp+8]
- sub esp, 0x70
-%define args esp+0x74
+cglobal intra_sa8d_x3_8x8, 2,3
+ SUB esp, 0x94
+%define edge esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
%define sum esp+0 ; +32
+
+ pxor m7, m7
+ movq m0, [r1+7]
+ movq m2, [r1+16]
+ movq m1, m0
+ movq m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ movq m6, [pw_ppmmppmm]
+ HSUMSUB2 m0, m2, q1032, m6
+ HSUMSUB2 m1, m3, q1032, m6
+ movq m6, [pw_pmpmpmpm]
+ HSUMSUB2 m0, m2, q2301, m6
+ HSUMSUB2 m1, m3, q2301, m6
+ movq m4, m0
+ movq m5, m2
+ paddw m0, m1
+ paddw m2, m3
+ psubw m4, m1
+ psubw m3, m5
+ movq [edge+0], m0
+ movq [edge+8], m4
+ movq [edge+16], m2
+ movq [edge+24], m3
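+ ; edge[0..15] now holds the 1D hadamard of the left edge and edge[16..31]
+ ; that of the top edge, replacing the sa8d_1d[2][8] array the removed C
+ ; wrapper used to fill in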
+
LOAD_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
ABSW m1, m1, m4
paddw m2, m1 ; 7x4 sum
movq m7, m0
- movq m1, [ecx+8] ; left bottom
+ movq m1, [edge+8] ; left bottom
psllw m1, 3
psubw m7, m1
ABSW2 m0, m7, m0, m7, m5, m3
paddw m2, m1 ; 7x4 sum
movq m1, m0
- movq m7, [ecx+0]
+ movq m7, [edge+0]
psllw m7, 3 ; left top
- movzx edx, word [ecx+0]
- add dx, [ecx+16]
- lea edx, [4*edx+32]
- and edx, -64
- movd m6, edx ; dc
+ mov r2, [edge+0]
+ add r2, [edge+16]
+ lea r2, [4*r2+32]
+ and r2, 0xffc0
+ movd m6, r2 ; dc
psubw m1, m7
psubw m0, m6
psrlq m2, 16
paddw m2, m3
- movq m3, [ecx+16] ; top left
- movq m4, [ecx+24] ; top right
+ movq m3, [edge+16] ; top left
+ movq m4, [edge+24] ; top right
psllw m3, 3
psllw m4, 3
psubw m3, [sum+16]
paddw m2, m3
paddw m2, m4 ; v
- SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
- mov eax, [args+8]
- movd ecx, m2
- movd edx, m1
- add ecx, 2
- add edx, 2
- shr ecx, 2
- shr edx, 2
- mov [eax+0], ecx ; i8x8_v satd
- mov [eax+4], edx ; i8x8_h satd
- movd ecx, m0
- add ecx, 2
- shr ecx, 2
- mov [eax+8], ecx ; i8x8_dc satd
-
- add esp, 0x70
- ret
-%undef args
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
+ mov r2, r2m
+ pxor m7, m7
+ punpckldq m2, m1
+ pavgw m0, m7
+ pavgw m2, m7
+ movd [r2+8], m0 ; dc
+ movq [r2+0], m2 ; v, h
+ ADD esp, 0x94
+ RET
+%undef edge
%undef spill
%undef trans
%undef sum
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_4x4x2_core
- push ebx
- push edi
- mov ebx, [esp+16]
- mov edx, [esp+24]
- mov edi, 4
+cglobal pixel_ssim_4x4x2_core, 0,5
+ mov r1, r1m
+ mov r3, r3m
+ mov r4, 4
pxor m0, m0
.loop:
- mov eax, [esp+12]
- mov ecx, [esp+20]
- add eax, edi
- add ecx, edi
+ mov r0, r0m
+ mov r2, r2m
+ add r0, r4
+ add r2, r4
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
%rep 4
- movd m5, [eax]
- movd m6, [ecx]
+ movd m5, [r0]
+ movd m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
paddd m3, m5
paddd m4, m7
paddd m3, m6
- add eax, ebx
- add ecx, edx
+ add r0, r1
+ add r2, r3
%endrep
- mov eax, [esp+28]
- lea eax, [eax+edi*4]
+ mov r0, r4m
+ lea r0, [r0+r4*4]
pshufw m5, m1, q0032
pshufw m6, m2, q0032
paddusw m1, m5
paddd m4, m6
punpcklwd m1, m0
punpckldq m3, m4
- movq [eax+0], m1
- movq [eax+8], m3
- sub edi, 4
+ movq [r0+0], m1
+ movq [r0+8], m3
+ sub r4, 4
jge .loop
- pop edi
- pop ebx
emms
- ret
+ RET
times 4 db 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
+pb_pppm: times 4 db 1,1,1,-1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
+sw_f0: dq 0xfff0, 0
pd_f0: times 4 dd 0xffff0000
sq_0f: times 1 dq 0xffffffff
SECTION .text
cextern pw_1
+cextern pw_8
cextern pw_00ff
-
+cextern pw_ppppmmmm
+cextern pw_ppmmppmm
+cextern pw_pmpmpmpm
cextern hsub_mul
;=============================================================================
; INTRA SATD
;=============================================================================
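+; HSUMSUB2: one butterfly stage of a 1D hadamard, shared between the ssse3
+; and sse2 paths; %1 applies the +/-1 mask in %8 (psignw for ssse3, pmullw
+; for sse2) and %7 selects the partner lanes via pshufd.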
+%macro HSUMSUB2 8
+ pshufd %4, %2, %7
+ pshufd %5, %3, %7
+ %1 %2, %8
+ %1 %6, %8
+ paddw %2, %4
+ paddw %3, %5
+%endmacro
+
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8_core, 3,3,16
+cglobal intra_sa8d_x3_8x8, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
punpcklbw m6, m8
punpcklbw m7, m8
- HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
- ; dc
- movzx r0d, word [r1+0]
- add r0w, word [r1+16]
- add r0d, 8
- and r0d, -16
- shl r0d, 2
-
- pxor m15, m15
- movdqa m8, m2
- movdqa m9, m3
- movdqa m10, m4
- movdqa m11, m5
- ABSW2 m8, m9, m8, m9, m12, m13
- ABSW2 m10, m11, m10, m11, m12, m13
+ ABSW2 m8, m9, m2, m3, m2, m3
+ ABSW2 m10, m11, m4, m5, m4, m5
paddusw m8, m10
paddusw m9, m11
- ABSW2 m10, m11, m6, m7, m6, m7
+ ABSW2 m10, m11, m6, m7, m6, m7
ABSW m15, m1, m1
paddusw m10, m11
paddusw m8, m9
paddusw m15, m10
paddusw m15, m8
- movdqa m8, [r1+0] ; left edge
- movd m9, r0d
- psllw m8, 3
+ ; 1D hadamard of edges
+ movq m8, [r1+7]
+ movq m9, [r1+16]
+%if cpuflag(ssse3)
+ punpcklwd m8, m8
+ pshufb m9, [intrax3_shuf]
+ pmaddubsw m8, [pb_pppm]
+ pmaddubsw m9, [pb_pppm]
+ HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm]
+ HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm]
+%else ; sse2
+ pxor m10, m10
+ punpcklbw m8, m10
+ punpcklbw m9, m10
+ HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
+ HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
+ pshuflw m10, m8, q2301
+ pshuflw m11, m9, q2301
+ pshufhw m10, m10, q2301
+ pshufhw m11, m11, q2301
+ pmullw m8, [pw_pmpmpmpm]
+ pmullw m11, [pw_pmpmpmpm]
+ paddw m8, m10
+ paddw m9, m11
+%endif
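+ ; m8/m9 now hold the 1D hadamard of the left/top edge; word 0 of each is
+ ; the plain pixel sum that feeds the dc cost below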
+
+ ; differences
+ paddw m10, m8, m9
+ paddw m10, [pw_8]
+ pand m10, [sw_f0]
+ psllw m10, 2 ; dc
+
+ psllw m8, 3 ; left edge
psubw m8, m0
- psubw m9, m0
- ABSW2 m8, m9, m8, m9, m10, m11 ; 1x8 sum
- paddusw m14, m15, m8
- paddusw m15, m9
+ psubw m10, m0
+ ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
+ paddusw m14, m8, m15
+ paddusw m15, m10
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpckldq m0, m2
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
- movdqa m1, [r1+16] ; top edge
- psllw m1, 3
- psrldq m2, m15, 2 ; 8x7 sum
- psubw m0, m1 ; 8x1 sum
- ABSW m0, m0, m1
+ psllw m9, 3 ; top edge
+ psrldq m2, m15, 2 ; 8x7 sum
+ psubw m0, m9 ; 8x1 sum
+ ABSW m0, m0, m9
paddusw m2, m0
; 3x HADDW
INIT_XMM sse2
SA8D
SATDS_SSE2
-INTRA_SA8D_SSE2
%ifndef HIGH_BIT_DEPTH
+INTRA_SA8D_SSE2
INIT_MMX mmx2
INTRA_SATDS_MMX
%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
+%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX ssse3
INTRA_SATDS_MMX
+%endif
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
INIT_XMM avx
SATDS_SSE2
SA8D
+%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
+%endif
HADAMARD_AC_SSE2
;=============================================================================
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_core_mmx2 ( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
#endif
}
#endif
-#if !HIGH_BIT_DEPTH
-#if ARCH_X86_64
+
+#if ARCH_X86_64 && !HIGH_BIT_DEPTH
static void x264_predict_8x8c_dc_left( uint8_t *src )
{
int y;
}
}
-#endif
-
-#define PL(y) \
- UNUSED int l##y = edge[14-y];
-#define PT(x) \
- UNUSED int t##x = edge[16+x];
-#define PREDICT_8x8_LOAD_LEFT \
- PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
-#define PREDICT_8x8_LOAD_TOP \
- PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
-
-#define SUMSUB(a,b,c,d,e,f,g,h)\
- t=a; a+=b; b-=t;\
- t=c; c+=d; d-=t;\
- t=e; e+=f; f-=t;\
- t=g; g+=h; h-=t;
-
-#define INTRA_SA8D_X3(cpu)\
-void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[36], int res[3] )\
-{\
- PREDICT_8x8_LOAD_TOP\
- PREDICT_8x8_LOAD_LEFT\
- int t;\
- ALIGNED_16( int16_t sa8d_1d[2][8] );\
- SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
- SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
- SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
- sa8d_1d[0][0] = l0;\
- sa8d_1d[0][1] = l1;\
- sa8d_1d[0][2] = l2;\
- sa8d_1d[0][3] = l3;\
- sa8d_1d[0][4] = l4;\
- sa8d_1d[0][5] = l5;\
- sa8d_1d[0][6] = l6;\
- sa8d_1d[0][7] = l7;\
- SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\
- SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\
- SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\
- sa8d_1d[1][0] = t0;\
- sa8d_1d[1][1] = t1;\
- sa8d_1d[1][2] = t2;\
- sa8d_1d[1][3] = t3;\
- sa8d_1d[1][4] = t4;\
- sa8d_1d[1][5] = t5;\
- sa8d_1d[1][6] = t6;\
- sa8d_1d[1][7] = t7;\
- x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\
-}
-
-#if ARCH_X86_64
-INTRA_SA8D_X3(sse2)
-INTRA_SA8D_X3(ssse3)
-INTRA_SA8D_X3(avx)
-#else
-INTRA_SA8D_X3(mmx2)
-#endif
-#endif // !HIGH_BIT_DEPTH
+#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH
/****************************************************************************
* Exported functions: