%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32 ; argument is the requested section alignment in bytes
+pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 ; pshufb mask (32 bytes, same pattern per 128-bit lane): byte 0 -> bytes 0-3, byte 8 -> bytes 4-7, -1 entries produce zero
deinterleave_sadx4: dd 0,4,2,6 ; dword index table; not referenced by the code visible in this chunk
+hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 ; pshufb mask: doubles each selected byte; sources 0,2,8,10 go to the low qword, 1,3,9,11 to the high qword
SECTION .text
INIT_MMX ssse3 ; select MMX-width mN registers and the ssse3 name suffix for what follows
INTRA_SAD_8x8C ; macro defined outside this view — presumably emits the ssse3 variant of intra_sad_x3_8x8c; verify
;-----------------------------------------------------------------------------
; intra_sad_x3_8x8c (AVX2)
; Computes three SADs of the 8x8 block at r0 against the DC, H and V intra
; predictions built from the pixels bordering the block at r1.
;   r0 = source block, rows addressed with FENC_STRIDE
;   r1 = reconstructed block, rows addressed with FDEC_STRIDE; only the row
;        above it and the column to its left are read
;   r2 = output, three dwords: [r2+0] = DC, [r2+4] = H, [r2+8] = V
; Two source rows are processed per iteration, one per 128-bit lane.
; NOTE(review): the row loads assume FENC_STRIDE == 16 (a 32-byte load must
; span exactly two rows); FENC_STRIDE is defined outside this view — confirm.
;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8c, 3,3,7 ; 3 args, 3 GPRs, 7 vector regs (m0-m6 are used below)
+ vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred: the 8 pixels of the row above, copied into all four qwords
+ add r1, FDEC_STRIDE*4-1 ; r1 -> left-neighbour column (x = -1) at row 4
+ pxor xm5, xm5 ; zero; later used full-width as m5 (relies on the VEX xmm write clearing the upper lane)
+ punpckldq xm3, xm2, xm5 ; V0 _ V1 _ (dwords): left/right half of the top row, each zero-padded to a qword
+ movd xm0, [r1 + FDEC_STRIDE*-1 - 3] ; dword at row 3, x = -4..-1: byte 3 = left pixel of row 3, bytes 0-2 overwritten below
+ movd xm1, [r1 + FDEC_STRIDE* 3 - 3] ; same for row 7: byte 3 = left pixel of row 7
+ pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 ; byte 0 = left pixel of row 0
+ pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 ; byte 0 = left pixel of row 4
+ pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 ; byte 1 = left pixel of row 1
+ pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 ; byte 1 = left pixel of row 5
+ pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 ; byte 2 = left pixel of row 2
+ pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 ; byte 2 = left pixel of row 6
+ punpcklqdq xm0, xm1 ; H0 _ H1 _ (dwords): left pixels of rows 0-3 / rows 4-7
+ vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1: one 4-pixel border group per qword
+ pshufb xm0, [hpred_shuf] ; H00224466 H11335577: each left pixel doubled, even rows in the low qword, odd rows in the high
+ psadbw m3, m5 ; s0 s1 s2 s3: SAD against zero = sum of the 4 pixels in each qword
+ vpermq m4, m3, q3312 ; s2 s1 s3 s3
+ vpermq m3, m3, q1310 ; s0 s1 s3 s1
+ paddw m3, m4 ; s0+s2 2*s1 2*s3 s1+s3
+ psrlw m3, 2 ; first part of the rounded division, completed by pavgw below
+ pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 as rounded means: (s0+s2+4)>>3, (s1+2)>>2, (s3+2)>>2, (s1+s3+4)>>3
+ pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _: low qword of each lane = 8-pixel DC row (lane 0: rows 0-3, lane 1: rows 4-7)
+ vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V: V row blended into the high qword of each lane
+ vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V: predictions for rows 0-3
+ vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V: predictions for rows 4-7
+ vpermq m0, m0, q3120 ; H00224466 _ H11335577 _: even rows' H pixels in lane 0, odd rows' in lane 1
+ movddup m2, [r0+FENC_STRIDE*0] ; source rows 0/1, one per lane, each duplicated into both qwords (see FENC_STRIDE note above)
+ movddup m4, [r0+FENC_STRIDE*2] ; source rows 2/3
+ pshuflw m3, m0, q0000 ; H pred rows 0/1: word 0 of each lane's low qword broadcast across that qword
+ psadbw m3, m2 ; H SAD rows 0/1 (low qword of each lane; high qword is unused)
+ psadbw m2, m1 ; rows 0/1: DC SAD in the low qword, V SAD in the high qword of each lane
+ pshuflw m5, m0, q1111 ; H pred rows 2/3 (m5 no longer needed as zero)
+ psadbw m5, m4 ; H SAD rows 2/3
+ psadbw m4, m1 ; DC / V SAD rows 2/3
+ paddw m2, m4 ; accumulate DC / V
+ paddw m3, m5 ; accumulate H
+ movddup m4, [r0+FENC_STRIDE*4] ; source rows 4/5
+ pshuflw m5, m0, q2222 ; H pred rows 4/5
+ psadbw m5, m4 ; H SAD rows 4/5
+ psadbw m4, m6 ; DC / V SAD rows 4/5, using the lower-half DC row
+ paddw m2, m4 ; accumulate DC / V
+ paddw m3, m5 ; accumulate H
+ movddup m4, [r0+FENC_STRIDE*6] ; source rows 6/7
+ pshuflw m5, m0, q3333 ; H pred rows 6/7
+ psadbw m5, m4 ; H SAD rows 6/7
+ psadbw m4, m6 ; DC / V SAD rows 6/7
+ paddw m2, m4 ; accumulate DC / V
+ paddw m3, m5 ; accumulate H
+ vextracti128 xm0, m2, 1 ; odd-row DC / V sums from the upper lane
+ vextracti128 xm1, m3, 1 ; odd-row H sum from the upper lane
+ paddw xm2, xm0 ; DC V: even-row + odd-row totals
+ paddw xm3, xm1 ; H: even-row + odd-row total
+ pextrd [r2+8], xm2, 2 ; V (dword 2 = low half of the high qword)
+ movd [r2+4], xm3 ; H
+ movd [r2+0], xm2 ; DC
+ RET
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );