The intra_mbcmp_x3 functions are unnecessary where the x9 versions exist (SSSE3 and onwards).
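For context, a sketch of why the SSSE3+ x3 variants become dead code. The struct and function names below (pixf_subset_t, score_i4x4, bitcosts) are illustrative stand-ins, not x264's actual analysis code; only the pointer signatures are taken from the declarations in this patch. On SSSE3 and onwards the x9 pointer is set, so the x3 pointer is only ever called on older CPUs:

#include <stdint.h>

typedef uint8_t pixel;

/* Hypothetical subset of x264's pixel function table, for illustration. */
typedef struct
{
    int  (*intra_satd_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
    void (*intra_satd_x3_4x4)( pixel *fenc, pixel *fdec, int *satds );
} pixf_subset_t;

static int score_i4x4( pixf_subset_t *pixf, pixel *fenc, pixel *fdec,
                       uint16_t *bitcosts, int satds[3] )
{
    /* SSSE3 and onwards: x9 scores all nine i4x4 modes in one call and
     * returns the winner (mode and cost packed into the return value),
     * so the three-mode x3 helper is unreachable here. */
    if( pixf->intra_satd_x9_4x4 )
        return pixf->intra_satd_x9_4x4( fenc, fdec, bitcosts );
    /* Pre-SSSE3 fallback: x3 scores only V/H/DC; the caller evaluates
     * the remaining modes in C and picks the minimum. */
    pixf->intra_satd_x3_4x4( fenc, fdec, satds );
    return -1;
}

This is also why the sse2/mmx2 x3 versions (and the sse2 instantiation of INTRA_SA8D_SSE2 below) are kept: they remain the only asm path on pre-SSSE3 CPUs.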
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
- pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
-#if ARCH_X86_64
- pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
-#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
}
if( cpu&X264_CPU_AVX )
{
    INIT5( ssd, _avx );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
-#if ARCH_X86_64
- pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
-#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
}
if( cpu&X264_CPU_XOP )
{
    INIT5( ssd, _xop );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
-#if ARCH_X86_64
- pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_xop;
-#endif
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
paddw %3, %5
%endmacro
+; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
+; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8, 3,3,16
+cglobal intra_sa8d_x3_8x8, 3,3,14
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
paddusw m8, m10
paddusw m9, m11
ABSW2 m10, m11, m6, m7, m6, m7
- ABSW m15, m1, m1
+ ABSW m13, m1, m1
paddusw m10, m11
paddusw m8, m9
- paddusw m15, m10
- paddusw m15, m8
+ paddusw m13, m10
+ paddusw m13, m8
; 1D hadamard of edges
movq m8, [r1+7]
movq m9, [r1+16]
-%if cpuflag(ssse3)
- punpcklwd m8, m8
- pshufb m9, [intrax3_shuf]
- pmaddubsw m8, [pb_pppm]
- pmaddubsw m9, [pb_pppm]
- HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm]
- HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm]
-%else ; sse2
pxor m10, m10
punpcklbw m8, m10
punpcklbw m9, m10
pmullw m11, [pw_pmpmpmpm]
paddw m8, m10
paddw m9, m11
-%endif
; differences
paddw m10, m8, m9
psubw m8, m0
psubw m10, m0
ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
- paddusw m14, m8, m15
- paddusw m15, m10
+ paddusw m8, m13
+ paddusw m13, m10
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
psllw m9, 3 ; top edge
- psrldq m2, m15, 2 ; 8x7 sum
+ psrldq m2, m13, 2 ; 8x7 sum
psubw m0, m9 ; 8x1 sum
ABSW m0, m0, m9
paddusw m2, m0
; 3x HADDW
-%if cpuflag(xop)
- phaddw m2, m14
- vphadduwq m0, m15
- movhlps m1, m0
- vphadduwq m2, m2 ; i8x8_v, i8x8_h
- paddd m0, m1 ; i8x8_dc
- packusdw m2, m0 ; i8x8_v, i8x8_h, i8x8_dc
- pxor m3, m3
- psrlw m2, 1
- pavgw m2, m3
- movq [r2], m2 ; i8x8_v, i8x8_h
- psrldq m2, 8
- movd [r2+8], m2 ; i8x8_dc
-%else
movdqa m7, [pw_1]
pmaddwd m2, m7
- pmaddwd m14, m7
- pmaddwd m15, m7
- punpckhdq m3, m2, m14
- punpckldq m2, m14
- pshufd m5, m15, q3311
+ pmaddwd m8, m7
+ pmaddwd m13, m7
+ punpckhdq m3, m2, m8
+ punpckldq m2, m8
+ pshufd m5, m13, q3311
paddd m2, m3
- paddd m5, m15
- punpckhqdq m3, m2, m5
+ paddd m5, m13
+ punpckhqdq m0, m2, m5
punpcklqdq m2, m5
- pavgw m3, m2
- pxor m0, m0
- pavgw m3, m0
- movq [r2], m3 ; i8x8_v, i8x8_h
- psrldq m3, 8
- movd [r2+8], m3 ; i8x8_dc
-%endif
+ pavgw m0, m2
+ pxor m1, m1
+ pavgw m0, m1
+ movq [r2], m0 ; i8x8_v, i8x8_h
+ psrldq m0, 8
+ movd [r2+8], m0 ; i8x8_dc
RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
-INTRA_SA8D_SSE2
INIT_MMX ssse3
INTRA_X3_MMX
%endif
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
-INTRA_SA8D_SSE2
INTRA_X9
INTRA8_X9
%endif
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
-INTRA_SA8D_SSE2
INTRA_X9
; no xop INTRA8_X9: it's slower than avx on bulldozer, reason unknown.
%endif
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
-void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_xop ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-
-h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
-h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-h8x8_pred_shuf: times 8 db 1
- times 8 db 0
- times 8 db 3
- times 8 db 2
- times 8 db 5
- times 8 db 4
- times 8 db 7
- times 8 db 6
-
SECTION .text
cextern pb_3
movd [r2+4], mm1 ;H prediction cost
RET
-%macro INTRA_SADx3_4x4 0
-cglobal intra_sad_x3_4x4, 3,3
- movd xmm4, [r1+FDEC_STRIDE*0-4]
- pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
- pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
- pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
- movd xmm2, [r1-FDEC_STRIDE]
- pxor xmm3, xmm3
- pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
- pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
- pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
- punpckldq xmm2, xmm4 ; ABCDEFGH
- psadbw xmm2, xmm3
- movd xmm1, [r0+FENC_STRIDE*0]
- pinsrd xmm1, [r0+FENC_STRIDE*1], 1
- pinsrd xmm1, [r0+FENC_STRIDE*2], 2
- pinsrd xmm1, [r0+FENC_STRIDE*3], 3
- psadbw xmm0, xmm1
- psadbw xmm5, xmm1
- psraw xmm2, 2
- pavgw xmm2, xmm3
- pshufb xmm2, xmm3 ; DC prediction
- punpckhqdq xmm3, xmm0, xmm5
- punpcklqdq xmm0, xmm5
- psadbw xmm2, xmm1
- paddw xmm0, xmm3
- movhlps xmm4, xmm2
- packusdw xmm0, xmm0
- paddw xmm2, xmm4
- movq [r2], xmm0 ; V/H prediction costs
- movd [r2+8], xmm2 ; DC prediction cost
- RET
-%endmacro ; INTRA_SADx3_4x4
-
-INIT_XMM sse4
-INTRA_SADx3_4x4
-INIT_XMM avx
-INTRA_SADx3_4x4
-
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
;-----------------------------------------------------------------------------
movd [r2+8], m1
RET
-%macro INTRA_SADx3_8x8 0
-cglobal intra_sad_x3_8x8, 3,4,9
-%ifdef PIC
- lea r11, [h8x8_pred_shuf]
-%define shuf r11
-%else
-%define shuf h8x8_pred_shuf
-%endif
- movq m0, [r1+7] ; left pixels
- movq m1, [r1+16] ; top pixels
- pxor m2, m2
- pxor m3, m3
- psadbw m2, m0
- psadbw m3, m1
- paddw m2, m3
- pxor m3, m3 ; V score accumulator
- psraw m2, 3
- pavgw m2, m3
- punpcklqdq m1, m1 ; V prediction
- pshufb m2, m3 ; DC prediction
- pxor m4, m4 ; H score accumulator
- pxor m5, m5 ; DC score accumulator
- mov r3d, 6
-.loop:
- movq m6, [r0+FENC_STRIDE*0]
- movhps m6, [r0+FENC_STRIDE*1]
- pshufb m7, m0, [shuf+r3*8] ; H prediction
-%ifdef ARCH_X86_64
- psadbw m7, m6
- psadbw m8, m1, m6
- psadbw m6, m2
- paddw m4, m7
- paddw m3, m8
- paddw m5, m6
-%else
- psadbw m7, m6
- paddw m4, m7
- psadbw m7, m1, m6
- psadbw m6, m2
- paddw m3, m7
- paddw m5, m6
-%endif
- add r0, FENC_STRIDE*2
- sub r3d, 2
- jge .loop
-
- movhlps m0, m3
- movhlps m1, m4
- movhlps m2, m5
- paddw m3, m0
- paddw m4, m1
- paddw m5, m2
- movd [r2+0], m3
- movd [r2+4], m4
- movd [r2+8], m5
- RET
-%endmacro ; INTRA_SADx3_8x8
-
-INIT_XMM ssse3
-INTRA_SADx3_8x8
-INIT_XMM avx
-INTRA_SADx3_8x8
-
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------