#if HAVE_MMX
uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
/****************************************************************************
pf->nal_escape = x264_nal_escape_mmxext;
if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
pf->nal_escape = x264_nal_escape_sse2;
+ if( cpu&X264_CPU_AVX )
+ pf->nal_escape = x264_nal_escape_avx;
#endif
}
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ dctf->add4x4_idct = x264_add4x4_idct_avx;
+ dctf->dct4x4dc = x264_dct4x4dc_avx;
+ dctf->idct4x4dc = x264_idct4x4dc_avx;
+ dctf->add8x8_idct = x264_add8x8_idct_avx;
+ dctf->add16x16_idct = x264_add16x16_idct_avx;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
+ dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_SSE4 )
dctf->add4x4_idct = x264_add4x4_idct_sse4;
+ if( cpu&X264_CPU_AVX )
+ {
+ dctf->add4x4_idct = x264_add4x4_idct_avx;
+ dctf->add8x8_idct = x264_add8x8_idct_avx;
+ dctf->add16x16_idct = x264_add16x16_idct_avx;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
+ dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
+ dctf->sub8x8_dct = x264_sub8x8_dct_avx;
+ dctf->sub16x16_dct = x264_sub16x16_dct_avx;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
+ }
#endif //HAVE_MMX
#if HAVE_ALTIVEC
pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
if( cpu&X264_CPU_SSE4 )
pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
+ if( cpu&X264_CPU_AVX )
+ pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#endif // HAVE_MMX
#else
#if HAVE_MMX
pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
+#if ARCH_X86_64
+ pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx;
+#endif
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
}
+#if ARCH_X86_64
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
+ }
+#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
+#if ARCH_X86_64
+ pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
+#endif
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
#if HIGH_BIT_DEPTH
if( cpu&X264_CPU_SSE2 )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ if( cpu&X264_CPU_AVX )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
#else
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+ if( cpu&X264_CPU_AVX )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
#endif // HIGH_BIT_DEPTH
#endif
}
#if HAVE_MMX
void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
+void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
+ int mvy_limit, int bframe );
#if ARCH_X86
void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
}
if( cpu&X264_CPU_SSSE3 )
pf->deblock_strength = x264_deblock_strength_ssse3;
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->deblock_strength = x264_deblock_strength_avx;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ pf->deblock_luma[1] = x264_deblock_v_luma_avx;
+ pf->deblock_luma[0] = x264_deblock_h_luma_avx;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_avx;
+ pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
+ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
+ pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
+ pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx;
+ }
+ }
}
#endif
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
+SATD_X_DECL7( _avx )
#endif // !HIGH_BIT_DEPTH
#endif
/* Slower on Conroe, so only enable under SSE4 */
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
+
+ if( cpu&X264_CPU_AVX )
+ {
+ INIT7( satd, _avx );
+ INIT7( satd_x3, _avx );
+ INIT7( satd_x4, _avx );
+ pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
+ pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _avx );
+ }
+ INIT5( ssd, _avx );
+#if ARCH_X86_64
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+ pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
+#endif
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
+ }
#endif //HAVE_MMX
#if HAVE_ARMV6
if( cpu&X264_CPU_AVX )
{
+ pf->dequant_4x4 = x264_dequant_4x4_avx;
+ pf->dequant_8x8 = x264_dequant_8x8_avx;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
jmp %1_continue
ALIGN 16
%1:
- mova m3, m1
- mova m2, m0
- pcmpeqb m1, m4
- pcmpeqb m0, m4
- pmovmskb r3d, m1
- %2 [r0+r1], m2
- pmovmskb r4d, m0
+ pcmpeqb m3, m1, m4
+ pcmpeqb m2, m0, m4
+ pmovmskb r3d, m3
+ %2 [r0+r1], m0
+ pmovmskb r4d, m2
shl r3d, mmsize
mova m0, [r1+r2+2*mmsize]
or r4d, r3d
- mova m1, [r1+r2+3*mmsize]
+ %2 [r0+r1+mmsize], m1
lea r3d, [r4+r4+1]
- %2 [r0+r1+mmsize], m3
+ mova m1, [r1+r2+3*mmsize]
and r4d, r3d
jnz %1_escape
%1_continue:
NAL_ESCAPE mmxext
INIT_XMM
NAL_ESCAPE sse2
+INIT_AVX
+NAL_ESCAPE avx
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
- SUMSUB_BA w, m%8, m%1 ; %8 = s07, %1 = d07
- SUMSUB_BA w, m%7, m%2 ; %7 = s16, %2 = d16
- SUMSUB_BA w, m%6, m%3 ; %6 = s25, %3 = d25
- SUMSUB_BA w, m%5, m%4 ; %5 = s34, %4 = d34
- SUMSUB_BA w, m%5, m%8 ; %5 = a0, %8 = a2
- SUMSUB_BA w, m%6, m%7 ; %6 = a1, %7 = a3
- SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
+ SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07
+ SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16
+ SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25
+ SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34
+ SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2
+ SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3
+ SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4
mova [%9+0x00], m%6
mova [%9+0x40], m%5
- mova m%6, m%7 ; a3
- psraw m%6, 1 ; a3>>1
+ psraw m%6, m%7, 1 ; a3>>1
paddw m%6, m%8 ; a2 + (a3>>1)
psraw m%8, 1 ; a2>>1
psubw m%8, m%7 ; (a2>>1) - a3
mova [%9+0x60], m%8
- mova m%5, m%3
- psraw m%5, 1
+ psraw m%5, m%3, 1
paddw m%5, m%3 ; d25+(d25>>1)
- mova m%7, m%1
- psubw m%7, m%4 ; a5 = d07-d34-(d25+(d25>>1))
+ psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
psubw m%7, m%5
- mova m%5, m%2
- psraw m%5, 1
+ psraw m%5, m%2, 1
paddw m%5, m%2 ; d16+(d16>>1)
- mova m%8, m%1
- paddw m%8, m%4
+ paddw m%8, m%1, m%4
psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
- mova m%5, m%1
- psraw m%5, 1
+ psraw m%5, m%1, 1
paddw m%5, m%1 ; d07+(d07>>1)
paddw m%5, m%2
paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
- mova m%1, m%4
- psraw m%1, 1
+ psraw m%1, m%4, 1
paddw m%1, m%4 ; d34+(d34>>1)
paddw m%1, m%2
psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
- mova m%4, m%1
- psraw m%4, 2
+ psraw m%4, m%1, 2
paddw m%4, m%5 ; a4 + (a7>>2)
- mova m%3, m%8
- psraw m%3, 2
+ psraw m%3, m%8, 2
paddw m%3, m%7 ; a5 + (a6>>2)
psraw m%5, 2
psraw m%7, 2
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
- mova m%1, m%3
- mova m%5, m%7
- psraw m%3, 1
- psraw m%7, 1
- psubw m%3, m%5
- paddw m%7, m%1
- mova m%5, m%2
- psraw m%5, 1
- paddw m%5, m%2
- paddw m%5, m%4
- paddw m%5, m%6
- mova m%1, m%6
- psraw m%1, 1
- paddw m%1, m%6
- paddw m%1, m%8
- psubw m%1, m%2
+ psraw m%1, m%3, 1
+ psraw m%5, m%7, 1
+ psubw m%1, m%7
+ paddw m%5, m%3
+ psraw m%7, m%2, 1
+ paddw m%7, m%2
+ paddw m%7, m%4
+ paddw m%7, m%6
+ psraw m%3, m%6, 1
+ paddw m%3, m%6
+ paddw m%3, m%8
+ psubw m%3, m%2
psubw m%2, m%4
psubw m%6, m%4
paddw m%2, m%8
psraw m%8, 1
psubw m%2, m%4
psubw m%6, m%8
- mova m%4, m%5
- mova m%8, m%1
- psraw m%4, 2
- psraw m%8, 2
+ psraw m%4, m%7, 2
+ psraw m%8, m%3, 2
paddw m%4, m%6
paddw m%8, m%2
psraw m%6, 2
psraw m%2, 2
- psubw m%5, m%6
- psubw m%2, m%1
- mova m%1, [%9+0x00]
+ psubw m%7, m%6
+ psubw m%2, m%3
+ mova m%3, [%9+0x00]
mova m%6, [%9+0x40]
- SUMSUB_BA w, m%6, m%1
- SUMSUB_BA w, m%7, m%6
- SUMSUB_BA w, m%3, m%1
- SUMSUB_BA w, m%5, m%7
- SUMSUB_BA w, m%2, m%3
- SUMSUB_BA w, m%8, m%1
- SUMSUB_BA w, m%4, m%6
+ SUMSUB_BA w, %6, %3
+ SUMSUB_BA w, %5, %6
+ SUMSUB_BA w, %1, %3
+ SUMSUB_BA w, %7, %5
+ SUMSUB_BA w, %2, %1
+ SUMSUB_BA w, %8, %3
+ SUMSUB_BA w, %4, %6
+ SWAP %1, %3
+ SWAP %5, %7
SWAP %1, %5, %6
SWAP %3, %8, %7
%endmacro
%macro ADD_STORE_ROW 3
movq m1, [r0+%1*FDEC_STRIDE]
- movq m2, m1
+ punpckhbw m2, m1, m0
punpcklbw m1, m0
- punpckhbw m2, m0
paddw m1, %2
paddw m2, %3
packuswb m1, m2
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
-INIT_XMM
%macro DCT_SUB8 1
cglobal sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
ret
%endmacro
+INIT_XMM
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
+INIT_AVX
+DCT_SUB8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal add8x8_idct_sse2, 2,2
+%macro ADD8x8 1
+cglobal add8x8_idct_%1, 2,2
add r0, 4*FDEC_STRIDE
-global add8x8_idct_sse2.skip_prologue
+global add8x8_idct_%1.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
+%endmacro ; ADD8x8
+
+INIT_XMM
+ADD8x8 sse2
+INIT_AVX
+ADD8x8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal add8x8_idct8_sse2, 2,2
+%macro ADD8x8_IDCT8 1
+cglobal add8x8_idct8_%1, 2,2
add r0, 4*FDEC_STRIDE
-global add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_%1.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
+%endmacro ; ADD8x8_IDCT8
+
+INIT_XMM
+ADD8x8_IDCT8 sse2
+INIT_AVX
+ADD8x8_IDCT8 avx
%endif ; !HIGH_BIT_DEPTH
INIT_XMM
%macro DCT8_1D 10
- SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
- SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
- SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
- SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07
+ SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
+ SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
+ SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
+ SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07
- SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
- SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
+ SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
+ SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2
- movdqa m%9, m%1
- psraw m%9, 1
+ psraw m%9, m%1, 1
paddw m%9, m%1
paddw m%9, m%2
paddw m%9, m%3 ; %9=a4
- movdqa m%10, m%4
- psraw m%10, 1
+ psraw m%10, m%4, 1
paddw m%10, m%4
paddw m%10, m%2
psubw m%10, m%3 ; %10=a7
- SUMSUB_BA w, m%4, m%1
+ SUMSUB_BA w, %4, %1
psubw m%1, m%3
psubw m%4, m%2
psraw m%3, 1
psubw m%1, m%3 ; %1=a5
psubw m%4, m%2 ; %4=a6
- movdqa m%2, m%10
- psraw m%2, 2
+ psraw m%2, m%10, 2
paddw m%2, m%9 ; %2=b1
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
- SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
+ SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
- movdqa m%3, m%7
- psraw m%3, 1
+ psraw m%3, m%7, 1
paddw m%3, m%8 ; %3=b2
psraw m%8, 1
psubw m%8, m%7 ; %8=b6
- movdqa m%7, m%4
- psraw m%7, 2
+ psraw m%7, m%4, 2
paddw m%7, m%1 ; %7=b3
psraw m%1, 2
psubw m%4, m%1 ; %4=b5
%endmacro
%macro IDCT8_1D 10
- SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
+ SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
- movdqa m%9, m%2
- psraw m%9, 1
+ psraw m%9, m%2, 1
paddw m%9, m%2
paddw m%9, m%4
- paddw m%9, m%6 ; %9=a7
+ paddw m%9, m%6 ; %9=a7
- movdqa m%10, m%3
- psraw m%3, 1
- psubw m%3, m%7 ; %3=a4
+ psraw m%10, m%3, 1
+ psubw m%10, m%7 ; %10=a4
psraw m%7, 1
- paddw m%7, m%10 ; %7=a6
+ paddw m%7, m%3 ; %7=a6
- movdqa m%10, m%6
- psraw m%10, 1
- paddw m%10, m%6
- paddw m%10, m%8
- psubw m%10, m%2 ; %10=a5
+ psraw m%3, m%6, 1
+ paddw m%3, m%6
+ paddw m%3, m%8
+ psubw m%3, m%2 ; %3=a5
psubw m%2, m%4
psubw m%6, m%4
psubw m%6, m%8
psraw m%4, 1
psraw m%8, 1
- psubw m%2, m%4 ; %2=a3
- psubw m%6, m%8 ; %6=a1
+ psubw m%2, m%4 ; %2=a3
+ psubw m%6, m%8 ; %6=a1
- movdqa m%4, m%9
- psraw m%4, 2
- paddw m%4, m%6 ; %4=b1
+ psraw m%4, m%9, 2
+ paddw m%4, m%6 ; %4=b1
psraw m%6, 2
- psubw m%9, m%6 ; %9=b7
+ psubw m%9, m%6 ; %9=b7
- SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
- SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4
+ SUMSUB_BA w, %7, %5, %6 ; %7=b0, %5=b6
+ SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4
- movdqa m%8, m%10
- psraw m%8, 2
+ psraw m%8, m%3, 2
paddw m%8, m%2 ; %8=b3
psraw m%2, 2
- psubw m%2, m%10 ; %2=b5
+ psubw m%2, m%3 ; %2=b5
- SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
- SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
- SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
- SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4
+ SUMSUB_BA w, %9, %7, %6 ; %9=c0, %7=c7
+ SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
+ SUMSUB_BA w, %8, %1, %6 ; %8=c2, %1=c5
+ SUMSUB_BA w, %4, %5, %6 ; %4=c3, %5=c4
- SWAP %1, %9, %6
- SWAP %3, %8, %7
+ SWAP %10, %3
+ SWAP %1, %9, %6
+ SWAP %3, %8, %7
%endmacro
%macro DCT_SUB8 1
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
+INIT_AVX
+DCT_SUB8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal add8x8_idct8_sse2, 2,2,11
+%macro ADD8x8_IDCT8 1
+cglobal add8x8_idct8_%1, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
+%endmacro ; ADD8x8_IDCT8
+
+INIT_XMM
+ADD8x8_IDCT8 sse2
+INIT_AVX
+ADD8x8_IDCT8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal add8x8_idct_sse2, 2,2,11
+%macro ADD8x8 1
+cglobal add8x8_idct_%1, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global add8x8_idct_sse2.skip_prologue
+global add8x8_idct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
+%endmacro ; ADD8x8
+
+INIT_XMM
+ADD8x8 sse2
+INIT_AVX
+ADD8x8 avx
%endif ; !HIGH_BIT_DEPTH
cextern pd_32
%macro WALSH4_1D 6
- SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%6
- SUMSUB_BADC %1, m%5, m%3, m%4, m%2, m%6
+ SUMSUB_BADC %1, %5, %4, %3, %2, %6
+ SUMSUB_BADC %1, %5, %3, %4, %2, %6
SWAP %2, %5, %4
%endmacro
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
-cglobal dct4x4dc_sse2, 1,1,5
+%macro DCT4x4_DC 1
+cglobal dct4x4dc_%1, 1,1,5
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
mova [r0+32], m2
mova [r0+48], m3
RET
+%endmacro ; DCT4x4_DC
+
+INIT_XMM
+DCT4x4_DC sse2
+INIT_AVX
+DCT4x4_DC avx
%else
INIT_MMX
movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
WALSH4_1D w, 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- SUMSUB_BADC w, m1, m0, m3, m2, m4
+ SUMSUB_BADC w, 1, 0, 3, 2, 4
SWAP 0, 1
SWAP 2, 3
SUMSUB_17BIT 0,2,4,7
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal idct4x4dc_sse2, 1,1
+%macro IDCT4x4DC 1
+cglobal idct4x4dc_%1, 1,1
mova m3, [r0+48]
mova m2, [r0+32]
mova m1, [r0+16]
mova [r0+32], m2
mova [r0+48], m3
RET
+%endmacro ; IDCT4x4DC
+
+INIT_XMM
+IDCT4x4DC sse2
+INIT_AVX
+IDCT4x4DC avx
%else
INIT_MMX
DCT4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- SUMSUB_BADC w, m3, m0, m2, m1
- SUMSUB_BA w, m2, m3, m4
+ SUMSUB_BADC w, 3, 0, 2, 1
+ SUMSUB_BA w, 2, 3, 4
DCT_UNPACK m2, m4, m5
DCT_UNPACK m3, m6, m7
mova [r0+ 0], m2 ; s03 + s12
DCT_UNPACK m0, m2, m4
DCT_UNPACK m1, m3, m5
- SUMSUB2_AB d, m0, m1, m4
- SUMSUB2_AB d, m2, m3, m5
+ SUMSUB2_AB d, 0, 1, 4
+ SUMSUB2_AB d, 2, 3, 5
mova [r0+16], m0 ; d03*2 + d12
mova [r0+24], m2
mova [r0+48], m4 ; d03 - 2*d12
movhps %6, %1
%endmacro
-INIT_XMM
-cglobal add4x4_idct_sse2, 2,2,6
+%macro ADD4x4_IDCT 1
+cglobal add4x4_idct_%1, 2,2,6
add r0, 4*FDEC_STRIDE
.skip_prologue:
mova m1, [r1+16]
STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
RET
-%else
+%endmacro
+
+INIT_XMM
+ADD4x4_IDCT sse2
+INIT_AVX
+ADD4x4_IDCT avx
+
+%else ; !HIGH_BIT_DEPTH
cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
-INIT_XMM
-cglobal add4x4_idct_sse4, 2,2,6
- mova m0, [r1+0x00] ; row1/row0
- mova m2, [r1+0x10] ; row3/row2
- mova m1, m0 ; row1/row0
- psraw m0, 1 ; row1>>1/...
- mova m3, m2 ; row3/row2
- psraw m2, 1 ; row3>>1/...
+%macro ADD4x4 1
+cglobal add4x4_idct_%1, 2,2,6
+ mova m1, [r1+0x00] ; row1/row0
+ mova m3, [r1+0x10] ; row3/row2
+ psraw m0, m1, 1 ; row1>>1/...
+ psraw m2, m3, 1 ; row3>>1/...
movsd m0, m1 ; row1>>1/row0
movsd m2, m3 ; row3>>1/row2
psubw m0, m3 ; row1>>1-row3/row0-2
paddw m2, m1 ; row3>>1+row1/row0+2
SBUTTERFLY2 wd, 0, 2, 1
- SUMSUB_BA w, m2, m0, m1
+ SUMSUB_BA w, 2, 0, 1
pshuflw m1, m2, 10110001b
pshufhw m2, m2, 10110001b
punpckldq m1, m0
punpckhdq m2, m0
- SWAP 0, 1
+ SWAP 0, 1
mova m1, [pw_32_0]
paddw m1, m0 ; row1/row0 corrected
psraw m0, 1 ; row1>>1/...
- mova m3, m2 ; row3/row2
- psraw m2, 1 ; row3>>1/...
+ psraw m3, m2, 1 ; row3>>1/...
movsd m0, m1 ; row1>>1/row0
- movsd m2, m3 ; row3>>1/row2
- psubw m0, m3 ; row1>>1-row3/row0-2
- paddw m2, m1 ; row3>>1+row1/row0+2
- SBUTTERFLY2 qdq, 0, 2, 1
- SUMSUB_BA w, m2, m0, m1
+ movsd m3, m2 ; row3>>1/row2
+ psubw m0, m2 ; row1>>1-row3/row0-2
+ paddw m3, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 qdq, 0, 3, 1
+ SUMSUB_BA w, 3, 0, 1
movd m4, [r0+FDEC_STRIDE*0]
movd m1, [r0+FDEC_STRIDE*1]
- movd m3, [r0+FDEC_STRIDE*2]
+ movd m2, [r0+FDEC_STRIDE*2]
movd m5, [r0+FDEC_STRIDE*3]
punpckldq m1, m4 ; row0/row1
pxor m4, m4
- punpckldq m3, m5 ; row3/row2
+ punpckldq m2, m5 ; row3/row2
punpcklbw m1, m4
- psraw m2, 6
- punpcklbw m3, m4
+ psraw m3, 6
+ punpcklbw m2, m4
psraw m0, 6
- paddsw m2, m1
- paddsw m0, m3
- packuswb m0, m2 ; row0/row1/row3/row2
+ paddsw m3, m1
+ paddsw m0, m2
+ packuswb m0, m3 ; row0/row1/row3/row2
pextrd [r0+FDEC_STRIDE*0], m0, 3
pextrd [r0+FDEC_STRIDE*1], m0, 2
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
+%endmacro ; ADD4x4
+
+INIT_XMM
+ADD4x4 sse4
+INIT_AVX
+ADD4x4 avx
%endif ; HIGH_BIT_DEPTH
INIT_MMX
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
+INIT_AVX
+ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx.skip_prologue, 64, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx ,add8x8_idct_avx.skip_prologue, 64, 16, 8, 8
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
%endif
INIT_XMM
-
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
+cextern sub8x8_dct_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx.skip_prologue, 128, 8, 0, 0
+
cextern add8x8_idct_sse2.skip_prologue
-ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+cextern add8x8_idct_avx.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx.skip_prologue, 128, 8, 0, 0
-cextern sub8x8_dct8_sse2.skip_prologue
cextern add8x8_idct8_sse2.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
+cextern sub8x8_dct8_avx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx.skip_prologue, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
mova [%1+FDEC_STRIDEB*3], %2
%endmacro
-INIT_XMM
-cglobal add8x8_idct_dc_sse2, 2,2,7
+%macro ADD_IDCT_DC 1
+cglobal add8x8_idct_dc_%1, 2,2,7
mova m6, [pw_pixel_max]
pxor m5, m5
mova m3, [r1]
ADD_DC r0+FDEC_STRIDEB*4, m3
RET
-cglobal add16x16_idct_dc_sse2, 2,3,8
+cglobal add16x16_idct_dc_%1, 2,3,8
mov r2, 4
mova m6, [pw_pixel_max]
mova m7, [pd_32]
dec r2
jg .loop
REP_RET
+%endmacro ; ADD_IDCT_DC
+
+INIT_XMM
+ADD_IDCT_DC sse2
+INIT_AVX
+ADD_IDCT_DC avx
%else ;!HIGH_BIT_DEPTH
%macro ADD_DC 3
add r1, 16
punpcklwd xmm0, xmm0
punpcklwd xmm2, xmm2
- pxor xmm1, xmm1
pxor xmm3, xmm3
paddw xmm0, [pw_32]
paddw xmm2, [pw_32]
psraw xmm0, 6
psraw xmm2, 6
- psubw xmm1, xmm0
- psubw xmm3, xmm2
+ psubw xmm1, xmm3, xmm0
packuswb xmm0, xmm1
+ psubw xmm3, xmm2
+ punpckhbw xmm1, xmm0, xmm0
packuswb xmm2, xmm3
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
+ punpckhbw xmm3, xmm2, xmm2
punpcklbw xmm0, xmm0
punpcklbw xmm2, xmm2
- punpckhbw xmm1, xmm1
- punpckhbw xmm3, xmm3
IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
IDCT_DC_STORE 0, xmm2, xmm3
ret
-cglobal add16x16_idct_dc_ssse3, 2,2,8
+%macro ADD16x16 1
+cglobal add16x16_idct_dc_%1, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
movdqa xmm6, [pb_idctdc_unpack2]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
+ pshufb xmm2, xmm0, xmm6
pshufb xmm0, xmm5
- pshufb xmm2, xmm6
+ pshufb xmm3, xmm1, xmm6
pshufb xmm1, xmm5
- pshufb xmm3, xmm6
IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
IDCT_DC_STORE 0, xmm2, xmm3
ret
+%endmacro ; ADD16x16
+
+INIT_XMM
+ADD16x16 ssse3
+INIT_AVX
+ADD16x16 avx
%endif ; HIGH_BIT_DEPTH
movdq2q mm5, xmm5
PALIGNR xmm6, xmm6, 10, xmm3
movdq2q mm6, xmm6
-%ifidn %1, ssse3
+%ifnidn %1, sse2
PALIGNR xmm7, xmm7, 8, xmm3
movdq2q mm7, xmm7
%else
movq [r0+2*56], mm5
movq [r0+2*60], mm3
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
+ punpckhdq xmm3, xmm0, xmm2
punpckldq xmm0, xmm2
+ punpckhdq xmm7, xmm4, xmm6
punpckldq xmm4, xmm6
- punpckhdq xmm3, xmm2
- punpckhdq xmm7, xmm6
pshufhw xmm0, xmm0, 0x1b
pshuflw xmm4, xmm4, 0x1b
pshufhw xmm3, xmm3, 0x1b
movu m2, [r1+14*SIZEOF_DCTCOEF]
movu m3, [r1+21*SIZEOF_DCTCOEF]
mova m4, [r1+28*SIZEOF_DCTCOEF]
- mova m5, m0
- mova m6, m1
+ punpckl%5 m5, m0, m1
psrl%3 m0, %2
+ punpckh%5 m6, m1, m0
+ punpckl%4 m5, m0
punpckl%4 m1, m1
- punpckl%5 m5, m6
punpckh%5 m1, m3
- punpckh%5 m6, m0
- punpckl%4 m5, m0
mova m7, [r1+52*SIZEOF_DCTCOEF]
mova m0, [r1+60*SIZEOF_DCTCOEF]
punpckh%5 m1, m2
movu [r0+47*SIZEOF_DCTCOEF], m4
punpckh%5 m7, m0
psll%3 m0, %2
- mova m3, m5
+ punpckh%4 m3, m5, m5
punpckl%5 m5, m1
punpckh%5 m1, m2
- punpckh%4 m3, m3
mova [r0+52*SIZEOF_DCTCOEF], m6
movu [r0+13*SIZEOF_DCTCOEF], m5
movu m4, [r1+11*SIZEOF_DCTCOEF]
punpckl%5 m6, m7
punpckh%5 m1, m3
punpckh%5 m5, m7
- mova m3, m6
- mova m7, m5
+ punpckh%4 m3, m6, m4
+ punpckh%4 m7, m5, m1
punpckl%4 m6, m4
punpckl%4 m5, m1
- punpckh%4 m3, m4
- punpckh%4 m7, m1
movu m4, [r1+35*SIZEOF_DCTCOEF]
movu m1, [r1+49*SIZEOF_DCTCOEF]
pshuf%6 m6, m6, 0x1b
mova [r0+32*SIZEOF_DCTCOEF], m7
movu [r0+10*SIZEOF_DCTCOEF], m6
movu [r0+21*SIZEOF_DCTCOEF], m5
- mova m3, m0
- mova m7, m2
+ punpckh%5 m3, m0, m4
+ punpckh%5 m7, m2, m1
punpckl%5 m0, m4
punpckl%5 m2, m1
- punpckh%5 m3, m4
- punpckh%5 m7, m1
- mova m4, m2
- mova m1, m7
+ punpckl%4 m4, m2, m0
+ punpckl%4 m1, m7, m3
punpckh%4 m2, m0
punpckh%4 m7, m3
- punpckl%4 m4, m0
- punpckl%4 m1, m3
pshuf%6 m2, m2, 0x1b
pshuf%6 m7, m7, 0x1b
mova [r0+28*SIZEOF_DCTCOEF], m4
%ifdef HIGH_BIT_DEPTH
INIT_XMM
SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
+INIT_AVX
+SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
%else
INIT_MMX
SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
mova m1, [r1+ 4*SIZEOF_DCTCOEF]
mova m2, [r1+ 8*SIZEOF_DCTCOEF]
mova m3, [r1+12*SIZEOF_DCTCOEF]
- mova m4, m0
+ punpckl%5 m4, m0, m1
mova m5, m1
mova m6, m2
mova m7, m3
psrl%3 m0, %2
punpckl%4 m2, m2
punpckh%4 m1, m1
- punpckl%5 m4, m5
punpckl%5 m5, m3
punpckl%4 m4, m0
punpckh%5 m5, m2
%ifdef HIGH_BIT_DEPTH
INIT_XMM
SCAN_4x4 sse2, 4 , dq, qdq, dq
+INIT_AVX
+SCAN_4x4 avx , 4 , dq, qdq, dq
%else
INIT_MMX
SCAN_4x4 mmx , 16, q , dq , wd
-%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal zigzag_scan_4x4_frame_ssse3, 2,2
+%macro SCAN_4x4_FRAME 1
+cglobal zigzag_scan_4x4_frame_%1, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
pshufb xmm1, [pb_scan4frameb]
pshufb xmm0, [pb_scan4framea]
- movdqa xmm2, xmm1
- psrldq xmm1, 6
- palignr xmm2, xmm0, 6
+ psrldq xmm2, xmm1, 6
+ palignr xmm1, xmm0, 6
pslldq xmm0, 10
- palignr xmm1, xmm0, 10
- movdqa [r0], xmm2
- movdqa [r0+16], xmm1
+ palignr xmm2, xmm0, 10
+ movdqa [r0], xmm1
+ movdqa [r0+16], xmm2
RET
+%endmacro
+
+INIT_XMM
+SCAN_4x4_FRAME ssse3
+INIT_AVX
+SCAN_4x4_FRAME avx
+%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
INIT_XMM
pshuf%2 m2, m2, 000111001b ; 08 11 10 09
punpckl%3 m3, m1 ; 05 03 04 03
pinsr%2 m0, r2d, 3 ; 08 02 01 00
- mova m4, m2
- punpckl%3 m2, m3 ; 04 10 03 09
- pshuf%2 m2, m2, 010110100b ; 10 04 03 09
+ punpckl%3 m4, m2, m3 ; 04 10 03 09
+ pshuf%2 m4, m4, 010110100b ; 10 04 03 09
mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
- mova [r0+ 4*SIZEOF_DCTCOEF], m2 ; 10 04 03 09
+ mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
punpckl%4 m6, m5 ; 17 16 XX XX
psrl%5 m1, %6 ; XX 07 06 05
- punpckh%3 m6, m4 ; 08 17 11 16
+ punpckh%3 m6, m2 ; 08 17 11 16
punpckl%4 m6, m1 ; 06 05 11 16
mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
psrl%5 m1, %6 ; XX XX 07 06
punpckl%3 m1, m5 ; 17 07 16 06
mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
- mova m6, m3
punpckh%4 m1, m1 ; 17 07 17 07
- punpckl%3 m6, m2 ; 25 13 24 12
+ punpckl%3 m6, m3, m2 ; 25 13 24 12
pextr%2 r2d, m5, 2
mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
punpckl%3 m1, m6 ; 24 17 12 07
mova [r0+48*SIZEOF_DCTCOEF], m7
mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
- mova m2, m0
mova m7, [r1+60*SIZEOF_DCTCOEF]
- punpckl%4 m2, m1 ; 53 52 57 56
+ punpckl%4 m2, m0, m1 ; 53 52 57 56
punpckh%4 m1, m0 ; 59 58 55 54
mova [r0+52*SIZEOF_DCTCOEF], m2
mova [r0+56*SIZEOF_DCTCOEF], m1
%ifdef HIGH_BIT_DEPTH
INIT_XMM
SCAN_8x8 sse4 , d, dq, qdq, dq, 4
+INIT_AVX
+SCAN_8x8 avx , d, dq, qdq, dq, 4
%else
INIT_MMX
SCAN_8x8 mmxext, w, wd, dq , q , 16
;-----------------------------------------------------------------------------
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
-%macro ZIGZAG_SUB_4x4 2
+%macro ZIGZAG_SUB_4x4 3
%ifidn %1, ac
-cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2_%3, 4,4,8
%else
-cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2_%3, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
pshufb xmm0, xmm7
pshufb xmm4, xmm7
pxor xmm6, xmm6
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
+ punpckhbw xmm1, xmm0, xmm6
+ punpckhbw xmm5, xmm4, xmm6
punpcklbw xmm0, xmm6
- punpckhbw xmm1, xmm6
punpcklbw xmm4, xmm6
- punpckhbw xmm5, xmm6
psubw xmm0, xmm4
psubw xmm1, xmm5
%ifidn %1, ac
RET
%endmacro
-ZIGZAG_SUB_4x4 , frame
-ZIGZAG_SUB_4x4 ac, frame
-ZIGZAG_SUB_4x4 , field
-ZIGZAG_SUB_4x4 ac, field
+INIT_XMM
+ZIGZAG_SUB_4x4 , frame, ssse3
+ZIGZAG_SUB_4x4 ac, frame, ssse3
+ZIGZAG_SUB_4x4 , field, ssse3
+ZIGZAG_SUB_4x4 ac, field, ssse3
+INIT_AVX
+ZIGZAG_SUB_4x4 , frame, avx
+ZIGZAG_SUB_4x4 ac, frame, avx
+ZIGZAG_SUB_4x4 , field, avx
+ZIGZAG_SUB_4x4 ac, field, avx
;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
por m7, m3
por m5, m0
%else
- SWAP m5, m0
- SWAP m6, m2
- SWAP m7, m3
+ SWAP 5, 0
+ SWAP 6, 2
+ SWAP 7, 3
%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
ZIGZAG_8x8_CAVLC sse2, D
+INIT_AVX
+ZIGZAG_8x8_CAVLC avx , D
%else
INIT_MMX
ZIGZAG_8x8_CAVLC mmx , W
%endmacro
%ifndef HIGH_BIT_DEPTH
-INIT_XMM
-cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+%macro ZIGZAG_8x8_CAVLC 1
+cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
packsswb m2, m3
shr r0d, 16
mov [r2+8], r0w
RET
+%endmacro
+
+INIT_XMM
+ZIGZAG_8x8_CAVLC sse2
+INIT_AVX
+ZIGZAG_8x8_CAVLC avx
%endif ; !HIGH_BIT_DEPTH
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add4x4_idct_avx ( pixel *p_dst, dctcoef dct [16] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
+void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
+void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] );
+void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] );
void x264_dct4x4dc_mmx ( int16_t d[16] );
void x264_dct4x4dc_sse2 ( int32_t d[16] );
+void x264_dct4x4dc_avx ( int32_t d[16] );
void x264_idct4x4dc_mmx ( int16_t d[16] );
void x264_idct4x4dc_sse2 ( int32_t d[16] );
+void x264_idct4x4dc_avx ( int32_t d[16] );
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_avx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_avx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
+void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] );
+void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );
+void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
+int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
+int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
- mova %5, %2
- mova %4, %1
- psubusw %5, %1
- psubusw %4, %2
+ psubusw %5, %2, %1
+ psubusw %4, %1, %2
por %4, %5
psubw %4, %3
%endmacro
; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
- mova %4, %2
- mova %5, %1
- psubusw %4, %1
- psubusw %5, %2
+ psubusw %4, %2, %1
+ psubusw %5, %1, %2
por %5, %4 ; |%1-%2|
pxor %4, %4
psubw %5, %3 ; |%1-%2|-%3
; out: %1=p0', m2=q0'
%macro DEBLOCK_P0_Q0 7
psubw %3, %4
- mova %6, %2
pxor %7, %7
paddw %3, [pw_4]
psubw %7, %5
- psubw %6, %1
+ psubw %6, %2, %1
psllw %6, 2
paddw %3, %6
psraw %3, 3
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
- mova %6, %3
- pavgw %6, %4 ; (p0+q0+1)>>1
+ pavgw %6, %3, %4 ; (p0+q0+1)>>1
paddw %1, %6
pxor %6, %6
psraw %1, 1
movq m6, [r0+r1*2+0]
movq m1, [r0+r3-8]
TRANSPOSE4x4W 2, 5, 0, 1, 4
- SWAP m2, m7
+ SWAP 2, 7
movq m7, [r0+r3]
TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
pand m4, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
- SWAP m0, m8
- SWAP m3, m9
+ SWAP 0, 8
+ SWAP 3, 9
%endmacro
-cglobal deblock_v_luma_sse2, 5,5,15
+%macro DEBLOCK_LUMA_64 1
+cglobal deblock_v_luma_%1, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
jg .loop
REP_RET
-cglobal deblock_h_luma_sse2, 5,7,15
+cglobal deblock_h_luma_%1, 5,7,15
add r1, r1
LOAD_AB m12, m13, r2, r3
mov r2, r1
dec r6
jg .loop
REP_RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA_64 sse2
+INIT_AVX
+DEBLOCK_LUMA_64 avx
%endif
%macro SWAPMOVA 2
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+ paddw t0, %3, %2
+ mova t2, %4
+ paddw t2, %3
+%else
mova t0, %3
mova t2, %4
paddw t0, %2
paddw t2, %3
+%endif
paddw t0, %1
paddw t2, t2
paddw t0, %5
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
- mova t1, t0
psrlw t2, 3
- psrlw t1, 2
+ psrlw t1, t0, 2
psubw t2, %3
psubw t1, %2
pand t2, %8
paddw t1, %2
SWAPMOVA %11, t1
- mova t1, t0
- psubw t1, %3
+ psubw t1, t0, %3
paddw t0, t0
psubw t1, %5
psubw t0, %3
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
+%ifdef ARCH_X86_64
+ mova %2, t0 ; mask0
+ psrlw t3, %1, 2
+%else
mova t3, %1
mova %2, t0 ; mask0
psrlw t3, 2
+%endif
paddw t3, [pw_2] ; alpha/4+2
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
pand t2, %2
%endmacro
%ifdef ARCH_X86_64
-INIT_XMM
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_sse2, 4,7,16
+%macro DEBLOCK_LUMA_INTRA_64 1
+cglobal deblock_v_luma_intra_%1, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_sse2, 4,7,16
+cglobal deblock_h_luma_intra_%1, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
jg .loop
ADD rsp, pad
RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA_INTRA_64 sse2
+INIT_AVX
+DEBLOCK_LUMA_INTRA_64 avx
+
%endif
%macro DEBLOCK_LUMA_INTRA 1
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
+DEBLOCK_LUMA_INTRA avx
%endif
%endif ; HIGH_BIT_DEPTH
punpckl%2 m4, m5
punpckh%2 m6, m5
- mova m1, m0
- mova m3, m2
+ punpckh%3 m1, m0, m4
+ punpckh%3 m3, m2, m6
punpckl%3 m0, m4
- punpckh%3 m1, m4
punpckl%3 m2, m6
- punpckh%3 m3, m6
%endmacro
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
- mova m4, m0
- mova m5, m1
- mova m6, m2
- punpckhdq m4, m4
- punpckhdq m5, m5
- punpckhdq m6, m6
+ punpckhdq m4, m0, m0
+ punpckhdq m5, m1, m1
+ punpckhdq m6, m2, m2
punpcklbw m0, m1
punpcklbw m2, m3
- mova m1, m0
- punpcklwd m0, m2
- punpckhwd m1, m2
- movh %1, m0
- punpckhdq m0, m0
- movh %2, m0
- movh %3, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ movh %1, m1
punpckhdq m1, m1
- movh %4, m1
+ movh %2, m1
+ movh %3, m0
+ punpckhdq m0, m0
+ movh %4, m0
punpckhdq m3, m3
punpcklbw m4, m5
punpcklbw m6, m3
- mova m5, m4
- punpcklwd m4, m6
- punpckhwd m5, m6
- movh %5, m4
- punpckhdq m4, m4
- movh %6, m4
- movh %7, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ movh %5, m5
punpckhdq m5, m5
- movh %8, m5
+ movh %6, m5
+ movh %7, m4
+ punpckhdq m4, m4
+ movh %8, m4
%endmacro
%macro TRANSPOSE4x8B_LOAD 8
%endmacro
%macro TRANSPOSE8x2W_STORE 8
- mova m0, m1
+ punpckhwd m0, m1, m2
punpcklwd m1, m2
- punpckhwd m0, m2
%if mmsize==8
- movd %1, m1
movd %3, m0
+ movd %1, m1
psrlq m1, 32
psrlq m0, 32
movd %2, m1
movd %4, m0
%else
- movd %1, m1
movd %5, m0
+ movd %1, m1
psrldq m1, 4
psrldq m0, 4
movd %2, m1
%endmacro
%macro SBUTTERFLY3 4
- movq %4, %2
+ punpckh%1 %4, %2, %3
punpckl%1 %2, %3
- punpckh%1 %4, %3
%endmacro
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
+%if avx_enabled == 0
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
+%else
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%endif
por %4, %5
psubusb %4, %3
%endmacro
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
+%ifdef ARCH_X86_64
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%else
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
+%endif
psubusb %5, %3
psubusb %4, %3
pcmpeqb %4, %5
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
- mova m5, m1
- pxor m5, m2 ; p0^q0
+ pxor m5, m1, m2 ; p0^q0
pand m5, [pb_1] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
- mova %6, m1
- pavgb %6, m2
+ pavgb %6, m1, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- mova %6, %1
- psubusb %6, %5
+ psubusb %6, %1, %5
paddusb %5, %1
pmaxub %2, %6
pminub %2, %5
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal deblock_v_luma_sse2, 5,5,10
+%macro DEBLOCK_LUMA 1
+cglobal deblock_v_luma_%1, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
movdqa m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
- mova m7, m8
- psubb m7, m6
+ psubb m7, m8, m6
pand m6, m8
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_%1, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
%ifdef WIN64
mov [rsp+0x20], r4
%endif
- call deblock_v_luma_sse2
+ call deblock_v_luma_%1
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
add rsp, 0x68
%endif
RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
%else
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
pand m4, [esp+%3] ; tc
- mova m7, m4
- psubb m7, m6
+ psubb m7, m4, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
+INIT_AVX
+DEBLOCK_LUMA avx, v, 16
%endif ; ARCH
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+ pavgb t0, p2, p1
+ pavgb t1, p0, q0
+%else
mova t0, p2
mova t1, p0
pavgb t0, p1
pavgb t1, q0
+%endif
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
+%ifdef ARCH_X86_64
+ paddb t2, p2, p1
+ paddb t3, p0, q0
+%else
mova t2, p2
mova t3, p0
paddb t2, p1
paddb t3, q0
+%endif
paddb t2, t3
mova t3, t2
mova t4, t2
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
+%ifdef ARCH_X86_64
+ pavgb t1, p2, q1
+ psubb t2, p2, q1
+%else
mova t1, p2
mova t2, p2
pavgb t1, q1
psubb t2, q1
+%endif
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
pand t2, mpb_1
pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
- mova t3, p0
- mova t2, p0
- pxor t3, q1
- pavgb t2, q1
+ pxor t3, p0, q1
+ pavgb t2, p0, q1
pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
mova %1, t1 ; store p0
mova t1, %4 ; p3
- mova t2, t1
+ paddb t2, t1, p2
pavgb t1, p2
- paddb t2, p2
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
+INIT_AVX
+DEBLOCK_LUMA_INTRA avx , v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
mova %6, [pw_2]
paddw %6, %3
paddw %6, %4
- mova %7, %6
+ paddw %7, %6, %2
paddw %6, %1
- paddw %7, %2
paddw %6, %3
paddw %7, %4
psraw %6, 2
punpckldq m2, m7 ; q0 ... q1 ...
punpckldq m4, m1
punpckldq m6, m3
- mova m1, m0
- mova m3, m2
+ punpckhqdq m1, m0, m4 ; p0
punpcklqdq m0, m4 ; p1
- punpckhqdq m1, m4 ; p0
+ punpckhqdq m3, m2, m6 ; q1
punpcklqdq m2, m6 ; q0
- punpckhqdq m3, m6 ; q1
%endif
%endmacro
%endif
INIT_XMM
DEBLOCK_CHROMA sse2
+INIT_AVX
+DEBLOCK_CHROMA avx
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
INIT_XMM
DEBLOCK_CHROMA sse2
+INIT_AVX
+DEBLOCK_CHROMA avx
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_CHROMA mmxext
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
- mova m4, %1
- pxor m4, %3
+ pxor m4, %1, %3
pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
%define t5 r4
INIT_XMM
DEBLOCK_CHROMA_INTRA sse2
+INIT_AVX
+DEBLOCK_CHROMA_INTRA avx
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_CHROMA_INTRA mmxext
shufps m2, m1, 0xdd ; cur nnz, all rows
pslldq m1, 1
shufps m0, m1, 0xdd ; left neighbors
- mova m1, m2
+ pslldq m1, m2, 4
movd m3, [%1-8] ; could be palignr if nnz was aligned
- pslldq m1, 4
por m1, m3 ; top neighbors
%endmacro
por m5, m1
; Check mvs
-%ifidn %1, ssse3
- mova m3, [mv+4*8*0]
- mova m2, [mv+4*8*1]
- mova m0, m3
- mova m1, m2
- palignr m3, [mv+4*8*0-16], 12
- palignr m2, [mv+4*8*1-16], 12
+%ifnidn %1, sse2
+ mova m0, [mv+4*8*0]
+ mova m1, [mv+4*8*1]
+ palignr m3, m0, [mv+4*8*0-16], 12
+ palignr m2, m1, [mv+4*8*1-16], 12
psubw m0, m3
psubw m1, m2
packsswb m0, m1
- mova m3, [mv+4*8*2]
- mova m7, [mv+4*8*3]
- mova m2, m3
- mova m1, m7
- palignr m3, [mv+4*8*2-16], 12
- palignr m7, [mv+4*8*3-16], 12
+ mova m2, [mv+4*8*2]
+ mova m1, [mv+4*8*3]
+ palignr m3, m2, [mv+4*8*2-16], 12
+ palignr m7, m1, [mv+4*8*3-16], 12
psubw m2, m3
psubw m1, m7
packsswb m2, m1
paddb m1, m1
pmaxub m4, m0
pmaxub m5, m1
-%ifidn %1,ssse3
+%ifnidn %1, sse2
pshufb m4, [transpose_shuf]
%else
movhlps m3, m4
DEBLOCK_STRENGTH_XMM sse2
%define ABSB2 ABSB2_SSSE3
DEBLOCK_STRENGTH_XMM ssse3
+INIT_AVX
+DEBLOCK_STRENGTH_XMM avx
%endmacro
%ifdef HIGH_BIT_DEPTH
-
%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/4
movhps [%2+r1+x], m5
%else
WEIGHT %1+x, %1+x+mmsize/2
- SWAP m5, m7
+ SWAP 5, 7
WEIGHT %1+r3+x, %1+r3+x+mmsize/2
CLIPW m5, [pb_0], m4
CLIPW m7, [pb_0], m4
WEIGHTER 20, sse2
%ifdef HIGH_BIT_DEPTH
WEIGHTER 12, sse2
+INIT_AVX
+WEIGHTER 8, avx
+WEIGHTER 12, avx
+WEIGHTER 16, avx
+WEIGHTER 20, avx
%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
WEIGHTER 8, ssse3
WEIGHTER 16, ssse3
WEIGHTER 20, ssse3
+INIT_AVX
+WEIGHTER 8, avx
+WEIGHTER 16, avx
+WEIGHTER 20, avx
%endif
%macro OFFSET_OP 7
OFFSETPN 12, sse2
OFFSETPN 16, sse2
OFFSETPN 20, sse2
+INIT_AVX
+OFFSETPN 12, avx
+OFFSETPN 16, avx
+OFFSETPN 20, avx
%ifdef HIGH_BIT_DEPTH
+INIT_XMM
OFFSETPN 8, sse2
+INIT_AVX
+OFFSETPN 8, avx
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
AVGH 16, 8, mmxext
INIT_XMM
-
AVG_FUNC 4, movq, movq, sse2
AVGH 4, 8, sse2
AVGH 4, 4, sse2
%macro UNPACK_UNALIGNED 4
movu %1, [%4+0]
movu %2, [%4+4]
- mova %3, %1
+ punpckhwd %3, %1, %2
punpcklwd %1, %2
- punpckhwd %3, %2
- mova %2, %1
%if mmsize == 8
+ mova %2, %1
punpcklwd %1, %3
punpckhwd %2, %3
%else
+ shufps %2, %1, %3, 11011101b
shufps %1, %3, 10001000b
- shufps %2, %3, 11011101b
%endif
%endmacro
%else ; !HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
packssdw m0, m1
- SWAP m3, m0
+ SWAP 3, 0
ALIGN 4
.loop2:
%ifdef HIGH_BIT_DEPTH
.width8:
%ifdef ARCH_X86_64
%define multy0 m8
- SWAP m8, m5
+ SWAP 8, 5
%else
%define multy0 r0m
mova multy0, m5
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
%endif
pmaddwd m0, m7
pmaddwd m2, m7
pmaddwd m3, m7
packssdw m0, m2
packssdw m1, m3
- SWAP m4, m0
- SWAP m5, m1
+ SWAP 4, 0
+ SWAP 5, 1
add r3, r4
ALIGN 4
.loop4:
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
pmaddwd m0, m7
pmaddwd m2, m7
pmaddwd m1, m7
pmullw m4, m6
pmullw m5, m6
mova m2, [pw_32]
- mova m3, m2
+ paddw m3, m2, m5
paddw m2, m4
- paddw m3, m5
mova m4, m0
mova m5, m1
pmullw m0, multy0
movhps m0, [r3]
movhps m1, [r3+r6]
%endif
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
%endif ; HIGH_BIT_DEPTH
pmullw m0, m4
pmullw m1, m5
%endmacro ; MC_CHROMA
-%macro MC_CHROMA_SSSE3 0-1
-INIT_XMM
-cglobal mc_chroma_ssse3%1, 0,6,9
+%macro MC_CHROMA_SSSE3 1-2
+cglobal mc_chroma_%1, 0,6,9
MC_CHROMA_START
and r5d, 7
and t2d, 7
imul r5d, t0d ; (x*255+8)*(8-y)
movd m6, t2d
movd m7, r5d
-%ifidn %1, _cache64
+%ifidn %2, _cache64
mov t0d, r3d
and t0d, 7
%ifdef PIC
pshufb m1, m5
movu m3, [r3+r4*2]
pshufb m3, m5
- mova m2, m1
mova m4, m3
pmaddubsw m0, m7
+ pmaddubsw m2, m1, m7
pmaddubsw m1, m6
- pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, [pw_32]
paddw m2, [pw_32]
movu m1, [r3+8]
pshufb m1, m5
%ifdef ARCH_X86_64
- SWAP m8, m6
+ SWAP 8, 6
%define mult1 m8
%else
mova r0m, m6
MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2
+INIT_AVX
+MC_CHROMA avx
%else ; !HIGH_BIT_DEPTH
INIT_MMX
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
MC_CHROMA sse2_misalign
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
MC_CHROMA sse2
-MC_CHROMA_SSSE3
-MC_CHROMA_SSSE3 _cache64
+MC_CHROMA_SSSE3 ssse3
+MC_CHROMA_SSSE3 ssse3_cache64, _cache64
+INIT_AVX
+MC_CHROMA_SSSE3 avx ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH
%macro LOAD_ADD_2 6
mova %5, %3
mova %1, %4
- mova %6, %5
- mova %2, %1
+ punpckhbw %6, %5, m0
punpcklbw %5, m0
+ punpckhbw %2, %1, m0
punpcklbw %1, m0
- punpckhbw %6, m0
- punpckhbw %2, m0
paddw %1, %5
paddw %2, %6
%endmacro
mova [r0+r2-mmsize*1], m4
jl .loop
REP_RET
-%endmacro
+%endmacro ; HPEL_FILTER
INIT_MMX
HPEL_FILTER mmxext
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-INIT_MMX
-
-%macro HPEL_V 1-2 0
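+; %3 = nonzero to use the pmaddubsw (filt_mul) filter path (SSSE3/AVX)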
+%macro HPEL_V 3
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
add r0, r4
lea r2, [r2+r4*2]
neg r4
-%ifnidn %1, ssse3
- pxor m0, m0
-%else
+%if %3
mova m0, [filt_mul15]
+%else
+ pxor m0, m0
%endif
.loop:
-%ifidn %1, ssse3
+%if %3
mova m1, [r1]
mova m4, [r1+r3]
mova m2, [r5+r3*2]
LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2 m1, m2, m3, m4, m5, m6
%endif
mova m7, [pw_16]
jl .loop
REP_RET
%endmacro
-HPEL_V mmxext
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
mova m1, [src]
.loop:
mova m2, [src+16]
- mova m4, m1
- PALIGNR m4, m0, 12, m7
- mova m5, m1
- PALIGNR m5, m0, 14, m0
- mova m0, m2
- PALIGNR m0, m1, 6, m7
+ PALIGNR m4, m1, m0, 12, m7
+ PALIGNR m5, m1, m0, 14, m0
+ PALIGNR m0, m2, m1, 6, m7
paddw m4, m0
- mova m0, m2
- PALIGNR m0, m1, 4, m7
+ PALIGNR m0, m2, m1, 4, m7
paddw m5, m0
- mova m6, m2
- PALIGNR m6, m1, 2, m7
+ PALIGNR m6, m2, m1, 2, m7
paddw m6, m1
FILT_H m4, m5, m6
PALIGNR m2, m1, 12, m7
PALIGNR m5, m1, 14, m1
mova m1, [src+32]
- mova m3, m1
- PALIGNR m3, m0, 6, m7
+ PALIGNR m3, m1, m0, 6, m7
paddw m3, m2
- mova m6, m1
- PALIGNR m6, m0, 4, m7
+ PALIGNR m6, m1, m0, 4, m7
paddw m5, m6
- mova m6, m1
- PALIGNR m6, m0, 2, m7
+ PALIGNR m6, m1, m0, 2, m7
paddw m6, m0
FILT_H m3, m5, m6
%endif
mova m7, [pw_16]
.loop:
mova m2, [src+16]
- mova m3, m1
- palignr m3, m0, 14
- mova m4, m1
- palignr m4, m0, 15
- mova m0, m2
- palignr m0, m1, 2
+ palignr m3, m1, m0, 14
+ palignr m4, m1, m0, 15
+ palignr m0, m2, m1, 2
pmaddubsw m3, [filt_mul15]
pmaddubsw m4, [filt_mul15]
pmaddubsw m0, [filt_mul51]
- mova m5, m2
- palignr m5, m1, 1
- mova m6, m2
- palignr m6, m1, 3
+ palignr m5, m2, m1, 1
+ palignr m6, m2, m1, 3
paddw m3, m0
mova m0, m1
pmaddubsw m1, [filt_mul20]
add r2, 16
jl .loop
REP_RET
-%endif
+%endif ; !ARCH_X86_64
%define PALIGNR PALIGNR_MMX
+INIT_MMX
+HPEL_V mmxext, 0, 0
+INIT_XMM
+HPEL_V sse2, 8, 0
+HPEL_C sse2_misalign
%ifndef ARCH_X86_64
HPEL_C sse2
-%endif
-HPEL_V sse2, 8
-HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
-HPEL_V ssse3
+HPEL_V ssse3, 0, 1
+INIT_AVX
+HPEL_C avx
+HPEL_V avx, 0, 1
+%endif
%ifdef ARCH_X86_64
-
%macro DO_FILT_V 6
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
prefetcht0 [r1+r2*2+16]
-%ifidn %6, ssse3
+%ifnidn %6, sse2
mova m1, [r3]
mova m2, [r3+r2]
mova %3, [r3+r2*2]
mova m3, [r1]
mova %1, [r1+r2]
mova %2, [r1+r2*2]
- mova m4, m1
+ punpckhbw m4, m1, m2
punpcklbw m1, m2
- punpckhbw m4, m2
- mova m2, %1
+ punpckhbw m2, %1, %2
punpcklbw %1, %2
- punpckhbw m2, %2
- mova %2, m3
+ punpckhbw %2, m3, %3
punpcklbw m3, %3
- punpckhbw %2, %3
pmaddubsw m1, m12
pmaddubsw m4, m12
%endmacro
%macro FILT_C 4
- mova m1, %2
- PALIGNR m1, %1, 12, m2
- mova m2, %2
- PALIGNR m2, %1, 14, %1
- mova m3, %3
- PALIGNR m3, %2, 4, %1
- mova m4, %3
- PALIGNR m4, %2, 2, %1
+ PALIGNR m1, %2, %1, 12, m2
+ PALIGNR m2, %2, %1, 14, %1
+ PALIGNR m3, %3, %2, 4, %1
+ PALIGNR m4, %3, %2, 2, %1
paddw m3, m2
mova %1, %3
PALIGNR %3, %2, 6, m2
%endmacro
%macro ADD8TO16 5
- mova %3, %1
- mova %4, %2
+ punpckhbw %3, %1, %5
punpcklbw %1, %5
+ punpcklbw %4, %2, %5
punpckhbw %2, %5
- punpckhbw %3, %5
- punpcklbw %4, %5
paddw %2, %3
paddw %1, %4
%endmacro
%macro DO_FILT_H 4
- mova m1, %2
- PALIGNR m1, %1, 14, m3
- mova m2, %2
- PALIGNR m2, %1, 15, m3
- mova m4, %3
- PALIGNR m4, %2, 1 , m3
- mova m5, %3
- PALIGNR m5, %2, 2 , m3
- mova m6, %3
- PALIGNR m6, %2, 3 , m3
+ PALIGNR m1, %2, %1, 14, m3
+ PALIGNR m2, %2, %1, 15, m3
+ PALIGNR m4, %3, %2, 1 , m3
+ PALIGNR m5, %3, %2, 2 , m3
+ PALIGNR m6, %3, %2, 3 , m3
mova %1, %2
%ifidn %4, sse2
ADD8TO16 m1, m6, m12, m3, m0 ; a
ADD8TO16 %2, m4, m12, m3, m0 ; c
FILT_V2 m1, m2, %2, m6, m5, m4
FILT_PACK m1, m6, 5, m15
-%else ; ssse3
+%else ; ssse3, avx
pmaddubsw m1, m12
pmaddubsw m2, m12
pmaddubsw %2, m14
RET
%endmacro
+INIT_XMM
%define PALIGNR PALIGNR_MMX
HPEL sse2
%define PALIGNR PALIGNR_SSSE3
HPEL ssse3
-%endif
+INIT_AVX
+HPEL avx
+%endif ; ARCH_X86_64
%undef movntq
%undef movntps
%rep 16/mmsize
mov%4 m0, [%2+(x/2)*mmsize]
mov%4 m1, [%3+(x/2)*mmsize]
- mova m2, m0
+ punpckhwd m2, m0, m1
punpcklwd m0, m1
- punpckhwd m2, m1
mov%5a [%1+(x+0)*mmsize], m0
mov%5a [%1+(x+1)*mmsize], m2
%assign x (x+2)
mov%5a [%1], m0
%else
movq m1, [%3]
- mova m2, m0
+ punpckhbw m2, m0, m1
punpcklbw m0, m1
- punpckhbw m2, m1
mov%5a [%1+0], m0
mov%5a [%1+8], m2
%endif
%rep 16/mmsize
mova m0, [%3+(n+0)*mmsize]
mova m1, [%3+(n+1)*mmsize]
- mova m2, m0
- mova m3, m1
+ psrld m2, m0, 16
+ psrld m3, m1, 16
pand m0, %6
pand m1, %6
- psrld m2, 16
- psrld m3, 16
packssdw m0, m1
packssdw m2, m3
mov%7 [%1+(n/2)*mmsize], m0
INIT_XMM
PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE sse2
+INIT_AVX
+PLANE_INTERLEAVE avx
+PLANE_DEINTERLEAVE avx
%else
INIT_MMX
PLANE_INTERLEAVE mmxext
jl .loop
REP_RET
-cglobal integral_init8h_sse4, 3,4
+%macro INTEGRAL_INIT8H 1
+cglobal integral_init8h_%1, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
movdqa m0, [r1+r2]
movdqa m1, [r1+r2+16]
palignr m1, m0, 8
- movdqa m2, m0
- movdqa m3, m1
+ mpsadbw m2, m0, m4, 4
+ mpsadbw m3, m1, m4, 4
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
- mpsadbw m2, m4, 4
- mpsadbw m3, m4, 4
paddw m0, [r0+r2*2]
paddw m1, [r0+r2*2+16]
paddw m0, m2
add r2, 16
jl .loop
REP_RET
+%endmacro
+
+INIT_XMM
+INTEGRAL_INIT8H sse4
+INIT_AVX
+INTEGRAL_INIT8H avx
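; Assuming m4 holds zero (zeroed in the elided prologue), each mpsadbw here
; sums four consecutive source bytes, so adding the imm 0 and imm 4 results
; yields eight 8-byte horizontal sums; the paddw against [r0+r2*2] then adds
; the row above, giving the integral-style sums used by the exhaustive
; (esa/tesa) search paths.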
%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
PALIGNR %2, %4, 1, m6
pavgb %1, %3
pavgb %2, %4
- mova %5, %1
- mova %6, %2
+ psrlw %5, %1, 8
+ psrlw %6, %2, 8
pand %1, m7
pand %2, m7
- psrlw %5, 8
- psrlw %6, 8
%endmacro
%macro FILT16x2 4
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
- mova m5, m3
- mova m4, %1
+ psrlw m5, m3, 8
+ psrlw m4, %1, 8
pand m3, m7
pand %1, m7
- psrlw m5, 8
- psrlw m4, 8
packuswb m3, %1
packuswb m5, m4
mova [%2], m3
pavgb m0, [r0+%3+r5+1]
pavgb m1, m3
pavgb m0, m2
- mova m3, m1
- mova m2, m0
+ psrlw m3, m1, 8
+ psrlw m2, m0, 8
pand m1, m7
pand m0, m7
- psrlw m3, 8
- psrlw m2, 8
packuswb m0, m1
packuswb m2, m3
mova [%1], m0
pavgw m0, [r0+%3+r5+2]
pavgw m1, m3
pavgw m0, m2
- mova m3, m1
- mova m2, m0
+ psrld m3, m1, 16
+ psrld m2, m0, 16
pand m1, m7
pand m0, m7
- psrld m3, 16
- psrld m2, 16
packssdw m0, m1
packssdw m2, m3
movu [%1], m0
pavgw %1, m3
PALIGNR m3, m2, 2, m6
pavgw m3, m2
- mova m5, m3
- mova m4, %1
+ psrld m5, m3, 16
+ psrld m4, %1, 16
pand m3, m7
pand %1, m7
- psrld m5, 16
- psrld m4, 16
packssdw m3, %1
packssdw m5, m4
mova [%2], m3
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
+MC_WEIGHT( 4, avx )
+MC_WEIGHT( 8, avx )
+MC_WEIGHT( 12, avx )
+MC_WEIGHT( 16, avx )
+MC_WEIGHT( 20, avx )
#undef MC_OFFSET
#undef MC_WEIGHT
void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
pixel *srcu, int i_srcu,
pixel *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst,
+ pixel *srcu, int i_srcu,
+ pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
pixel *srcu, int i_srcu,
pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
uint8_t *dstv, int i_dstv,
uint8_t *src, int i_src, int w, int h );
+void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
+ uint16_t *dstv, int i_dstv,
+ uint16_t *src, int i_src, int w, int h );
void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
+void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src );
void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src );
void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src );
void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src );
void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
void x264_memzero_aligned_mmx( void * dst, int n );
void x264_memzero_aligned_sse2( void * dst, int n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
+MC_CHROMA(avx)
+MC_CHROMA(avx_cache64)
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
HPEL(8, mmxext, mmxext, mmxext, mmxext)
#if HIGH_BIT_DEPTH
-HPEL(16, sse2, sse2, sse2, sse2 )
+HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
+HPEL(16, avx, avx, avx, ssse3)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
PLANE_INTERLEAVE(mmxext)
PLANE_INTERLEAVE(sse2)
+#if HIGH_BIT_DEPTH
+PLANE_INTERLEAVE(avx)
+#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
+
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx;
+ pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_avx;
+
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
pf->integral_init4h = x264_integral_init4h_sse4;
pf->integral_init8h = x264_integral_init8h_sse4;
+
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+
+ pf->integral_init8h = x264_integral_init8h_avx;
+ pf->hpel_filter = x264_hpel_filter_avx;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_avx;
#endif // HIGH_BIT_DEPTH
}
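/* The AVX hooks sit at the end of each init path: if X264_CPU_AVX is absent
 * the function returns early, so every assignment after that check can assume
 * AVX is available.  mc_chroma is additionally gated on !X264_CPU_STACK_MOD4,
 * presumably because it relies on a 16-byte-aligned stack for its spills. */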
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m1
TRANSPOSE4x4W 4, 5, 6, 7, 1
mov r0, [args+4]
mov r2, [args]
LOAD_DIFF_4x8P 4
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq m2, [trans+0x10]
movq m3, [trans+0x18]
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
movq [trans], m0
movq m6, [trans+0x50]
movq m7, [trans+0x58]
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
pavgw m0, [trans]
%define trans esp+0 ; +96
%define sum esp+0 ; +32
LOAD_4x8P 0
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m0
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq [trans+0x38], m3
LOAD_4x8P 4
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq m2, [trans+0x10]
movq m3, [trans+0x18]
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill+0], m0
movq [spill+8], m1
movq m6, [trans+0x50]
movq m7, [trans+0x58]
- HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movd [sum+0x10], m0
movd [sum+0x12], m1
psubusb m%3, m%7
por m%1, m%2
por m%3, m%4
- mova m%2, m%1
- mova m%4, m%3
+ punpcklbw m%2, m%1, m%5
punpckhbw m%1, m%5
- punpcklbw m%2, m%5
+ punpcklbw m%4, m%3, m%5
punpckhbw m%3, m%5
- punpcklbw m%4, m%5
%endif
pmaddwd m%1, m%1
pmaddwd m%2, m%2
%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
- mova m%6, m%1
- mova m%7, m%3
+ punpckhbw m%6, m%1, m%2
+ punpckhbw m%7, m%3, m%4
punpcklbw m%1, m%2
punpcklbw m%3, m%4
- punpckhbw m%6, m%2
- punpckhbw m%7, m%4
SWAP %6, %2, %3
SWAP %7, %4
%endif
%ifidn %3, ssse3
mova m7, [hsub_mul]
+%elifidn %3, avx
+ mova m7, [hsub_mul]
%elifidn %3, sse2
mova m7, [pw_00ff]
%elif %1 >= mmsize
SSD 16, 8, ssse3, 8
SSD 8, 16, ssse3, 8
SSD 8, 4, ssse3, 8
+INIT_AVX
+SSD 16, 16, avx, 8
+SSD 8, 8, avx, 8
+SSD 16, 8, avx, 8
+SSD 8, 16, avx, 8
+SSD 8, 4, avx, 8
INIT_MMX
SSD 4, 4, ssse3
SSD 4, 8, ssse3
jl .loopx
%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
; equation above, putting the width limit at 8208
- mova m0, m2
- mova m1, m3
+ punpckhdq m0, m2, m6
+ punpckhdq m1, m3, m6
punpckldq m2, m6
punpckldq m3, m6
- punpckhdq m0, m6
- punpckhdq m1, m6
paddq m3, m2
paddq m1, m0
paddq m4, m3
psubusb m0, m1
psubusb m1, [r0+r6]
por m0, m1
- mova m2, m0
+ psrlw m2, m0, 8
+ add r6, mmsize
pand m0, m5
- psrlw m2, 8
- pmaddwd m0, m0
pmaddwd m2, m2
+ pmaddwd m0, m0
paddd m3, m0
paddd m4, m2
- add r6, mmsize
jl .loopx
add r0, r1
add r2, r3
movq [r4], m4
RET
%endmacro ; SSD_NV12
-%endif ; !X264_HIGHT_BIT_DEPTH
+%endif ; !HIGH_BIT_DEPTH
INIT_MMX
SSD_NV12 mmxext
INIT_XMM
SSD_NV12 sse2
+INIT_AVX
+SSD_NV12 avx
;=============================================================================
; variance
mova m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
mova m0, [r0]
- mova m1, m0
+ punpckhbw m1, m0, m7
mova m3, [r0+%1]
mova m4, m3
punpcklbw m0, m7
- punpckhbw m1, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
lea r0, [r0+%1*2]
VAR_2ROW r1, 4
VAR_END 8, 8
-INIT_XMM
%ifdef HIGH_BIT_DEPTH
-cglobal pixel_var_16x16_sse2, 2,3,8
+%macro VAR 1
+cglobal pixel_var_16x16_%1, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
-cglobal pixel_var_8x8_sse2, 2,3,8
+cglobal pixel_var_8x8_%1, 2,3,8
lea r2, [r1*3]
VAR_START 0
mova m0, [r0]
mova m4, [r0+r2*2]
VAR_CORE
VAR_END 8, 8
+%endmacro ; VAR
+
+INIT_XMM
+VAR sse2
+INIT_AVX
+VAR avx
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-cglobal pixel_var_16x16_sse2, 2,3,8
+%macro VAR 1
+cglobal pixel_var_16x16_%1, 2,3,8
VAR_START 1
mov r2d, 8
.loop:
jg .loop
VAR_END 16, 16
-cglobal pixel_var_8x8_sse2, 2,4,8
+cglobal pixel_var_8x8_%1, 2,4,8
VAR_START 1
mov r2d, 2
lea r3, [r1*3]
dec r2d
jg .loop
VAR_END 8, 8
+%endmacro ; VAR
+
+INIT_XMM
+VAR sse2
+INIT_AVX
+VAR avx
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 0
DEINTB %1, %2, %3, %4, %5
psubw m%1, m%3
psubw m%2, m%4
- SUMSUB_BA w, m%1, m%2, m%3
+ SUMSUB_BA w, %1, %2, %3
%endmacro
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
%ifidn %1, sse2
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
- HADAMARD4_V m%2, m%3, m%4, m%5, m%6
+ HADAMARD4_V %2, %3, %4, %5, %6
; doing the abs first is a slight advantage
ABS4 m%2, m%4, m%3, m%5, m%6, m%7
HADAMARD 1, max, %2, %4, %6, %7
; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
-INIT_XMM
%ifnidn %1, sse2
cglobal pixel_satd_4x4_%1, 4, 6, 6
SATD_START_MMX
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
- HADAMARD4_V m0, m1, m2, m8, m6
- HADAMARD4_V m4, m5, m3, m9, m6
- SUMSUB_BADC w, m0, m4, m1, m5, m6
+ HADAMARD4_V 0, 1, 2, 8, 6
+ HADAMARD4_V 4, 5, 3, 9, 6
+ SUMSUB_BADC w, 0, 4, 1, 5, 6
HADAMARD 2, sumsub, 0, 4, 6, 11
HADAMARD 2, sumsub, 1, 5, 6, 11
- SUMSUB_BADC w, m2, m3, m8, m9, m6
+ SUMSUB_BADC w, 2, 3, 8, 9, 6
HADAMARD 2, sumsub, 2, 3, 6, 11
HADAMARD 2, sumsub, 8, 9, 6, 11
HADAMARD 1, amax, 0, 4, 6, 11
SA8D_INTER
call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
SA8D_INTER
- SWAP m0, m10
+ SWAP 0, 10
%ifndef HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova spill2, m1
SWAP 1, 7
LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
- HADAMARD4_V m4, m5, m6, m7, m3
+ HADAMARD4_V 4, 5, 6, 7, 3
mova m1, spill2
mova m2, spill0
mova m3, spill1
mova spill0, m6
mova spill1, m7
- HADAMARD4_V m0, m1, m2, m3, m7
- SUMSUB_BADC w, m0, m4, m1, m5, m7
+ HADAMARD4_V 0, 1, 2, 3, 7
+ SUMSUB_BADC w, 0, 4, 1, 5, 7
HADAMARD 2, sumsub, 0, 4, 7, 6
HADAMARD 2, sumsub, 1, 5, 7, 6
HADAMARD 1, amax, 0, 4, 7, 6
mova m6, spill0
mova m7, spill1
paddw m0, m1
- SUMSUB_BADC w, m2, m6, m3, m7, m4
+ SUMSUB_BADC w, 2, 6, 3, 7, 4
HADAMARD 2, sumsub, 2, 6, 4, 5
HADAMARD 2, sumsub, 3, 7, 4, 5
HADAMARD 1, amax, 2, 6, 4, 5
%endif ; sse2/non-sse2
paddw m0, m2
paddw m0, m3
+ SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
ret
%endif ; ifndef mmxext
%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
-INIT_XMM
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
paddusw m8, m9
paddusw m15, m10
paddusw m15, m8
- movdqa m14, m15 ; 7x8 sum
movdqa m8, [r1+0] ; left edge
movd m9, r0d
psubw m9, m0
ABS1 m8, m10
ABS1 m9, m11 ; 1x8 sum
- paddusw m14, m8
+ paddusw m14, m15, m8
paddusw m15, m9
punpcklwd m0, m1
punpcklwd m2, m3
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
movdqa m1, [r1+16] ; top edge
- movdqa m2, m15
psllw m1, 3
- psrldq m2, 2 ; 8x7 sum
+ psrldq m2, m15, 2 ; 8x7 sum
psubw m0, m1 ; 8x1 sum
ABS1 m0, m1
paddusw m2, m0
pmaddwd m2, m7
pmaddwd m14, m7
pmaddwd m15, m7
- movdqa m3, m2
+ punpckhdq m3, m2, m14
punpckldq m2, m14
- punpckhdq m3, m14
pshufd m5, m15, 0xf5
paddd m2, m3
paddd m5, m15
- movdqa m3, m2
+ punpckhqdq m3, m2, m5
punpcklqdq m2, m5
- punpckhqdq m3, m5
pavgw m3, m2
pxor m0, m0
pavgw m3, m0
%endmacro
%macro INTRA_SATDS_MMX 1
-INIT_MMX
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
mova m2, [r3+0x40]
mova m3, [r3+0x60]
sub r3, 8
- SUMSUB_BADC w, m0, m1, m2, m3, m4
+ SUMSUB_BADC w, 0, 1, 2, 3, 4
ABS4 m0, m2, m1, m3, m4, m5
HADAMARD 0, max, 0, 2, 4, 5
HADAMARD 0, max, 1, 3, 4, 5
mova m1, [r3+0x20]
mova m2, [r3+0x40]
mova m3, [r3+0x60]
- SUMSUB_BADC w, m0, m1, m2, m3, m4
+ SUMSUB_BADC w, 0, 1, 2, 3, 4
HADAMARD 0, sumsub, 0, 2, 4, 5
ABS4 m1, m3, m0, m2, m4, m5
HADAMARD 0, max, 1, 3, 4, 5
paddd m0, m2
paddd m6, m6
paddd m0, m6
- SWAP m0, m6
+ SWAP 0, 6
%else ; !HIGH_BIT_DEPTH
pand m6, m0
paddw m7, m1
paddw m6, m7
%endif ; HIGH_BIT_DEPTH
mova [rsp+gprsize], m6 ; save sa8d
- SWAP m0, m6
+ SWAP 0, 6
SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
ret
%endmacro
%macro HADAMARD_AC_SSE2 1
-INIT_XMM
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal hadamard_ac_8x8_%1
%if vertical
HADAMARD4_2D_SSE 0, 1, 2, 3, 4
%else
- HADAMARD4_V m0, m1, m2, m3, m4
+ HADAMARD4_V 0, 1, 2, 3, 4
%endif
mova spill0, m1
SWAP 1, 7
%if vertical
HADAMARD4_2D_SSE 4, 5, 6, 7, 1
%else
- HADAMARD4_V m4, m5, m6, m7, m1
+ HADAMARD4_V 4, 5, 6, 7, 1
mova m1, spill0
mova spill0, m6
mova spill1, m7
ABS_MOV m2, m4
ABS_MOV m3, m5
paddw m1, m2
- SUMSUB_BA w, m0, m4
+ SUMSUB_BA w, 0, 4
%if vertical
pand m1, [mask_ac4]
%else
AC_PADD m2, m4, [pw_1]
AC_PADD m2, m0, [pw_1]
mova [rsp+gprsize+16], m2 ; save sa8d
- SWAP m0, m2
+ SWAP 0, 2
SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
ret
SATDS_SSE2 sse2
INTRA_SA8D_SSE2 sse2
%ifndef HIGH_BIT_DEPTH
+INIT_MMX
INTRA_SATDS_MMX mmxext
%endif
+INIT_XMM
HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%endif
+INIT_XMM
SATDS_SSE2 ssse3
SA8D ssse3
HADAMARD_AC_SSE2 ssse3
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
INTRA_SA8D_SSE2 ssse3
+INIT_MMX
INTRA_SATDS_MMX ssse3
%define TRANS TRANS_SSE4
%define JDUP JDUP_PENRYN
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
+INIT_XMM
SATDS_SSE2 sse4
SA8D sse4
HADAMARD_AC_SSE2 sse4
+INIT_AVX
+SATDS_SSE2 avx
+SA8D avx
+INTRA_SA8D_SSE2 avx
+HADAMARD_AC_SSE2 avx
+
;=============================================================================
; SSIM
;=============================================================================
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-
%macro SSIM_ITER 1
%ifdef HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
paddw m1, m5
paddw m2, m6
%endif
- movdqa m7, m5
+ pmaddwd m7, m5, m6
pmaddwd m5, m5
- pmaddwd m7, m6
pmaddwd m6, m6
%if %1==0
- SWAP m3, m5
- SWAP m4, m7
+ SWAP 3, 5
+ SWAP 4, 7
%else
paddd m3, m5
paddd m4, m7
paddd m3, m6
%endmacro
-cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
+%macro SSIM 1
+cglobal pixel_ssim_4x4x2_core_%1, 4,4,8
FIX_STRIDES r1, r3
pxor m0, m0
SSIM_ITER 0
pshufd m1, m1, 0xd8
paddd m4, m6
pmaddwd m1, m7
- movdqa m5, m3
+ punpckhdq m5, m3, m4
punpckldq m3, m4
- punpckhdq m5, m4
%ifdef UNIX64
%define t0 r4
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4_sse2, 3,3,7
+cglobal pixel_ssim_end4_%1, 3,3,7
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
addps m2, m6 ; vars + ssim_c2
addps m3, m6 ; covar*2 + ssim_c2
%else
- movdqa m4, m1
+ pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
- pmaddwd m4, m0 ; s1*s2
por m0, m1
pmaddwd m0, m0 ; s1*s1 + s2*s2
pslld m4, 1
fld dword r0m
%endif
RET
+%endmacro ; SSIM
-
+INIT_XMM
+SSIM sse2
+INIT_AVX
+SSIM avx
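; Roughly, ssim_end4 evaluates the standard SSIM ratio per block from the
; (s1, s2, ss, s12) sums produced by ssim_4x4x2_core:
;   ((2*s1*s2 + ssim_c1) * (2*covar + ssim_c2))
;     / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; matching the "vars + ssim_c2" / "covar*2 + ssim_c2" annotations above.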
;=============================================================================
; Successive Elimination ADS
movdqu xmm10, [r1]
movdqu xmm11, [r1+r2]
.loop:
- movdqa xmm0, xmm10
- movdqu xmm1, [r1+16]
- movdqa xmm10, xmm1
- psubw xmm0, xmm7
- psubw xmm1, xmm6
+ psubw xmm0, xmm10, xmm7
+ movdqu xmm10, [r1+16]
+ psubw xmm1, xmm10, xmm6
ABS1 xmm0, xmm2
ABS1 xmm1, xmm3
- movdqa xmm2, xmm11
- movdqu xmm3, [r1+r2+16]
- movdqa xmm11, xmm3
- psubw xmm2, xmm5
- psubw xmm3, xmm4
+ psubw xmm2, xmm11, xmm5
+ movdqu xmm11, [r1+r2+16]
paddw xmm0, xmm1
+ psubw xmm3, xmm11, xmm4
movdqu xmm9, [r3]
ABS1 xmm2, xmm1
ABS1 xmm3, xmm1
paddw xmm0, xmm2
paddw xmm0, xmm3
paddusw xmm0, xmm9
- movdqa xmm1, xmm8
- psubusw xmm1, xmm0
+ psubusw xmm1, xmm8, xmm0
packsswb xmm1, xmm1
movq [r6], xmm1
%else
ABS1 xmm1, xmm3
paddw xmm0, xmm1
paddusw xmm0, xmm4
- movdqa xmm1, xmm5
- psubusw xmm1, xmm0
+ psubusw xmm1, xmm5, xmm0
packsswb xmm1, xmm1
movq [r6], xmm1
ADS_END 2
ABS1 xmm1, xmm5
paddusw xmm0, xmm2
paddusw xmm1, xmm3
- movdqa xmm4, xmm6
- movdqa xmm5, xmm6
- psubusw xmm4, xmm0
- psubusw xmm5, xmm1
+ psubusw xmm4, xmm6, xmm0
+ psubusw xmm5, xmm6, xmm1
packsswb xmm4, xmm5
movdqa [r6], xmm4
ADS_END 4
%endmacro
+INIT_XMM
ADS_SSE2 sse2
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
+INIT_AVX
+ADS_SSE2 avx
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
+DECL_X1( ssd, avx )
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
+DECL_X1( satd, avx )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
-DECL_X1( sa8d, sse4)
+DECL_X1( sa8d, sse4 )
+DECL_X1( sa8d, avx )
DECL_X1( sad, cache32_mmxext );
DECL_X1( sad, cache64_mmxext );
DECL_X1( sad, cache64_sse2 );
DECL_PIXELS( uint64_t, var, mmxext, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmxext ( pixel *, pixel *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmxext ( pixel *, pixel *, int * );
void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmxext ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssd_nv12_core_mmxext( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_avx( pixel *pixuv1, int stride1,
+ pixel *pixuv2, int stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
const pixel *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_avx( const pixel *pix1, int stride1,
+ const pixel *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
+DECL_ADS( 4, avx )
+DECL_ADS( 2, avx )
+DECL_ADS( 1, avx )
#undef DECL_PIXELS
#undef DECL_X1
; output: %2 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 (%1 selects the w/b variant)
%macro PRED8x8_LOWPASS 5-6
%ifidn %1, w
- mova %2, %5
paddw %3, %4
psrlw %3, 1
- pavgw %2, %3
+ pavgw %2, %5, %3
%else
mova %6, %3
pavgb %3, %4
pxor %4, %6
- mova %2, %5
pand %4, [pb_1]
psubusb %3, %4
- pavgb %2, %3
+ pavgb %2, %5, %3
%endif
%endmacro
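; In both variants %3/%4 are the outer taps and %5 the double-weighted centre,
; so %2 ends up as the exact (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2 promised
; above; the byte path's pxor/pand [pb_1]/psubusb sequence cancels pavgb's
; round-half-up bias before the final average.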
%macro PREDICT_4x4_DDL 4
cglobal predict_4x4_ddl_%1, 1,1
movu m1, [r0-FDEC_STRIDEB]
- mova m2, m1
+ psll%2 m2, m1, %3
mova m3, m1
mova m4, m1
- psll%2 m1, %3
- pxor m2, m1
- psrl%2 m2, %3
- pxor m3, m2
- PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
+ pxor m1, m2
+ psrl%2 m1, %3
+ pxor m3, m1
+ PRED8x8_LOWPASS %4, m0, m2, m3, m4, m5
%assign Y 0
%rep 4
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_4x4_DDL sse2, dq, 2, w
+INIT_AVX
+PREDICT_4x4_DDL avx , dq, 2, w
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal predict_4x4_ddl_mmxext, 1,2
PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
%endif
PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- mova m1, m3
+ psll%4 m1, m3, %7*6
psrl%4 m3, %7*2
- psll%4 m1, %7*6
movh [r0+0*FDEC_STRIDEB], m5
movh [r0+1*FDEC_STRIDEB], m3
PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
%endif
punpckh%3 m1, m2 ; l0 l1 l2 l3
punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- mova m0, m1
- mova m2, m1
- mova m5, m1
- psrl%4 m0, %7*2 ; .. .. t2 t1 t0 lt l0 l1
- psrl%4 m2, %7 ; .. t2 t1 t0 lt l0 l1 l2
- pavg%5 m5, m2
+ psrl%4 m2, m1, %7 ; .. t2 t1 t0 lt l0 l1 l2
+ psrl%4 m0, m1, %7*2 ; .. .. t2 t1 t0 lt l0 l1
+ pavg%5 m5, m1, m2
PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
punpckl%2 m5, m3
psrl%4 m3, %7*4
PREDICT_4x4 sse2 , wd, dq, dq, w, qdq, 2
%define PALIGNR PALIGNR_SSSE3
PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
+INIT_AVX
+PREDICT_4x4 avx , wd, dq, dq, w, qdq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
%macro PREDICT_4x4_V1 4
cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
movu m1, [r0-FDEC_STRIDEB]
- mova m3, m1
- mova m2, m1
- psrl%2 m3, %3
- psrl%2 m2, %3*2
- mova m4, m3
- pavg%4 m4, m1
+ psrl%2 m3, m1, %3
+ psrl%2 m2, m1, %3*2
+ pavg%4 m4, m3, m1
PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
movh [r0+0*FDEC_STRIDEB], m4
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_4x4_V1 sse2, dq, 2, w
+%ifdef ARCH_X86_64
+INIT_AVX
+PREDICT_4x4_V1 avx , dq, 2, w
+%endif
INIT_MMX
%define PALIGNR PALIGNR_MMX
test r2b, 0x04
je .fix_tr_2
mova m0, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
- mova m5, m0
mova m2, m0
mova m4, m0
- psrl%5 m5, 7*%6
+ psrl%5 m5, m0, 7*%6
PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
PRED8x8_LOWPASS %2, m1, m2, m5, m0, m4
.done:
REP_RET
.fix_lt_1:
- mova m5, m3
- pxor m5, m4
+ pxor m5, m3, m4
psrl%5 m5, 7*%6
psll%5 m5, 6*%6
pxor m1, m5
jmp .do_left
.fix_lt_2:
- mova m5, m3
- pxor m5, m2
+ pxor m5, m3, m2
psll%5 m5, 7*%6
psrl%5 m5, 7*%6
pxor m2, m5
test r2b, 0x04
jne .do_top
.fix_tr_1:
- mova m5, m3
- pxor m5, m1
+ pxor m5, m3, m1
psrl%5 m5, 7*%6
psll%5 m5, 7*%6
pxor m1, m5
PREDICT_FILTER sse2 , w, d, q, dq, 2
%define PALIGNR PALIGNR_SSSE3
PREDICT_FILTER ssse3 , w, d, q, dq, 2
+INIT_AVX
+PREDICT_FILTER avx , w, d, q, dq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal predict_8x8_h_%1, 2,2
movu m1, [r1+7*SIZEOF_PIXEL]
add r0, 4*FDEC_STRIDEB
- mova m2, m1
+ punpckl%2 m2, m1, m1
punpckh%2 m1, m1
- punpckl%2 m2, m2
%assign n 0
%rep 8
%assign i 1+n/4
movu m2, [r1+17*SIZEOF_PIXEL]
movu m3, [r1+23*SIZEOF_PIXEL]
movu m4, [r1+25*SIZEOF_PIXEL]
- mova m1, m5
- psll%3 m1, %4
+ psll%3 m1, m5, %4
add r0, FDEC_STRIDEB*4
PRED8x8_LOWPASS %2, m0, m1, m2, m5, m7
+%if avx_enabled == 1
+ INIT_XMM
PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
+ INIT_AVX
+%else
+ PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
+%endif
%assign Y 3
%rep 6
mova [r0+Y*FDEC_STRIDEB], m1
- mova m2, m0
psll%3 m1, %4
- psrl%3 m2, 7*%4
+ psrl%3 m2, m0, 7*%4
psll%3 m0, %4
por m1, m2
%assign Y (Y-1)
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
+%if avx_enabled == 0
cglobal predict_8x8_ddr_%1, 2,2,7*(mmsize/16)
movu m1, [r1+ 7*SIZEOF_PIXEL]
movu m2, [r1+ 9*SIZEOF_PIXEL]
%assign Y 3
%rep 6
mova [r0+Y*FDEC_STRIDEB], m0
- mova m2, m1
psrl%3 m0, %4
- psll%3 m2, 7*%4
+ psll%3 m2, m1, 7*%4
psrl%3 m1, %4
por m0, m2
%assign Y (Y-1)
%assign Y (Y-1)
mova [r0+Y*FDEC_STRIDEB], m0
RET
+%endif
%endmacro ; PREDICT_8x8
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_8x8 sse2 , w, dq, 2
+INIT_AVX
+PREDICT_8x8 avx , w, dq, 2
%elifndef ARCH_X86_64
INIT_MMX
PREDICT_8x8 mmxext, b, q , 8
psll%4 m0, 8*SIZEOF_PIXEL
psrl%4 m2, 8*SIZEOF_PIXEL
por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
- mova m3, m2
mova m4, m2
mova m5, m2
+ psrl%3 m3, m2, 2*%6
psrl%3 m2, %6
- psrl%3 m3, 2*%6
por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckh%5 m1, m1
por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
pavg%2 m4, m2
PRED8x8_LOWPASS %2, m1, m3, m5, m2, m6
- mova m5, m4
+ punpckh%5 m5, m4, m1 ; p8 p7 p6 p5
punpckl%5 m4, m1 ; p4 p3 p2 p1
- punpckh%5 m5, m1 ; p8 p7 p6 p5
mova m6, m5
mova m7, m5
mova m0, m5
PREDICT_8x8_HU sse2 , w, dq, d, wd, 2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_HU ssse3 , w, dq, d, wd, 2
+INIT_AVX
+PREDICT_8x8_HU avx , w, dq, d, wd, 2
%elifndef ARCH_X86_64
INIT_MMX
%define PALIGNR PALIGNR_MMX
mova m2, [r1+16*SIZEOF_PIXEL]
movu m3, [r1+15*SIZEOF_PIXEL]
movu m1, [r1+14*SIZEOF_PIXEL]
- mova m4, m3
- pavg%2 m3, m2
+ pavg%2 m4, m3, m2
add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %2, m0, m1, m2, m4, m5
- mova [r0-4*FDEC_STRIDEB], m3
+ PRED8x8_LOWPASS %2, m0, m1, m2, m3, m5
+ mova [r0-4*FDEC_STRIDEB], m4
mova [r0-3*FDEC_STRIDEB], m0
mova m5, m0
- mova m6, m3
+ mova m6, m4
mova m1, [r1+8*SIZEOF_PIXEL]
mova m2, m1
psll%3 m2, %4
PREDICT_8x8_VR sse2 , w, dq, 2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_VR ssse3 , w, dq, 2
+INIT_AVX
+PREDICT_8x8_VR avx , w, dq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
REP_RET
%endif ; !ARCH_X86_64
-INIT_XMM
-cglobal predict_8x8c_p_core_sse2, 1,1
+%macro PREDICT_8x8C_P 1
+cglobal predict_8x8c_p_core_%1, 1,1
movd m0, r1m
movd m2, r2m
movd m4, r3m
%ifdef HIGH_BIT_DEPTH
mov r1d, 8
.loop:
- mova m5, m0
- paddsw m5, m2
+ paddsw m5, m0, m2
psraw m5, 5
CLIPW m5, m1, m3
mova [r0], m5
jg .loop
%else ;!HIGH_BIT_DEPTH
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- mova m3, m0
- paddsw m3, m4
+ paddsw m3, m0, m4
paddsw m4, m4
call .loop
add r0, FDEC_STRIDE*4
.loop:
- mova m5, m0
- mova m1, m3
- psraw m0, 5
+ paddsw m1, m3, m4
+ paddsw m5, m0, m4
psraw m3, 5
+ psraw m0, 5
packuswb m0, m3
movq [r0+FDEC_STRIDE*0], m0
movhps [r0+FDEC_STRIDE*1], m0
- paddsw m5, m4
- paddsw m1, m4
- mova m0, m5
- mova m3, m1
+ paddsw m0, m5, m4
+ paddsw m3, m1, m4
psraw m5, 5
psraw m1, 5
packuswb m5, m1
movq [r0+FDEC_STRIDE*2], m5
movhps [r0+FDEC_STRIDE*3], m5
- paddsw m0, m4
- paddsw m3, m4
%endif ;!HIGH_BIT_DEPTH
RET
+%endmacro ; PREDICT_8x8C_P
+
+INIT_XMM
+PREDICT_8x8C_P sse2
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
REP_RET
%endif ; !ARCH_X86_64
-INIT_XMM
-cglobal predict_16x16_p_core_sse2, 1,2,8
+%macro PREDICT_16x16_P 1
+cglobal predict_16x16_p_core_%1, 1,2,8
movd m0, r1m
movd m1, r2m
movd m2, r3m
SPLATW m0, m0, 0
SPLATW m1, m1, 0
SPLATW m2, m2, 0
- mova m3, m1
- pmullw m3, [pw_76543210]
+ pmullw m3, m1, [pw_76543210]
psllw m1, 3
%ifdef HIGH_BIT_DEPTH
mov r1d, 16
paddw m6, m2
dec r1d
jg .loop
-%else ;!HIGH_BIT_DEPTH
+%else ; !HIGH_BIT_DEPTH
paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- mova m7, m2
- paddsw m7, m7
+ paddsw m7, m2, m2
mov r1d, 8
ALIGN 4
.loop:
- mova m3, m0
- mova m4, m1
- mova m5, m0
- mova m6, m1
- psraw m3, 5
- psraw m4, 5
- paddsw m5, m2
- paddsw m6, m2
+ psraw m3, m0, 5
+ psraw m4, m1, 5
+ paddsw m5, m0, m2
+ paddsw m6, m1, m2
psraw m5, 5
psraw m6, 5
packuswb m3, m4
add r0, FDEC_STRIDE*2
dec r1d
jg .loop
-%endif ;!HIGH_BIT_DEPTH
+%endif ; !HIGH_BIT_DEPTH
REP_RET
+%endmacro ; PREDICT_16x16_P
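; Scalar reading of the core loop (both bit depths): for x,y in 0..15,
;   dst[y*stride + x] = clip( (i00 + b*x + c*y) >> 5 )
; m0/m1 hold i00 + b*x for x = 0..7 and 8..15, and c (doubled to advance two
; rows per iteration in the 8-bit path) is added as y increases.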
-%ifndef HIGH_BIT_DEPTH
INIT_XMM
+PREDICT_16x16_P sse2
+%ifndef HIGH_BIT_DEPTH
+INIT_AVX
+PREDICT_16x16_P avx
+%endif
+
+%ifndef HIGH_BIT_DEPTH
+%macro PREDICT_8x8 1
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2, 2,2
+cglobal predict_8x8_ddl_%1, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
- movdqa xmm1, xmm3
- pslldq xmm1, 1
+ pslldq xmm1, xmm3, 1
add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2, 2,2
+cglobal predict_8x8_ddr_%1, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
- movdqa xmm2, xmm3
- psrldq xmm2, 1
+ psrldq xmm2, xmm3, 1
add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
- movdqa xmm1, xmm0
- psrldq xmm1, 1
+ psrldq xmm1, xmm0, 1
%assign Y 3
%rep 3
movq [r0+Y*FDEC_STRIDE], xmm0
movq [r0-3*FDEC_STRIDE], xmm0
movq [r0-4*FDEC_STRIDE], xmm1
RET
-%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2, 2,2
+cglobal predict_8x8_vl_%1, 2,2
movdqa xmm4, [r1+16]
- movdqa xmm2, xmm4
- movdqa xmm1, xmm4
- movdqa xmm3, xmm4
- psrldq xmm2, 1
- pslldq xmm1, 1
- pavgb xmm3, xmm2
+ pslldq xmm1, xmm4, 1
+ psrldq xmm2, xmm4, 1
+ pavgb xmm3, xmm4, xmm2
add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
RET
-%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vr_sse2, 2,2,7
+cglobal predict_8x8_vr_%1, 2,2,7
movdqu xmm0, [r1+8]
movdqa xmm6, [pw_ff00]
add r0, 4*FDEC_STRIDE
- movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
+ pslldq xmm1, xmm0, 2
pslldq xmm0, 1
- pslldq xmm1, 2
pavgb xmm2, xmm0
PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
pandn xmm6, xmm4
%assign Y (Y-2)
%endrep
RET
-%endif
+%endmacro ; PREDICT_8x8
+
+INIT_XMM
+PREDICT_8x8 sse2
+INIT_AVX
+PREDICT_8x8 avx
+
+%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
mova m5, m3
pavg%2 m3, m1
PRED8x8_LOWPASS %2, m0, m4, m1, m5, m7
- mova m4, m2
- mova m1, m2 ; t6 t5 t4 t3 t2 t1 t0 lt
- psrl%3 m4, 2*%5 ; .. .. t6 t5 t4 t3 t2 t1
- psrl%3 m1, %5 ; .. t6 t5 t4 t3 t2 t1 t0
+ psrl%3 m4, m2, 2*%5 ; .. .. t6 t5 t4 t3 t2 t1
+ psrl%3 m1, m2, %5 ; .. t6 t5 t4 t3 t2 t1 t0
PRED8x8_LOWPASS %2, m6, m4, m2, m1, m5
; .. p11 p10 p9
- mova m7, m3
+ punpckh%4 m7, m3, m0 ; p8 p7 p6 p5
punpckl%4 m3, m0 ; p4 p3 p2 p1
- punpckh%4 m7, m0 ; p8 p7 p6 p5
mova m1, m7
mova m0, m7
mova m4, m7
PREDICT_8x8_HD sse2 , w, dq, wd, 2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_HD ssse3 , w, dq, wd, 2
+INIT_AVX
+PREDICT_8x8_HD avx , w, dq, wd, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
PALIGNR xmm3, xmm0, 8, xmm0
- movdqa xmm4, xmm1
- pavgb xmm4, xmm3
+ pavgb xmm4, xmm1, xmm3
PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
PREDICT_8x8_HD sse2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_HD ssse3
+INIT_AVX
+PREDICT_8x8_HD avx
INIT_MMX
%define PALIGNR PALIGNR_MMX
%endif ; HIGH_BIT_DEPTH
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
+ void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmxext( pixel *src );
void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[33] );
+ void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[33] );
+ void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[33] );
+ void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[33] );
+ void x264_predict_8x8_vr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[33] );
+ void x264_predict_8x8_hu_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[33] );
+ void x264_predict_8x8_hd_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
+ void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
void x264_predict_4x4_ddl_mmxext( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
+ void x264_predict_4x4_ddl_avx( uint16_t *src );
void x264_predict_4x4_ddr_mmxext( pixel *src );
void x264_predict_4x4_vl_mmxext( pixel *src );
void x264_predict_4x4_vl_sse2( uint16_t *src );
+ void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmxext( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
+ void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmxext( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
void x264_predict_4x4_hd_ssse3( pixel *src );
+ void x264_predict_4x4_hd_avx( uint16_t *src );
void x264_predict_4x4_dc_mmxext( pixel *src );
void x264_predict_4x4_ddr_sse2( uint16_t *src );
void x264_predict_4x4_ddr_ssse3( pixel *src );
+ void x264_predict_4x4_ddr_avx( uint16_t *src );
void x264_predict_4x4_hu_mmxext( pixel *src );
#define PREDICT_16x16_DC(name)\
PREDICT_16x16_P( mmxext )
#endif
PREDICT_16x16_P( sse2 )
+PREDICT_16x16_P( avx )
#endif //!HIGH_BIT_DEPTH
#ifdef __GNUC__
#if ARCH_X86_64
INTRA_SA8D_X3(sse2)
INTRA_SA8D_X3(ssse3)
+INTRA_SA8D_X3(avx)
#else
INTRA_SA8D_X3(mmxext)
#endif
#ifdef __GNUC__
pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
#endif
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx;
#endif // HIGH_BIT_DEPTH
}
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
+ *predict_8x8_filter = x264_predict_8x8_filter_avx;
#else
pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
#endif // HIGH_BIT_DEPTH
}
pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2;
pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx;
+#if ARCH_X86_64
+ pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx;
+#endif
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx;
#else
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
#endif // HIGH_BIT_DEPTH
%if %4
por m5, m1
%else
- SWAP m5, m1
+ SWAP 5, 1
%endif
%endmacro
%if %4
por m5, m1
%else
- SWAP m5, m1
+ SWAP 5, 1
%endif
%endmacro
%if %4
por m5, m2
%else
- SWAP m5, m2
+ SWAP 5, 2
%endif
por m5, m3
%endmacro
%if %4
por m5, m1
%else
- SWAP m5, m1
+ SWAP 5, 1
%endif
%endmacro
%if %4
por m5, m2
%else
- SWAP m5, m2
+ SWAP 5, 2
%endif
por m5, m3
%endmacro
%if %4
por m5, m0
%else
- SWAP m5, m0
+ SWAP 5, 0
%endif
%endmacro
por m5, m0
por m5, m2
%else
- SWAP m5, m0
+ SWAP 5, 0
por m5, m2
%endif
%endmacro
paddd m0, m3
psrad m0, m2
%else
- mova m1, m0
+ punpckhwd m1, m0, m4
punpcklwd m0, m4
- punpckhwd m1, m4
pmaddwd m0, %2
pmaddwd m1, %3
paddd m0, m3
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+%ifnidn %1, avx
cglobal dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
DEQUANT16_FLAT [r1+32], 32, 96
%endif
RET
+%endif ; !AVX
%endmacro ; DEQUANT
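; Illustrative scalar model of what these kernels compute, assuming the usual
; H.264 dequant (cf. the C reference in common/quant.c): with i_mf = qp%6 and
; qbits = qp/6 - shift,
;   dct[i] = qbits >= 0 ? (dct[i]*dequant_mf[i_mf][i]) << qbits
;                       : (dct[i]*dequant_mf[i_mf][i] + (1<<(-qbits-1))) >> -qbits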
%ifdef HIGH_BIT_DEPTH
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+INIT_AVX
+DEQUANT avx, 4, 4, 2
+DEQUANT avx, 8, 6, 2
%endif
%macro DEQUANT_DC 2
punpcklwd m2, m4
%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
- mova m1, m0
+ punpckhwd m1, m0, m5
punpcklwd m0, m5
- punpckhwd m1, m5
pmaddwd m0, m2
pmaddwd m1, m2
psrad m0, m3
INIT_XMM
DEQUANT_DC sse2 , d
DEQUANT_DC sse4 , d
+INIT_AVX
+DEQUANT_DC avx , d
%else
INIT_MMX
DEQUANT_DC mmxext, w
INIT_XMM
DEQUANT_DC sse2 , w
+INIT_AVX
+DEQUANT_DC avx , w
%endif
%ifdef HIGH_BIT_DEPTH
mova m5, m1
psubd m0, [r2+r3*4+0*mmsize]
psubd m1, [r2+r3*4+1*mmsize]
- mova m7, m0
- pcmpgtd m7, m6
+ pcmpgtd m7, m0, m6
pand m0, m7
- mova m7, m1
- pcmpgtd m7, m6
+ pcmpgtd m7, m1, m6
pand m1, m7
PSIGND m0, m2
PSIGND m1, m3
%define PABSD PABSD_SSSE3
%define PSIGND PSIGND_SSSE3
DENOISE_DCT ssse3, 8
+INIT_AVX
+DENOISE_DCT avx , 8
%else ; !HIGH_BIT_DEPTH
void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
%if %1
paddw m0, m1
%else
- SWAP m0, m1
+ SWAP 0, 1
%endif
paddw m0, m2
%endmacro
movd [r2+4], mm1 ;H prediction cost
RET
-cglobal intra_sad_x3_4x4_sse4, 3,3
+%macro INTRA_SADx3_4x4 1
+cglobal intra_sad_x3_4x4_%1, 3,3
movd xmm4, [r1+FDEC_STRIDE*0-4]
pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
movd xmm2, [r1-FDEC_STRIDE]
pxor xmm3, xmm3
- movdqa xmm5, xmm4
- pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
- pshufb xmm5, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
- pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
- punpckldq xmm2, xmm4 ; ABCDEFGH
+ pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
+ pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
+ pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
+ punpckldq xmm2, xmm4 ; ABCDEFGH
psadbw xmm2, xmm3
movd xmm1, [r0+FENC_STRIDE*0]
pinsrd xmm1, [r0+FENC_STRIDE*1], 1
psraw xmm2, 2
pavgw xmm2, xmm3
pshufb xmm2, xmm3 ; DC prediction
- movdqa xmm3, xmm0
+ punpckhqdq xmm3, xmm0, xmm5
punpcklqdq xmm0, xmm5
- punpckhqdq xmm3, xmm5
psadbw xmm2, xmm1
paddw xmm0, xmm3
movhlps xmm4, xmm2
movq [r2], xmm0 ; V/H prediction costs
movd [r2+8], xmm2 ; DC prediction cost
RET
+%endmacro ; INTRA_SADx3_4x4
+
+INIT_XMM
+INTRA_SADx3_4x4 sse4
+INIT_AVX
+INTRA_SADx3_4x4 avx
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
%if %1
paddw m1, m4
%else
- SWAP m1, m4
+ SWAP 1, 4
%endif
movq m4, m5
psadbw m4, m6
%if %1
paddw m2, m4
%else
- SWAP m2, m4
+ SWAP 2, 4
%endif
pshufw m4, m7, %2
psadbw m5, m4
%if %1
paddw m3, m5
%else
- SWAP m3, m5
+ SWAP 3, 5
%endif
%endmacro
movd [r2+8], m1
RET
-INIT_XMM
-cglobal intra_sad_x3_8x8_ssse3, 3,4,9
+%macro INTRA_SADx3_8x8 1
+cglobal intra_sad_x3_8x8_%1, 3,4,9
%ifdef PIC
lea r11, [h8x8_pred_shuf]
%define shuf r11
.loop:
movq m6, [r0+FENC_STRIDE*0]
movhps m6, [r0+FENC_STRIDE*1]
- movdqa m7, m0
- pshufb m7, [shuf+r3*8] ; H prediction
+ pshufb m7, m0, [shuf+r3*8] ; H prediction
%ifdef ARCH_X86_64
- movdqa m8, m1
psadbw m7, m6
- psadbw m8, m6
+ psadbw m8, m1, m6
psadbw m6, m2
paddw m4, m7
paddw m3, m8
%else
psadbw m7, m6
paddw m4, m7
- movdqa m7, m1
- psadbw m7, m6
+ psadbw m7, m1, m6
psadbw m6, m2
paddw m3, m7
paddw m5, m6
movd [r2+4], m4
movd [r2+8], m5
RET
+%endmacro ; INTRA_SADx3_8x8
+
+INIT_XMM
+INTRA_SADx3_8x8 ssse3
+INIT_AVX
+INTRA_SADx3_8x8 avx
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
%macro INIT_AVX 0
INIT_XMM
%assign avx_enabled 1
+ %define PALIGNR PALIGNR_SSSE3
%define RESET_MM_PERMUTATION INIT_AVX
%endmacro
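; With avx_enabled set by INIT_AVX, the utility macros below (SBUTTERFLY,
; SUMSUB_BA, ...) and the AVX_INSTR overloads emit non-destructive
; three-operand forms instead of mova + two-operand sequences, which is
; largely how this patch produces the *_avx variants.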
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
+AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
%macro SBUTTERFLY 4
+%if avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
+%else
+ punpckh%1 m%4, m%2, m%3
+ punpckl%1 m%2, m%3
+%endif
SWAP %3, %4
%endmacro
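; Either branch produces punpckl in m%2 and punpckh in m%4; the common SWAP
; then leaves the high half in m%3, so callers see identical results and the
; AVX path merely skips the mova.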
%macro SBUTTERFLY2 4
- mova m%4, m%2
- punpckh%1 m%2, m%3
- punpckl%1 m%4, m%3
+ punpckl%1 m%4, m%2, m%3
+ punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
pshufb %1, %3
%endmacro
-%macro PALIGNR_MMX 4
- %ifnidn %4, %2
+%macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp
+ %define %%dst %1
+%if %0==5
+%ifnidn %1, %2
+ mova %%dst, %2
+%endif
+ %rotate 1
+%endif
+%ifnidn %4, %2
mova %4, %2
- %endif
- %if mmsize == 8
- psllq %1, (8-%3)*8
+%endif
+%if mmsize==8
+ psllq %%dst, (8-%3)*8
psrlq %4, %3*8
- %else
- pslldq %1, 16-%3
+%else
+ pslldq %%dst, 16-%3
psrldq %4, %3
- %endif
- por %1, %4
+%endif
+ por %%dst, %4
%endmacro
-%macro PALIGNR_SSSE3 4
+%macro PALIGNR_SSSE3 4-5
+%if %0==5
+ palignr %1, %2, %3, %4
+%else
palignr %1, %2, %3
+%endif
%endmacro
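; PALIGNR now optionally takes a separate destination, mirroring the AVX
; four-operand palignr: the MMX/SSE2 emulation copies src1 into dst before
; the shift/or, while the SSSE3 path simply forwards the extra operand.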
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
- mova m%1, m%5
- mova m%3, m%5
+ pand m%3, m%5, m%4 ; src .. y6 .. y4
+ pand m%1, m%5, m%2 ; dst .. y6 .. y4
%else
mova m%1, %5
- mova m%3, m%1
+ pand m%3, m%1, m%4 ; src .. y6 .. y4
+ pand m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
- pand m%1, m%2 ; dst .. y6 .. y4
- pand m%3, m%4 ; src .. y6 .. y4
- psrlw m%2, 8 ; dst .. y7 .. y5
- psrlw m%4, 8 ; src .. y7 .. y5
+ psrlw m%2, 8 ; dst .. y7 .. y5
+ psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
%macro SUMSUB_BA 3-4
%if %0==3
- padd%1 %2, %3
- padd%1 %3, %3
- psub%1 %3, %2
+ padd%1 m%2, m%3
+ padd%1 m%3, m%3
+ psub%1 m%3, m%2
%else
- mova %4, %2
- padd%1 %2, %3
- psub%1 %3, %4
+%if avx_enabled == 0
+ mova m%4, m%2
+ padd%1 m%2, m%3
+ psub%1 m%3, m%4
+%else
+ padd%1 m%4, m%2, m%3
+ psub%1 m%3, m%2
+ SWAP %2, %4
+%endif
%endif
%endmacro
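; All three branches implement the same butterfly: %2 becomes %2+%3 and %3
; becomes %3-%2 (original values); the AVX form writes the sum into the spare
; register %4 and swaps it back, avoiding the mova.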
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
- padd%1 %2, %3
- padd%1 %4, %5
- padd%1 %3, %3
- padd%1 %5, %5
- psub%1 %3, %2
- psub%1 %5, %4
+ padd%1 m%2, m%3
+ padd%1 m%4, m%5
+ padd%1 m%3, m%3
+ padd%1 m%5, m%5
+ psub%1 m%3, m%2
+ psub%1 m%5, m%4
%endif
%endmacro
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
- mova m%5, m%3
%ifidn %2, ord
- psrl%1 m%3, 16
-%endif
+ psrl%1 m%5, m%3, 16
+ pblendw m%5, m%4, 10101010b
+ psll%1 m%4, 16
+ pblendw m%4, m%3, 01010101b
+ SWAP %3, %5
+%else
+%if avx_enabled == 0
+ mova m%5, m%3
pblendw m%3, m%4, 10101010b
- psll%1 m%4, 16
-%ifidn %2, ord
- pblendw m%4, m%5, 01010101b
%else
- psrl%1 m%5, 16
- por m%4, m%5
+ pblendw m%5, m%3, m%4, 10101010b
+ SWAP %3, %5
+%endif
+ psll%1 m%4, 16
+ psrl%1 m%5, 16
+ por m%4, m%5
%endif
%elifidn %1, q
- mova m%5, m%3
+ shufps m%5, m%3, m%4, 11011101b
shufps m%3, m%4, 10001000b
- shufps m%5, m%4, 11011101b
- SWAP %4, %5
+ SWAP %4, %5
%endif
%endmacro
%endif
%endif
%ifidn %2, sumsub
- SUMSUB_BA w, m%3, m%4, m%5
+ SUMSUB_BA w, %3, %4, %5
%else
%ifidn %2, amax
%if %0==6
%endmacro
%macro SUMSUB2_AB 4
- mova %4, %2
- padd%1 %2, %2
- padd%1 %2, %3
- psub%1 %4, %3
- psub%1 %4, %3
+%ifnum %3
+ psub%1 m%4, m%2, m%3
+ psub%1 m%4, m%3
+ padd%1 m%2, m%2
+ padd%1 m%2, m%3
+%else
+ mova m%4, m%2
+ padd%1 m%2, m%2
+ padd%1 m%2, %3
+ psub%1 m%4, %3
+ psub%1 m%4, %3
+%endif
%endmacro
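; Net effect in either form: %2 = 2*%2 + %3 and %4 = %2 - 2*%3 (original
; values), the weighted odd-stage butterfly of the forward 4x4 transform.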
%macro SUMSUB2_BA 4
+%if avx_enabled == 0
mova m%4, m%2
padd%1 m%2, m%3
padd%1 m%2, m%3
psub%1 m%3, m%4
psub%1 m%3, m%4
+%else
+ padd%1 m%4, m%2, m%3
+ padd%1 m%4, m%3
+ psub%1 m%3, m%2
+ psub%1 m%3, m%2
+ SWAP %2, %4
+%endif
%endmacro
%macro SUMSUBD2_AB 5
- mova %5, %2
- mova %4, %3
- psra%1 %3, 1 ; %3: %3>>1
- psra%1 %2, 1 ; %2: %2>>1
- padd%1 %3, %5 ; %3: %3>>1+%2
- psub%1 %2, %4 ; %2: %2>>1-%3
+%ifnum %4
+ psra%1 m%5, m%2, 1 ; %3: %3>>1
+ psra%1 m%4, m%3, 1 ; %2: %2>>1
+ padd%1 m%4, m%2 ; %3: %3>>1+%2
+ psub%1 m%5, m%3 ; %2: %2>>1-%3
+ SWAP %2, %5
+ SWAP %3, %4
+%else
+ mova %5, m%2
+ mova %4, m%3
+ psra%1 m%3, 1 ; %3: %3>>1
+ psra%1 m%2, 1 ; %2: %2>>1
+ padd%1 m%3, %5 ; %3: %3>>1+%2
+ psub%1 m%2, %4 ; %2: %2>>1-%3
+%endif
%endmacro
%macro DCT4_1D 5
%ifnum %5
- SUMSUB_BADC w, m%4, m%1, m%3, m%2, m%5
- SUMSUB_BA w, m%3, m%4, m%5
- SUMSUB2_AB w, m%1, m%2, m%5
+ SUMSUB_BADC w, %4, %1, %3, %2, %5
+ SUMSUB_BA w, %3, %4, %5
+ SUMSUB2_AB w, %1, %2, %5
SWAP %1, %3, %4, %5, %2
%else
- SUMSUB_BADC w, m%4, m%1, m%3, m%2
- SUMSUB_BA w, m%3, m%4
+ SUMSUB_BADC w, %4, %1, %3, %2
+ SUMSUB_BA w, %3, %4
mova [%5], m%2
- SUMSUB2_AB w, m%1, [%5], m%2
+ SUMSUB2_AB w, %1, [%5], %2
SWAP %1, %3, %4, %2
%endif
%endmacro
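; For reference, each DCT4_1D column performs the familiar H.264 butterflies:
; with s03 = a0+a3, d03 = a0-a3, s12 = a1+a2, d12 = a1-a2, the four outputs
; are s03+s12, 2*d03+d12, s03-s12 and d03-2*d12.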
%macro IDCT4_1D 6-7
%ifnum %6
- SUMSUBD2_AB %1, m%3, m%5, m%7, m%6
+ SUMSUBD2_AB %1, %3, %5, %7, %6
; %3: %3>>1-%5 %5: %3+%5>>1
- SUMSUB_BA %1, m%4, m%2, m%7
+ SUMSUB_BA %1, %4, %2, %7
; %4: %2+%4 %2: %2-%4
- SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%7
+ SUMSUB_BADC %1, %5, %4, %3, %2, %7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
- SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+16]
+ SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
- SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+32]
+ SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
- SUMSUB_BA %1, m%4, m%2
- SUMSUB_BADC %1, m%5, m%4, m%3, m%2
+ SUMSUB_BA %1, %4, %2
+ SUMSUB_BADC %1, %5, %4, %3, %2
%endif
SWAP %2, %5, %4
; %2: %2+%4 + (%3+%5>>1) row0
%endmacro
%macro HADDUW 2
- mova %2, %1
+ psrld %2, %1, 16
pslld %1, 16
- psrld %2, 16
psrld %1, 16
paddd %1, %2
HADDD %1, %2