From 9301bbd39fb0a49b1e986f9a7c29685439686de4 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Thu, 22 Dec 2011 14:03:15 -0800
Subject: [PATCH] XOP 8-bit fDCT

Use integer MAC for one of the SUMSUB passes.
About a dozen cycles faster for 16x16.
---
 common/dct.c           |  6 ++++++
 common/x86/const-a.asm |  1 +
 common/x86/dct-32.asm  |  7 ++++++-
 common/x86/dct-64.asm  |  7 ++++++-
 common/x86/dct-a.asm   |  2 ++
 common/x86/dct.h       |  2 ++
 common/x86/x86util.asm | 21 ++++-----------------
 7 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 6836bc2b..c071ae43 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -597,6 +597,12 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
     }
+
+    if( cpu&X264_CPU_XOP )
+    {
+        dctf->sub8x8_dct = x264_sub8x8_dct_xop;
+        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
+    }
 #endif //HAVE_MMX
 
 #if HAVE_ALTIVEC
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index be947eab..eaec531b 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -38,6 +38,7 @@ const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
 
 const pw_1, times 8 dw 1
 const pw_2, times 8 dw 2
+const pw_m2, times 8 dw -2
 const pw_4, times 8 dw 4
 const pw_8, times 8 dw 8
 const pw_16, times 8 dw 16
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index f4395088..b0aedc8c 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -32,10 +32,13 @@
 
 SECTION .text
 
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
 cextern pw_32
 cextern hsub_mul
 
+%ifndef HIGH_BIT_DEPTH
+
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
@@ -402,6 +405,8 @@ INIT_XMM ssse3
 DCT_SUB8
 INIT_XMM avx
 DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
 
 ;-----------------------------------------------------------------------------
 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index e1442500..dab00544 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -31,10 +31,13 @@
 
 SECTION .text
 
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
 cextern pw_32
 cextern hsub_mul
 
+%ifndef HIGH_BIT_DEPTH
+
 %macro DCT8_1D 10
     SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
     SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
@@ -198,6 +201,8 @@ INIT_XMM ssse3
 DCT_SUB8
 INIT_XMM avx
 DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
 
 ;-----------------------------------------------------------------------------
 ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index f493585c..fda1db02 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -452,9 +452,11 @@ INIT_XMM
 cextern sub8x8_dct_sse2.skip_prologue
 cextern sub8x8_dct_ssse3.skip_prologue
 cextern sub8x8_dct_avx.skip_prologue
+cextern sub8x8_dct_xop.skip_prologue
 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
 SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0
 
 cextern add8x8_idct_sse2.skip_prologue
 cextern add8x8_idct_avx.skip_prologue
diff --git a/common/x86/dct.h b/common/x86/dct.h
index cc580064..71235522 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -38,6 +38,8 @@ void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
 void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
 void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index b71ed791..097ec9ca 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -586,7 +586,10 @@
 %endmacro
 
 %macro SUMSUB2_AB 4
-%ifnum %3
+%if cpuflag(xop)
+    pmacs%1%1 m%4, m%3, [p%1_m2], m%2
+    pmacs%1%1 m%2, m%2, [p%1_2], m%3
+%elifnum %3
     psub%1 m%4, m%2, m%3
     psub%1 m%4, m%3
     padd%1 m%2, m%2
@@ -600,22 +603,6 @@
 %endif
 %endmacro
 
-%macro SUMSUB2_BA 4
-%if avx_enabled
-    padd%1 m%4, m%2, m%3
-    padd%1 m%4, m%3
-    psub%1 m%3, m%2
-    psub%1 m%3, m%2
-    SWAP %2, %4
-%else
-    mova m%4, m%2
-    padd%1 m%2, m%3
-    padd%1 m%2, m%3
-    psub%1 m%3, m%4
-    psub%1 m%3, m%4
-%endif
-%endmacro
-
 %macro SUMSUBD2_AB 5
 %ifnum %4
     psra%1 m%5, m%2, 1 ; %3: %3>>1
-- 
2.40.0