From 02aa1368da5c222c8833724abccddd8f02630fc6 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 1 May 2013 14:32:11 -0700 Subject: [PATCH] x86: AVX2 add16x16_idct_dc 27 -> 19 cycles --- common/dct.c | 1 + common/x86/dct-a.asm | 50 +++++++++++++++++++++++++++++++++++++------- common/x86/dct.h | 1 + 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/common/dct.c b/common/dct.c index e0219ecb..52ef9bef 100644 --- a/common/dct.c +++ b/common/dct.c @@ -697,6 +697,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add16x16_idct = x264_add16x16_idct_avx2; dctf->sub8x8_dct = x264_sub8x8_dct_avx2; dctf->sub16x16_dct = x264_sub16x16_dct_avx2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; #if ARCH_X86_64 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; #endif diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index cc0a2a12..a787f1cf 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -30,7 +30,9 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +pb_idctdc_unpack: times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +pb_idctdc_unpack2: times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -39,8 +41,6 @@ pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5 pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 -pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 -pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 @@ -74,6 +74,7 @@ SECTION .text cextern pw_32_0 cextern pw_32 +cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul @@ -738,8 +739,7 @@ cglobal add8x8_idct_dc, 2,2 movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 mova m5, [pb_idctdc_unpack] packuswb m0, m0 @@ -836,8 +836,7 @@ cglobal add16x16_idct_dc, 2,2,8 mova m0, [r1] add r1, 16 pxor m1, m1 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 mova m5, [ pb_idctdc_unpack] mova m6, [pb_idctdc_unpack2] @@ -857,6 +856,43 @@ ADD16x16 INIT_XMM avx ADD16x16 +%macro ADD_DC_AVX2 3 + mova xm4, [r0+FDEC_STRIDE*0+%3] + mova xm5, [r0+FDEC_STRIDE*1+%3] + vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1 + vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1 + paddusb m4, %1 + paddusb m5, %1 + psubusb m4, %2 + psubusb m5, %2 + mova [r0+FDEC_STRIDE*0+%3], xm4 + mova [r0+FDEC_STRIDE*1+%3], xm5 + vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1 + vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1 +%endmacro + +INIT_YMM avx2 +cglobal add16x16_idct_dc, 2,3,6 + add r0, FDEC_STRIDE*4 + mova m0, [r1] + pxor m1, m1 + pmulhrsw m0, [pw_512] + psubw m1, m0 + mova m4, [pb_idctdc_unpack] + mova m5, [pb_idctdc_unpack2] + packuswb m0, m0 + packuswb m1, m1 + pshufb m2, m0, m4 ; row0, row2 + pshufb m3, m1, m4 ; row0, row2 + pshufb m0, m5 ; row1, row3 + pshufb m1, m5 ; row1, row3 + lea r2, [r0+FDEC_STRIDE*8] + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4 + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2 + RET + %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- diff --git a/common/x86/dct.h b/common/x86/dct.h index 9be71258..0a8ef374 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -68,6 +68,7 @@ void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); +void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_dct4x4dc_mmx ( int16_t d[16] ); void x264_dct4x4dc_sse2 ( int32_t d[16] ); -- 2.40.0