From e7cb328580c3e1bd7604a64f40abf3e03c474771 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <henrik@gramner.com>
Date: Tue, 14 May 2013 18:57:40 +0200
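Subject: [PATCH] x86: AVX2 dequant_4x4_dc

Instantiate DEQUANT_DC under INIT_YMM avx2 for both the pmaddwd and the
pmullw variants, declare x264_dequant_4x4dc_avx2 in quant.h and select it
for pf->dequant_4x4_dc in x264_quant_init().

To make the macro usable with ymm registers, the scalar movd loads and
the shift counts now use xm registers, the dequant coefficient is
broadcast with vpbroadcastdct on AVX2, and the loop counter becomes the
macro-local %%x. The lshift loop and the high bit depth rshift32 loop
also take the coefficients as a memory operand instead of a separate
mova load.

Additionally, move the coeff_last[DCT_LUMA_8x8] avx2_lzcnt assignment in
quant.c into the existing X264_CPU_LZCNT block instead of testing the
flag twice.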
---
 common/quant.c         |  5 +--
 common/x86/quant-a.asm | 75 ++++++++++++++++++++++++------------------
 common/x86/quant.h     |  1 +
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/common/quant.c b/common/quant.c
index 2fa1521d..bb87d70a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -545,6 +545,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
         pf->dequant_4x4 = x264_dequant_4x4_avx2;
         pf->dequant_8x8 = x264_dequant_8x8_avx2;
+        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
         pf->denoise_dct = x264_denoise_dct_avx2;
     }
 #endif // HAVE_MMX
@@ -691,10 +692,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
         pf->quant_8x8 = x264_quant_8x8_avx2;
         pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
-        if( cpu&X264_CPU_LZCNT )
-            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
         pf->dequant_4x4 = x264_dequant_4x4_avx2;
         pf->dequant_8x8 = x264_dequant_8x8_avx2;
+        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
         if( h->param.i_cqm_preset == X264_CQM_FLAT )
         {
             pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
@@ -704,6 +704,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->denoise_dct = x264_denoise_dct_avx2;
         if( cpu&X264_CPU_LZCNT )
         {
+            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
         }
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 02328a9a..def485ee 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -757,55 +757,62 @@ cglobal dequant_4x4dc, 0,3,6
     DEQUANT_START 6, 6
 
 .lshift:
-    movd     m3, [r1]
-    movd     m2, t0d
-    pslld    m3, m2
-    SPLAT%1  m3, m3, 0
-%assign x 0
-%rep SIZEOF_PIXEL*16/mmsize
-    mova     m0, [r0+mmsize*0+x]
-    mova     m1, [r0+mmsize*1+x]
-    %2       m0, m3
-    %2       m1, m3
-    mova     [r0+mmsize*0+x], m0
-    mova     [r0+mmsize*1+x], m1
-%assign x x+mmsize*2
+%if cpuflag(avx2)
+    vpbroadcastdct m3, [r1]
+%else
+    movd    xm3, [r1]
+    SPLAT%1  m3, xm3
+%endif
+    movd    xm2, t0d
+    pslld    m3, xm2
+%assign %%x 0
+%rep SIZEOF_PIXEL*32/mmsize
+    %2       m0, m3, [r0+%%x]
+    mova     [r0+%%x], m0
+%assign %%x %%x+mmsize
 %endrep
     RET
 
 .rshift32:
-    neg   t0d
-    movd  m3, t0d
-    mova  m4, [p%1_1]
-    mova  m5, m4
-    pslld m4, m3
-    psrld m4, 1
-    movd  m2, [r1]
-%assign x 0
+    neg      t0d
+%if cpuflag(avx2)
+    vpbroadcastdct m2, [r1]
+%else
+    movd     xm2, [r1]
+%endif
+    mova      m5, [p%1_1]
+    movd     xm3, t0d
+    pslld     m4, m5, xm3
+    psrld     m4, 1
 %if HIGH_BIT_DEPTH
-    pshufd m2, m2, 0
+%if notcpuflag(avx2)
+    pshufd    m2, m2, 0
+%endif
+%assign %%x 0
 %rep SIZEOF_PIXEL*32/mmsize
-    mova      m0, [r0+x]
-    pmadcswd  m0, m0, m2, m4
-    psrad     m0, m3
-    mova      [r0+x], m0
-%assign x x+mmsize
+    pmadcswd  m0, m2, [r0+%%x], m4
+    psrad     m0, xm3
+    mova      [r0+%%x], m0
+%assign %%x %%x+mmsize
 %endrep
 
 %else ; !HIGH_BIT_DEPTH
+%if notcpuflag(avx2)
     PSHUFLW   m2, m2, 0
+%endif
     punpcklwd m2, m4
+%assign %%x 0
 %rep SIZEOF_PIXEL*32/mmsize
-    mova      m0, [r0+x]
+    mova      m0, [r0+%%x]
     punpckhwd m1, m0, m5
     punpcklwd m0, m5
     pmaddwd   m0, m2
     pmaddwd   m1, m2
-    psrad     m0, m3
-    psrad     m1, m3
+    psrad     m0, xm3
+    psrad     m1, xm3
     packssdw  m0, m1
-    mova      [r0+x], m0
-%assign x x+mmsize
+    mova      [r0+%%x], m0
+%assign %%x %%x+mmsize
 %endrep
 %endif ; !HIGH_BIT_DEPTH
     RET
@@ -816,6 +823,8 @@ INIT_XMM sse2
 DEQUANT_DC d, pmaddwd
 INIT_XMM xop
 DEQUANT_DC d, pmaddwd
+INIT_YMM avx2
+DEQUANT_DC d, pmaddwd
 %else
 %if ARCH_X86_64 == 0
 INIT_MMX mmx2
@@ -825,6 +834,8 @@ INIT_XMM sse2
 DEQUANT_DC w, pmullw
 INIT_XMM avx
 DEQUANT_DC w, pmullw
+INIT_YMM avx2
+DEQUANT_DC w, pmullw
 %endif
 
 ; t4 is eax for return value.
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 5541db03..089942ad 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -65,6 +65,7 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-- 
2.40.0