;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------
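; Fills the 16x16 block with a single DC value:
; ( sum of the 16 pixels above + i_dc_left ) >> 5, where i_dc_left is the
; left-column sum plus rounding precomputed by the C wrapper. The _dc_top
; variant reuses the same macro with a +8 bias and a >>4 shift.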
-
-%macro PRED16x16_DC 2
+%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+ 8]
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
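; x86-64: i_dc_left is already in a register; x86-32 reads it from the stack (r1m)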
movd m6, r1d
- PRED16x16_DC m6, 5
+ PRED16x16_DC_MMX m6, 5
%else
- PRED16x16_DC r1m, 5
+ PRED16x16_DC_MMX r1m, 5
%endif
RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC [pw_8], 4
+ PRED16x16_DC_MMX [pw_8], 4
RET
INIT_MMX mmx2
RET
%endif
-;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core( pixel *src, int i_dc_left )
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC_SSE2 2
+%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
- mova m0, [r0 - FDEC_STRIDEB+ 0]
- paddw m0, [r0 - FDEC_STRIDEB+16]
- HADDW m0, m2
- paddw m0, %1
- psrlw m0, %2
- SPLATW m0, m0
+ mova xm0, [r0 - FDEC_STRIDEB+ 0]
+ paddw xm0, [r0 - FDEC_STRIDEB+16]
+ HADDW xm0, xm2
+ paddw xm0, %1
+ psrlw xm0, %2
+ SPLATW m0, xm0
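; a 16-bit-pixel row is 32 bytes: one ymm store per row with AVX2, two xmm stores with SSE2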
+%if mmsize == 32
+ STORE16 m0
+%else
STORE16 m0, m0
+%endif
%else ; !HIGH_BIT_DEPTH
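; 8-bit: psadbw against a zeroed register sums the top-row bytes (two 8-byte partial sums)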
pxor m0, m0
psadbw m0, [r0 - FDEC_STRIDE]
%endif
%endmacro
-INIT_XMM sse2
+%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
- movd m3, r1m
- PRED16x16_DC_SSE2 m3, 5
+ movd xm3, r1m
+ PRED16x16_DC xm3, 5
RET
cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC_SSE2 [pw_8], 4
+ PRED16x16_DC [pw_8], 4
RET
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
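; dc_left_core receives the final DC value: broadcast it (pack to bytes for 8-bit) and store all 16 rows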
- movd m0, r1m
- SPLATW m0, m0
+ movd xm0, r1m
+ SPLATW m0, xm0
+%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
- RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,1
- movd m0, r1m
- SPLATW m0, m0
+%else
+%if HIGH_BIT_DEPTH == 0
packuswb m0, m0
+%endif
STORE16 m0
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+PREDICT_16x16_DC_CORE
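; 16-bit pixels: 32-byte rows, so the avx2 version uses ymm; 8-bit rows fit in xmm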
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+PREDICT_16x16_DC_CORE
+%else
+INIT_XMM avx2
+PREDICT_16x16_DC_CORE
%endif
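/* C-side wrappers: compute the left-column DC term and hand it to the asm cores above */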
PREDICT_16x16_DC( mmx2 )
PREDICT_16x16_DC( sse2 )
+PREDICT_16x16_DC( avx2 )
#define PREDICT_16x16_DC_LEFT(name)\
static void x264_predict_16x16_dc_left_##name( pixel *src )\
PREDICT_16x16_DC_LEFT( mmx2 )
PREDICT_16x16_DC_LEFT( sse2 )
+PREDICT_16x16_DC_LEFT( avx2 )
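/* A minimal sketch (not the verbatim x264 wrappers; loop shown un-unrolled) of
 * what PREDICT_16x16_DC(avx2) and PREDICT_16x16_DC_LEFT(avx2) expand to: the C
 * side sums the left column, the asm core then adds the top row (dc_core) or
 * simply broadcasts the final value (dc_left_core). */
static void x264_predict_16x16_dc_avx2( pixel *src )
{
    int dc = 16;                                   /* rounding for (left+top+16)>>5 */
    for( int i = 0; i < 16; i++ )
        dc += src[-1 + i*FDEC_STRIDE];             /* left column */
    x264_predict_16x16_dc_core_avx2( src, dc );    /* asm: add top row, >>5, store */
}
static void x264_predict_16x16_dc_left_avx2( pixel *src )
{
    int dc = 8;                                    /* rounding for (left+8)>>4 */
    for( int i = 0; i < 16; i++ )
        dc += src[-1 + i*FDEC_STRIDE];
    x264_predict_16x16_dc_left_core_avx2( src, dc >> 4 ); /* asm: broadcast, store */
}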
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
if( cpu&X264_CPU_AVX2 )
{
pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2;
}
}
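/* prototypes for the asm entry points */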
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
+void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
+void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
-void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
+void x264_predict_16x16_dc_top_avx2( pixel *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );