filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
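; filt_mul51 holds the (-5, 1) pair of the 6-tap (1, -5, 20, 20, -5, 1) halfpel
; filter as interleaved signed bytes, so a single pmaddubsw accumulates two taps
; per output; 'times 16' widens it to 32 bytes so the same constant also serves
; the ymm code paths.  deinterleave_shuf is a pshufb mask that separates even-
; and odd-indexed bytes.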
+
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
HPEL_C
HPEL_V 0
HPEL_H
-%endif
+INIT_YMM avx2
+HPEL_V 8
+HPEL_C
add r2, mmsize
jl .loop
RET
+%endif
%if ARCH_X86_64
%macro DO_FILT_V 5
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
- ;+16 is picked somewhat arbitrarily here based on the fact that even one
+ ;+mmsize is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
- prefetcht0 [r1+r2*2+16]
+ prefetcht0 [r1+r2*2+mmsize]
%if cpuflag(ssse3)
mova m1, [r3]
mova m2, [r3+r2]
packuswb %3, %4
FILT_V2 m1, m2, m3, m4, m5, m6
%endif
- add r3, 16
- add r1, 16
+ add r3, mmsize
+ add r1, mmsize
+%if mmsize==32
+ vinserti128 %1, m1, xm4, 1
+ vperm2i128 %2, m1, m4, q0301
+%else
mova %1, m1
mova %2, m4
+%endif
FILT_PACK m1, m4, m15, 5
movntps [r8+r4+%5], m1
%endmacro
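With ymm registers the vertical-filter results in m1 and m4 are split across two independent 128-bit lanes, so the mmsize==32 branch above regroups them: %1 collects the two low lanes and %2 the two high lanes, so each output register again covers a contiguous run of pixels. (The prefetcht0 at the top of the macro is the plain _mm_prefetch(p, _MM_HINT_T0) hint.) A minimal intrinsics sketch of that regrouping, with hypothetical names, assuming the same lane layout:

#include <immintrin.h>

/* a ~ m1, b ~ m4: each holds two 16-byte lanes of vertical-filter output. */
static inline void regroup_lanes( __m256i a, __m256i b, __m256i *lo, __m256i *hi )
{
    /* vinserti128 %1, m1, xm4, 1: keep a's low lane, place b's low lane on top */
    *lo = _mm256_inserti128_si256( a, _mm256_castsi256_si128( b ), 1 );
    /* vperm2i128 %2, m1, m4, q0301 (0x31): low lane <- a's high lane,
       high lane <- b's high lane */
    *hi = _mm256_permute2x128_si256( a, b, 0x31 );
}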
-%macro FILT_C 4
- PALIGNR m1, %2, %1, 12, m2
- PALIGNR m2, %2, %1, 14, %1
+%macro FILT_C 3
+%if mmsize==32
+ vperm2i128 m3, %2, %1, q0003
+%endif
+ PALIGNR m1, %2, %1, (mmsize-4), m3
+ PALIGNR m2, %2, %1, (mmsize-2), m3
+%if mmsize==32
+ vperm2i128 %1, %3, %2, q0003
+%endif
PALIGNR m3, %3, %2, 4, %1
PALIGNR m4, %3, %2, 2, %1
paddw m3, m2
+%if mmsize==32
+ mova m2, %1
+%endif
mova %1, %3
- PALIGNR %3, %2, 6, m2
+ PALIGNR %3, %3, %2, 6, m2
paddw m4, %2
paddw %3, m1
FILT_H %3, m3, m4
%endmacro
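vpalignr on ymm registers shifts within each 128-bit lane only, so the mmsize==32 paths above (and in DO_FILT_H below) first build a helper register with vperm2i128 ..., q0003 (imm 0x03) whose lanes bridge the two source registers; a per-lane PALIGNR against that helper then behaves like a genuine 32-byte sliding window. A hedged intrinsics sketch of both directions (macro names are mine, not x264's; the byte offset must be a compile-time constant):

#include <immintrin.h>

/* Bytes at offset -(16-k) relative to cur (k in 1..15), spilling into prev:
   mirrors PALIGNR m2, %2, %1, (mmsize-2), m3 above, with k = 14 (offset -2). */
#define ALIGNR256_MINUS( cur, prev, k ) \
    _mm256_alignr_epi8( (cur), _mm256_permute2x128_si256( (cur), (prev), 0x03 ), (k) )

/* Bytes at offset +k relative to cur (k in 1..15), spilling into next:
   mirrors PALIGNR m4, %3, %2, 2, %1 above, with k = 2. */
#define ALIGNR256_PLUS( next, cur, k ) \
    _mm256_alignr_epi8( _mm256_permute2x128_si256( (next), (cur), 0x03 ), (cur), (k) )

The asm computes each vperm2i128 helper once and reuses it for several offsets, which is why it is materialized into a register (m3 or %1) rather than folded into every PALIGNR as in this sketch.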
%macro DO_FILT_C 4
- FILT_C %1, %2, %3, 6
- FILT_C %2, %1, %4, 6
+ FILT_C %1, %2, %3
+ FILT_C %2, %1, %4
FILT_PACK %3, %4, m15, 6
+%if mmsize==32
+ vpermq %3, %3, q3120
+%endif
movntps [r5+r4], %3
%endmacro
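FILT_PACK ends in a packuswb, and on ymm registers packuswb packs each 128-bit lane separately, which leaves the two packed halves interleaved by 64-bit quadword; the vpermq %3, %3, q3120 (imm 0xd8) restores linear pixel order before the store. A sketch of just that reordering (the rounding/shift done by FILT_PACK is omitted; names are illustrative):

#include <immintrin.h>

/* Pack two vectors of 16-bit values to bytes in linear order.  A plain ymm
   packus gives { pack(a.lane0), pack(b.lane0), pack(a.lane1), pack(b.lane1) }
   by quadword; permuting the quadwords with 0xd8 (x264's q3120) yields
   pack(a) followed by pack(b). */
static inline __m256i pack_linear_u8( __m256i a, __m256i b )
{
    __m256i p = _mm256_packus_epi16( a, b );
    return _mm256_permute4x64_epi64( p, 0xd8 );
}

The movntps that follows is a non-temporal (streaming) store; dstc is write-only here, so bypassing the cache is presumably the point.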
%macro DO_FILT_H 3
- PALIGNR m1, %2, %1, 14, m3
- PALIGNR m2, %2, %1, 15, m3
+%if mmsize==32
+ vperm2i128 m3, %2, %1, q0003
+%endif
+ PALIGNR m1, %2, %1, (mmsize-2), m3
+ PALIGNR m2, %2, %1, (mmsize-1), m3
+%if mmsize==32
+ vperm2i128 m3, %3, %2, q0003
+%endif
PALIGNR m4, %3, %2, 1 , m3
PALIGNR m5, %3, %2, 2 , m3
PALIGNR m6, %3, %2, 3 , m3
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
;                   intptr_t stride, int width, int height, int16_t *buf )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
mov r7, r3
- sub r5d, 16
+ sub r5d, mmsize
mov r8, r1
- and r7, 15
+ and r7, mmsize-1
sub r3, r7
add r0, r5
add r8, r5
DO_FILT_V m8, m7, m13, m12, 0
;ALIGN 16
.loopx:
- DO_FILT_V m6, m5, m11, m12, 16
+ DO_FILT_V m6, m5, m11, m12, mmsize
.lastx:
%if cpuflag(ssse3)
psrlw m15, 1 ; pw_512
%else
psrlw m15, 1 ; pw_16
%endif
- movdqa m7, m5
+ mova m7, m5
DO_FILT_H m10, m13, m11
- add r4, 16
+ add r4, mmsize
jl .loopx
- cmp r4, 16
+ cmp r4, mmsize
jl .lastx
; setup regs for next y
sub r4, r7
HPEL
INIT_XMM avx
HPEL
+INIT_YMM avx2
+HPEL
%endif ; ARCH_X86_64
%undef movntq
void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
+HPEL(32, avx2, avx2, avx2, avx2)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
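For reference, everything above computes the standard H.264 6-tap (1, -5, 20, 20, -5, 1) half-pel interpolation: dstv and dsth are the vertically resp. horizontally filtered planes ((sum + 16) >> 5), and dstc filters the 16-bit vertical intermediates a second time horizontally ((sum + 512) >> 10). A plain scalar sketch of the 8-bit case, written from that definition rather than copied from x264's C reference, so the padding and buffer handling are simplified:

#include <stdint.h>

static inline uint8_t clip_u8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;
}

/* 6-tap filter: 1, -5, 20, 20, -5, 1 */
static inline int tap6( int a, int b, int c, int d, int e, int f )
{
    return a - 5*b + 20*c + 20*d - 5*e + f;
}

/* src must be padded by at least 2 pixels left/up and 3 pixels right/down;
   buf must hold at least width+5 int16_t. */
static void hpel_filter_ref( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             uint8_t *src, intptr_t stride, int width,
                             int height, int16_t *buf )
{
    for( int y = 0; y < height; y++ )
    {
        /* vertical pass: keep the unrounded sums for the centre plane */
        for( int x = -2; x < width+3; x++ )
        {
            int v = tap6( src[x-2*stride], src[x-stride], src[x],
                          src[x+stride], src[x+2*stride], src[x+3*stride] );
            buf[x+2] = (int16_t)v;
            if( x >= 0 && x < width )
                dstv[x] = clip_u8( (v + 16) >> 5 );
        }
        /* centre: horizontal 6-tap over the vertical intermediates */
        for( int x = 0; x < width; x++ )
            dstc[x] = clip_u8( (tap6( buf[x], buf[x+1], buf[x+2],
                                      buf[x+3], buf[x+4], buf[x+5] ) + 512) >> 10 );
        /* horizontal pass directly on the source pixels */
        for( int x = 0; x < width; x++ )
            dsth[x] = clip_u8( (tap6( src[x-2], src[x-1], src[x],
                                      src[x+1], src[x+2], src[x+3] ) + 16) >> 5 );
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src  += stride;
    }
}

This is only a behavioural reference for reading the asm; the actual scalar reference lives in x264's common/mc.c.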