From 5f5fa1e9dc6a7dd51fa6c2da243e27fae845887d Mon Sep 17 00:00:00 2001
From: Holger Lubitz <holger@lubitz.org>
Date: Wed, 4 Feb 2009 12:46:17 -0800
Subject: [PATCH] Merging Holger's GSOC branch part 2: intra prediction
 Assembly versions of most remaining 4x4 and 8x8 intra pred functions.
 Assembly version of predict_8x8_filter. A few other optimizations. Primarily
 Core 2-optimized.

---
 common/common.h          |   1 +
 common/predict.c         |  12 +-
 common/predict.h         |   3 +-
 common/x86/predict-a.asm | 461 ++++++++++++++++++++++++++++++++++++---
 common/x86/predict-c.c   |  62 ++++--
 common/x86/predict.h     |   3 +-
 encoder/analyse.c        |   2 +-
 encoder/encoder.c        |   2 +-
 tools/checkasm.c         |  32 ++-
 9 files changed, 520 insertions(+), 58 deletions(-)

diff --git a/common/common.h b/common/common.h
index 97c68781..394f9dab 100644
--- a/common/common.h
+++ b/common/common.h
@@ -616,6 +616,7 @@ struct x264_t
     x264_predict_t      predict_8x8c[4+3];
     x264_predict8x8_t   predict_8x8[9+3];
     x264_predict_t      predict_4x4[9+3];
+    x264_predict_8x8_filter_t predict_8x8_filter;
 
     x264_pixel_function_t pixf;
     x264_mc_functions_t   mc;
diff --git a/common/predict.c b/common/predict.c
index 9b6c600b..3c6cb108 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -506,7 +506,7 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in
     int have_lt = i_neighbor & MB_TOPLEFT;
     if( i_filters & MB_LEFT )
     {
-        edge[15] = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+        edge[15] = (SRC(0,-1) + 2*SRC(-1,-1) + SRC(-1,0) + 2) >> 2;
         edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0))
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;
         PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)
@@ -519,8 +519,8 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in
         edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1))
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;
         PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)
-        edge[23] = ((have_tr ? SRC(8,-1) : SRC(7,-1))
-                    + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2;
+        edge[23] = (SRC(6,-1) + 2*SRC(7,-1)
+                    + (have_tr ? SRC(8,-1) : SRC(7,-1)) + 2) >> 2;
 
         if( i_filters & MB_TOPRIGHT )
         {
@@ -563,7 +563,6 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in
         src += FDEC_STRIDE; \
     }
 
-/* SIMD is much faster than C for all of these except HU and HD. */
 static void predict_8x8_dc_128( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_DC(0x80808080);
@@ -795,7 +794,7 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
 #endif
 }
 
-void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
+void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
 {
     pf[I_PRED_8x8_V]      = predict_8x8_v;
     pf[I_PRED_8x8_H]      = predict_8x8_h;
@@ -809,9 +808,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
     pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left;
     pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top;
     pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128;
+    *predict_8x8_filter   = x264_predict_8x8_filter;
 
 #ifdef HAVE_MMX
-    x264_predict_8x8_init_mmx( cpu, pf );
+    x264_predict_8x8_init_mmx( cpu, pf, predict_8x8_filter );
 #endif
 }
 
diff --git a/common/predict.h b/common/predict.h
index 65912190..630cadd1 100644
--- a/common/predict.h
+++ b/common/predict.h
@@ -26,6 +26,7 @@
 
 typedef void (*x264_predict_t)( uint8_t *src );
 typedef void (*x264_predict8x8_t)( uint8_t *src, uint8_t edge[33] );
+typedef void (*x264_predict_8x8_filter_t) ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
 
 enum intra_chroma_pred_e
 {
@@ -111,7 +112,7 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in
 void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
 void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
 void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12] );
+void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
 
 
 #endif
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 0f64ca21..5596189e 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -25,14 +25,15 @@
 %include "x86util.asm"
 
 %macro STORE8x8 2
-    movq        [r0 + 0*FDEC_STRIDE], %1
-    movq        [r0 + 1*FDEC_STRIDE], %1
-    movq        [r0 + 2*FDEC_STRIDE], %1
-    movq        [r0 + 3*FDEC_STRIDE], %1
-    movq        [r0 + 4*FDEC_STRIDE], %2
-    movq        [r0 + 5*FDEC_STRIDE], %2
-    movq        [r0 + 6*FDEC_STRIDE], %2
-    movq        [r0 + 7*FDEC_STRIDE], %2
+    add r0, 4*FDEC_STRIDE                   ; r0 += 4 rows; NOTE: r0 is left modified when the macro ends
+    movq        [r0 + -4*FDEC_STRIDE], %1   ; rows 0-3 (relative to the entry r0) receive %1
+    movq        [r0 + -3*FDEC_STRIDE], %1
+    movq        [r0 + -2*FDEC_STRIDE], %1
+    movq        [r0 + -1*FDEC_STRIDE], %1
+    movq        [r0 +  0*FDEC_STRIDE], %2   ; rows 4-7 (relative to the entry r0) receive %2
+    movq        [r0 +  1*FDEC_STRIDE], %2
+    movq        [r0 +  2*FDEC_STRIDE], %2
+    movq        [r0 +  3*FDEC_STRIDE], %2
 %endmacro
 
 %macro STORE16x16 2
@@ -52,15 +53,24 @@
 %endmacro
 
 %macro STORE16x16_SSE2 1
-    mov         r1d, 4
-.loop:
-    movdqa      [r0 + 0*FDEC_STRIDE], %1
-    movdqa      [r0 + 1*FDEC_STRIDE], %1
-    movdqa      [r0 + 2*FDEC_STRIDE], %1
-    movdqa      [r0 + 3*FDEC_STRIDE], %1
-    add         r0, 4*FDEC_STRIDE
-    dec         r1d
-    jg          .loop
+    add r0, 4*FDEC_STRIDE                     ; r0 -> row 4; the eight stores below cover rows 0-7
+    movdqa      [r0 + -4*FDEC_STRIDE], %1
+    movdqa      [r0 + -3*FDEC_STRIDE], %1
+    movdqa      [r0 + -2*FDEC_STRIDE], %1
+    movdqa      [r0 + -1*FDEC_STRIDE], %1
+    movdqa      [r0 +  0*FDEC_STRIDE], %1
+    movdqa      [r0 +  1*FDEC_STRIDE], %1
+    movdqa      [r0 +  2*FDEC_STRIDE], %1
+    movdqa      [r0 +  3*FDEC_STRIDE], %1
+    add r0, 8*FDEC_STRIDE                     ; r0 -> row 12; the eight stores below cover rows 8-15 (r0 stays at row 12 afterwards)
+    movdqa      [r0 + -4*FDEC_STRIDE], %1
+    movdqa      [r0 + -3*FDEC_STRIDE], %1
+    movdqa      [r0 + -2*FDEC_STRIDE], %1
+    movdqa      [r0 + -1*FDEC_STRIDE], %1
+    movdqa      [r0 +  0*FDEC_STRIDE], %1
+    movdqa      [r0 +  1*FDEC_STRIDE], %1
+    movdqa      [r0 +  2*FDEC_STRIDE], %1
+    movdqa      [r0 +  3*FDEC_STRIDE], %1
 %endmacro
 
 SECTION_RODATA
@@ -102,18 +112,17 @@ SECTION .text
 ; void predict_4x4_ddl_mmxext( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_4x4_ddl_mmxext, 1,1
-    sub         r0, FDEC_STRIDE
-    movq        mm3, [r0]
-    movq        mm1, [r0-1]
-    movq        mm2, mm3
-    movq        mm4, [pb_0s_ff GLOBAL]
-    psrlq       mm2, 8
-    pand        mm4, mm3
-    por         mm2, mm4
+    movq    mm1, [r0-FDEC_STRIDE]
+    movq    mm2, mm1
+    movq    mm3, mm1
+    movq    mm4, mm1
+    psllq   mm1, 8
+    pxor    mm2, mm1
+    psrlq   mm2, 8
+    pxor    mm3, mm2
+    PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5
 
-    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
-
-%assign Y 1
+%assign Y 0
 %rep 4
     psrlq       mm0, 8
     movd        [r0+Y*FDEC_STRIDE], mm0
@@ -122,6 +131,121 @@ cglobal predict_4x4_ddl_mmxext, 1,1
 
     RET
 
+;-----------------------------------------------------------------------------
+; void predict_4x4_{ddr,vr,hd}_{mmxext,ssse3}( uint8_t *src )
+;-----------------------------------------------------------------------------
+%macro PREDICT_4x4 1
+cglobal predict_4x4_ddr_%1, 1,1
+    movq      mm1, [r0+1*FDEC_STRIDE-8]             ; byte 7 = l1 (pixel left of row 1)
+    movq      mm2, [r0+0*FDEC_STRIDE-8]             ; byte 7 = l0
+    punpckhbw mm2, [r0-1*FDEC_STRIDE-8]             ; top word = lt l0 (MSB first)
+    movd      mm3, [r0-1*FDEC_STRIDE]               ; .. .. .. .. t3 t2 t1 t0
+    punpckhwd mm1, mm2                              ; top three bytes = lt l0 l1
+    PALIGNR   mm3, mm1, 5, mm4                      ; .. t3 t2 t1 t0 lt l0 l1
+    movq      mm1, mm3
+    PALIGNR   mm3, [r0+2*FDEC_STRIDE-8], 7, mm4     ; t3 t2 t1 t0 lt l0 l1 l2
+    movq      mm2, mm3
+    PALIGNR   mm3, [r0+3*FDEC_STRIDE-8], 7, mm4     ; t2 t1 t0 lt l0 l1 l2 l3
+    PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4         ; macro defined outside this hunk; presumably the 3-tap (a+2c+b+2)>>2 filter centred on mm2 - confirm
+%assign Y 3
+    movd    [r0+Y*FDEC_STRIDE], mm0                 ; row 3 = low dword of the filtered vector
+%rep 3
+%assign Y (Y-1)
+    psrlq    mm0, 8                                 ; each row above uses the vector shifted down by one byte
+    movd    [r0+Y*FDEC_STRIDE], mm0                 ; rows 2, 1, 0
+%endrep
+    RET
+
+cglobal predict_4x4_vr_%1, 1,1
+    movd    mm0, [r0-1*FDEC_STRIDE]              ; ........t3t2t1t0
+    movq    mm7, mm0
+    PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1    ; ......t3t2t1t0lt
+    pavgb   mm7, mm0                             ; rounded averages of (t0,lt) (t1,t0) (t2,t1) (t3,t2) in the low dword
+    PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1    ; ....t3t2t1t0ltl0
+    movq    mm1, mm0
+    PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2    ; ..t3t2t1t0ltl0l1
+    movq    mm2, mm0
+    PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3    ; t3t2t1t0ltl0l1l2
+    PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4      ; filtered edge, centred on mm2 (macro defined outside this hunk)
+    movq    mm1, mm3
+    psrlq   mm3, 16                              ; drop the two lowest filtered bytes
+    psllq   mm1, 48                              ; keep those two bytes at the top of mm1
+    movd   [r0+0*FDEC_STRIDE], mm7               ; row 0 = averages
+    movd   [r0+1*FDEC_STRIDE], mm3               ; row 1 = filtered values
+    PALIGNR mm7, mm1, 7, mm2                     ; shift row 0 up one byte, pulling in the top byte of mm1
+    psllq   mm1, 8                               ; expose the next saved byte
+    movd   [r0+2*FDEC_STRIDE], mm7               ; row 2
+    PALIGNR mm3, mm1, 7, mm2                     ; shift row 1 up one byte, pulling in the top byte of mm1
+    movd   [r0+3*FDEC_STRIDE], mm3               ; row 3
+    RET
+
+cglobal predict_4x4_hd_%1, 1,1
+    movd      mm0, [r0-1*FDEC_STRIDE-4] ; lt ..
+    punpckldq mm0, [r0-1*FDEC_STRIDE]   ; t3 t2 t1 t0 lt .. .. ..
+    psllq     mm0, 8                    ; t2 t1 t0 lt .. .. .. ..
+    movq      mm1, [r0+3*FDEC_STRIDE-8] ; l3
+    punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3
+    movq      mm2, [r0+1*FDEC_STRIDE-8] ; l1
+    punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1
+    punpckhwd mm1, mm2                  ; l0 l1 l2 l3
+    punpckhdq mm1, mm0                  ; t2 t1 t0 lt l0 l1 l2 l3
+    movq      mm0, mm1
+    movq      mm2, mm1
+    movq      mm7, mm1
+    psrlq     mm0, 16                   ; .. .. t2 t1 t0 lt l0 l1
+    psrlq     mm2, 8                    ; .. t2 t1 t0 lt l0 l1 l2
+    pavgb     mm7, mm2                  ; rounded averages of adjacent edge pixels
+    PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4 ; filtered edge, centred on mm2 (macro defined outside this hunk)
+    punpcklbw mm7, mm3                  ; interleave low four averages (even bytes) with filtered values (odd bytes)
+    psrlq     mm3, 32                   ; bring the upper filtered bytes down
+    PALIGNR   mm3, mm7, 6, mm6          ; row 0 vector: top word of mm7 followed by the low bytes of mm3
+%assign Y 3
+    movd     [r0+Y*FDEC_STRIDE], mm7    ; row 3
+%rep 2
+%assign Y (Y-1)
+    psrlq     mm7, 16                   ; step one (average, filtered) pair per row
+    movd     [r0+Y*FDEC_STRIDE], mm7    ; rows 2, 1
+%endrep
+    movd     [r0+0*FDEC_STRIDE], mm3    ; row 0
+    RET
+%endmacro
+
+%define PALIGNR PALIGNR_MMX
+PREDICT_4x4 mmxext
+%define PALIGNR PALIGNR_SSSE3
+PREDICT_4x4 ssse3
+
+;-----------------------------------------------------------------------------
+; void predict_4x4_hu_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+cglobal predict_4x4_hu_mmxext, 1,1
+    movq      mm0, [r0+0*FDEC_STRIDE-8]     ; byte 7 = l0 (pixel left of row 0)
+    punpckhbw mm0, [r0+1*FDEC_STRIDE-8]     ; top word = l1 l0 (MSB first)
+    movq      mm1, [r0+2*FDEC_STRIDE-8]     ; byte 7 = l2
+    punpckhbw mm1, [r0+3*FDEC_STRIDE-8]     ; top word = l3 l2
+    punpckhwd mm0, mm1                      ; top dword = l3 l2 l1 l0
+    movq      mm1, mm0
+    punpckhbw mm1, mm1                      ; l3 l3 l2 l2 l1 l1 l0 l0
+    pshufw    mm1, mm1, 0xFF                ; l3 replicated into all eight bytes
+    punpckhdq mm0, mm1                      ; l3 l3 l3 l3 l3 l2 l1 l0
+    movq      mm2, mm0
+    movq      mm3, mm0
+    movq      mm7, mm0
+    psrlq     mm2, 16                       ; .. .. l3 l3 l3 l3 l3 l2
+    psrlq     mm3, 8                        ; .. l3 l3 l3 l3 l3 l2 l1
+    pavgb     mm7, mm3                      ; rounded averages of neighbouring left pixels
+    PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5 ; filtered values centred on mm3 (macro defined outside this hunk)
+    punpcklbw mm7, mm4                      ; interleave averages (even bytes) with filtered values (odd bytes)
+%assign Y 0
+    movd    [r0+Y*FDEC_STRIDE], mm7         ; row 0
+%rep 2
+%assign Y (Y+1)
+    psrlq    mm7, 16                        ; step one (average, filtered) pair per row
+    movd    [r0+Y*FDEC_STRIDE], mm7         ; rows 1, 2
+%endrep
+    movd    [r0+3*FDEC_STRIDE], mm1         ; row 3 = l3 l3 l3 l3
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void predict_4x4_vl_mmxext( uint8_t *src )
 ;-----------------------------------------------------------------------------
@@ -170,6 +294,120 @@ cglobal predict_4x4_dc_mmxext, 1,4
     mov   [r0+FDEC_STRIDE*3], r1d
     RET
 
+%macro PREDICT_FILTER 1
+;-----------------------------------------------------------------------------
+;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
+;-----------------------------------------------------------------------------
+
+cglobal predict_8x8_filter_%1, 4,5
+    add          r0, 0x58                   ; bias r0; the "src" define below subtracts the same constant again
+%define src r0-0x58
+%ifndef ARCH_X86_64
+    mov          r4, r1                     ; non-x86_64 build: edge pointer moves to r4 so r1 can serve as scratch t4 (presumably because r4 has no byte form there - confirm)
+%define t1 r4
+%define t4 r1
+%else
+%define t1 r1
+%define t4 r4
+%endif
+    test        r3b, 0x01                   ; i_filters bit 0x01 (presumably MB_LEFT): left edge requested?
+    je .check_top
+    movq        mm0, [src+0*FDEC_STRIDE-8]  ; byte 7 = l0
+    punpckhbw   mm0, [src-1*FDEC_STRIDE-8]  ; top word = lt l0 (MSB first)
+    movq        mm1, [src+2*FDEC_STRIDE-8]  ; byte 7 = l2
+    punpckhbw   mm1, [src+1*FDEC_STRIDE-8]  ; top word = l1 l2
+    punpckhwd   mm1, mm0                    ; top dword = lt l0 l1 l2
+    movq        mm2, [src+4*FDEC_STRIDE-8]  ; byte 7 = l4
+    punpckhbw   mm2, [src+3*FDEC_STRIDE-8]  ; top word = l3 l4
+    movq        mm3, [src+6*FDEC_STRIDE-8]  ; byte 7 = l6
+    punpckhbw   mm3, [src+5*FDEC_STRIDE-8]  ; top word = l5 l6
+    punpckhwd   mm3, mm2                    ; top dword = l3 l4 l5 l6
+    punpckhdq   mm3, mm1                    ; lt l0 l1 l2 l3 l4 l5 l6
+    movq        mm0, [src+7*FDEC_STRIDE-8]  ; byte 7 = l7
+    movq        mm1, [src-1*FDEC_STRIDE]    ; t7 t6 t5 t4 t3 t2 t1 t0
+    movq        mm4, mm3
+    movq        mm2, mm3
+    PALIGNR     mm4, mm0, 7, mm0            ; l0 l1 l2 l3 l4 l5 l6 l7
+    PALIGNR     mm1, mm2, 1, mm2            ; t0 lt l0 l1 l2 l3 l4 l5
+    test        r2b, 0x08                   ; i_neighbor bit 0x08 (presumably MB_TOPLEFT)
+    je .fix_lt_1                            ; bit clear: patch lt out of mm1 first
+.do_left:
+    movq        mm0, mm4                    ; keep a copy of the l0..l7 vector
+    PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5 ; filter centred on mm3 (macro defined outside this hunk)
+    movq     [t1+8], mm2                    ; edge[8..15]
+    movq        mm4, mm0                    ; reload mm4 from the copy saved in mm0
+    PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5 ; filter centred on the l0..l7 vector; only its low byte is used
+    movd         t4, mm1
+    mov      [t1+7], t4b                    ; edge[7]
+.check_top:
+    test        r3b, 0x02                   ; i_filters bit 0x02 (presumably MB_TOP): top edge requested?
+    je .done
+    movq        mm0, [src-1*FDEC_STRIDE-8]  ; byte 7 = lt
+    movq        mm3, [src-1*FDEC_STRIDE]    ; t7 t6 t5 t4 t3 t2 t1 t0
+    movq        mm1, [src-1*FDEC_STRIDE+8]  ; t15 .. t8 (the eight bytes right of the top row)
+    movq        mm2, mm3
+    movq        mm4, mm3
+    PALIGNR     mm2, mm0, 7, mm0            ; t6 t5 t4 t3 t2 t1 t0 lt
+    PALIGNR     mm1, mm4, 1, mm4            ; t8 t7 t6 t5 t4 t3 t2 t1
+    test        r2b, 0x08                   ; i_neighbor bit 0x08 clear: replace lt (and then check top-right)
+    je .fix_lt_2
+    test        r2b, 0x04                   ; i_neighbor bit 0x04 (presumably MB_TOPRIGHT) clear: replace t8
+    je .fix_tr_1
+.do_top:
+    PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filter centred on the top row in mm3
+    movq    [t1+16], mm4                    ; edge[16..23]
+    test        r3b, 0x04                   ; i_filters bit 0x04 (presumably MB_TOPRIGHT): top-right edge requested?
+    je .done
+    test        r2b, 0x04                   ; i_neighbor bit 0x04 clear: take the .fix_tr_2 path instead of filtering
+    je .fix_tr_2
+    movq        mm0, [src-1*FDEC_STRIDE+8]  ; t15 .. t8
+    movq        mm5, mm0
+    movq        mm2, mm0
+    movq        mm4, mm0
+    psrlq       mm5, 56                     ; t15 alone in the low byte
+    PALIGNR     mm2, mm3, 7, mm3            ; t14 t13 t12 t11 t10 t9 t8 t7
+    PALIGNR     mm5, mm4, 1, mm4            ; t15 t15 t14 t13 t12 t11 t10 t9
+    PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4 ; filter centred on t15..t8
+    jmp .do_topright
+.fix_tr_2:
+    punpckhbw   mm3, mm3                    ; duplicate the upper four bytes of mm3 (loaded with the top row at .check_top)
+    pshufw      mm1, mm3, 0xFF              ; replicate the top word of mm3 into every word of mm1
+.do_topright:
+    movq    [t1+24], mm1                    ; edge[24..31]
+    psrlq       mm1, 56                     ; top byte of the vector just stored
+    movd         t4, mm1
+    mov     [t1+32], t4b                    ; edge[32]
+.done:
+    REP_RET
+.fix_lt_1:
+    movq        mm5, mm3                    ; xor trick: swap one byte of mm1 without disturbing the others
+    pxor        mm5, mm4                    ; byte 7 = lt ^ l0
+    psrlq       mm5, 56
+    psllq       mm5, 48                     ; isolate that difference in byte 6
+    pxor        mm1, mm5                    ; byte 6 of mm1: lt becomes l0
+    jmp .do_left
+.fix_lt_2:
+    movq        mm5, mm3
+    pxor        mm5, mm2                    ; byte 0 = t0 ^ lt
+    psllq       mm5, 56
+    psrlq       mm5, 56                     ; isolate that difference in byte 0
+    pxor        mm2, mm5                    ; byte 0 of mm2: lt becomes t0
+    test        r2b, 0x04                   ; top-right bit set: nothing more to patch
+    jne .do_top
+.fix_tr_1:
+    movq        mm5, mm3
+    pxor        mm5, mm1                    ; byte 7 = t7 ^ t8
+    psrlq       mm5, 56
+    psllq       mm5, 56                     ; isolate that difference in byte 7
+    pxor        mm1, mm5                    ; byte 7 of mm1: t8 becomes t7
+    jmp .do_top
+%endmacro
+
+%define PALIGNR PALIGNR_MMX
+PREDICT_FILTER mmxext
+%define PALIGNR PALIGNR_SSSE3
+PREDICT_FILTER ssse3
+
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
@@ -411,6 +649,173 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
 
     RET
 
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+%define PALIGNR PALIGNR_MMX
+cglobal predict_8x8_hd_mmxext, 2,2
+    add     r0, 4*FDEC_STRIDE   ; r0 -> row 4, so the stores below address rows 0-7 as offsets -4..+3
+    movq    mm0, [r1]           ; l7 .. .. .. .. .. .. ..
+    movq    mm1, [r1+8]         ; lt l0 l1 l2 l3 l4 l5 l6
+    movq    mm2, [r1+16]        ; t7 t6 t5 t4 t3 t2 t1 t0
+    movq    mm3, mm1            ; lt l0 l1 l2 l3 l4 l5 l6
+    movq    mm4, mm2            ; t7 t6 t5 t4 t3 t2 t1 t0
+    PALIGNR mm2, mm1, 7, mm5    ; t6 t5 t4 t3 t2 t1 t0 lt
+    PALIGNR mm1, mm0, 7, mm6    ; l0 l1 l2 l3 l4 l5 l6 l7
+    PALIGNR mm4, mm3, 1, mm7    ; t0 lt l0 l1 l2 l3 l4 l5
+    movq    mm5, mm3
+    pavgb   mm3, mm1            ; rounded averages of adjacent left-edge bytes
+    PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7 ; filtered left edge, centred on mm5 (macro defined outside this hunk)
+    movq    mm4, mm2
+    movq    mm1, mm2            ; t6 t5 t4 t3 t2 t1 t0 lt
+    psrlq   mm4, 16             ; .. .. t6 t5 t4 t3 t2 t1
+    psrlq   mm1, 8              ; .. t6 t5 t4 t3 t2 t1 t0
+    PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5
+                                ; .. p11 p10 p9
+    movq    mm7, mm3
+    punpcklbw mm3, mm0          ; p4 p3 p2 p1
+    punpckhbw mm7, mm0          ; p8 p7 p6 p5
+    movq    mm1, mm7
+    movq    mm0, mm7
+    movq    mm4, mm7
+    movq   [r0+3*FDEC_STRIDE], mm3  ; row 7
+    PALIGNR mm7, mm3, 2, mm5
+    movq   [r0+2*FDEC_STRIDE], mm7  ; row 6
+    PALIGNR mm1, mm3, 4, mm5
+    movq   [r0+1*FDEC_STRIDE], mm1  ; row 5
+    PALIGNR mm0, mm3, 6, mm5
+    movq    [r0+0*FDEC_STRIDE], mm0 ; row 4
+    movq    mm2, mm6
+    movq    mm3, mm6
+    movq   [r0-1*FDEC_STRIDE], mm4  ; row 3
+    PALIGNR mm6, mm4, 2, mm5
+    movq   [r0-2*FDEC_STRIDE], mm6  ; row 2
+    PALIGNR mm2, mm4, 4, mm5
+    movq   [r0-3*FDEC_STRIDE], mm2  ; row 1
+    PALIGNR mm3, mm4, 6, mm5
+    movq   [r0-4*FDEC_STRIDE], mm3  ; row 0
+    RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_hd_ssse3, 2,2
+    add       r0, 4*FDEC_STRIDE     ; r0 -> row 4, so rows 0-7 are offsets -4..+3
+    movdqa  xmm0, [r1]              ; edge[0..15]  (movdqa: edge must be 16-byte aligned)
+    movdqa  xmm1, [r1+16]           ; edge[16..31]
+    movdqa  xmm2, xmm1
+    movdqa  xmm3, xmm1
+    palignr xmm1, xmm0, 7           ; edge[7..22]
+    palignr xmm2, xmm0, 9           ; edge[9..24]
+    palignr xmm3, xmm0, 8           ; edge[8..23]
+    movdqa    xmm4, xmm1
+    pavgb   xmm4, xmm3              ; rounded averages of edge[7+k] and edge[8+k]
+    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5 ; filtered edge, centred on xmm3 (macro defined outside this hunk)
+    punpcklbw xmm4, xmm0            ; interleave the low 8 averages with the low 8 filtered bytes
+    movhlps xmm0, xmm4              ; low qword of xmm0 = high qword of xmm4; high qword keeps filtered bytes 8-15
+
+%assign Y 3
+%rep 3
+    movq   [r0+(Y)*FDEC_STRIDE], xmm4   ; rows 7, 6, 5
+    movq   [r0+(Y-4)*FDEC_STRIDE], xmm0 ; rows 3, 2, 1
+    psrldq xmm4, 2                      ; step one byte pair per row
+    psrldq xmm0, 2
+%assign Y (Y-1)
+%endrep
+    movq   [r0+(Y)*FDEC_STRIDE], xmm4   ; row 4
+    movq   [r0+(Y-4)*FDEC_STRIDE], xmm0 ; row 0
+    RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_hu_mmxext, 2,2
+    movq    mm1, [r1+7]         ; l0 l1 l2 l3 l4 l5 l6 l7
+    add      r0, 4*FDEC_STRIDE  ; r0 -> row 4, so rows 0-7 are offsets -4..+3
+    pshufw  mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+    psllq   mm1, 56             ; l7 .. .. .. .. .. .. ..
+    movq    mm2, mm0
+    psllw   mm0, 8              ; swap the two bytes of every word (this half and the psrlw below)
+    psrlw   mm2, 8
+    por     mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
+    movq    mm3, mm2
+    movq    mm4, mm2
+    movq    mm5, mm2
+    psrlq   mm2, 8
+    psrlq   mm3, 16
+    por     mm2, mm1            ; l7 l7 l6 l5 l4 l3 l2 l1
+    punpckhbw mm1, mm1          ; l7 l7 in the top word
+    por     mm3, mm1            ; l7 l7 l7 l6 l5 l4 l3 l2
+    pavgb   mm4, mm2            ; rounded averages of adjacent left pixels
+    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 ; filtered values centred on mm2 (macro defined outside this hunk)
+    movq    mm5, mm4
+    punpcklbw mm4, mm1          ; p4 p3 p2 p1
+    punpckhbw mm5, mm1          ; p8 p7 p6 p5
+    movq    mm6, mm5
+    movq    mm7, mm5
+    movq    mm0, mm5
+    PALIGNR mm5, mm4, 2, mm1    ; row 1: one word further along the p sequence
+    pshufw  mm1, mm6, 11111001b ; row 5: mm6 shifted down one word, top word replicated
+    PALIGNR mm6, mm4, 4, mm2    ; row 2
+    pshufw  mm2, mm7, 11111110b ; row 6: shifted down two words, top word replicated
+    PALIGNR mm7, mm4, 6, mm3    ; row 3
+    pshufw  mm3, mm0, 11111111b ; row 7: top word replicated everywhere
+    movq   [r0-4*FDEC_STRIDE], mm4  ; row 0
+    movq   [r0-3*FDEC_STRIDE], mm5  ; row 1
+    movq   [r0-2*FDEC_STRIDE], mm6  ; row 2
+    movq   [r0-1*FDEC_STRIDE], mm7  ; row 3
+    movq   [r0+0*FDEC_STRIDE], mm0  ; row 4
+    movq   [r0+1*FDEC_STRIDE], mm1  ; row 5
+    movq   [r0+2*FDEC_STRIDE], mm2  ; row 6
+    movq   [r0+3*FDEC_STRIDE], mm3  ; row 7
+    RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_hu_sse2, 2,2
+    movq      mm1, [r1+7]           ; l0 l1 l2 l3 l4 l5 l6 l7
+    add        r0, 4*FDEC_STRIDE    ; r0 -> row 4, so rows 0-7 are offsets -4..+3
+    pshufw    mm0, mm1, 00011011b   ; l6 l7 l4 l5 l2 l3 l0 l1
+    psllq     mm1, 56               ; l7 .. .. .. .. .. .. ..
+    movq      mm2, mm0
+    psllw     mm0, 8                ; swap the two bytes of every word (this half and the psrlw below)
+    psrlw     mm2, 8
+    por       mm2, mm0              ; l7 l6 l5 l4 l3 l2 l1 l0
+    movq      mm3, mm2
+    movq      mm4, mm2
+    movq      mm5, mm2
+    psrlq     mm2, 8
+    psrlq     mm3, 16
+    por       mm2, mm1              ; l7 l7 l6 l5 l4 l3 l2 l1
+    punpckhbw mm1, mm1              ; l7 l7 in the top word
+    por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
+    pavgb     mm4, mm2              ; rounded averages of adjacent left pixels
+    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 ; filtered values centred on mm2 (macro defined outside this hunk)
+
+    movq2dq   xmm0, mm4             ; averages -> low qword of xmm0
+    movq2dq   xmm1, mm1             ; filtered values -> low qword of xmm1
+    punpcklbw xmm0, xmm1            ; 16 interleaved (average, filtered) bytes
+
+    movhlps   xmm4, xmm0            ; row 4: high qword of the interleaved vector
+    pshuflw   xmm5, xmm4, 11111001b ; row 5: shifted down one word, top word replicated
+    pshuflw   xmm6, xmm4, 11111110b ; row 6: shifted down two words, top word replicated
+    pshuflw   xmm7, xmm4, 11111111b ; row 7: top word replicated everywhere
+%assign Y -4
+%rep 3
+    movq     [r0+Y*FDEC_STRIDE], xmm0   ; rows 0, 1, 2
+    psrldq    xmm0, 2                   ; step one byte pair per row
+%assign Y (Y+1)
+%endrep
+    movq     [r0+Y*FDEC_STRIDE], xmm0   ; row 3
+    movq     [r0+0*FDEC_STRIDE], xmm4   ; row 4
+    movq     [r0+1*FDEC_STRIDE], xmm5   ; row 5
+    movq     [r0+2*FDEC_STRIDE], xmm6   ; row 6
+    movq     [r0+3*FDEC_STRIDE], xmm7   ; row 7
+    RET
+
+
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 34a98d6d..a5665cae 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -38,6 +38,8 @@ extern void predict_8x8c_h_mmxext( uint8_t *src );
 extern void predict_8x8c_h_ssse3( uint8_t *src );
 extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
@@ -46,27 +48,45 @@ extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_filter_mmxext   ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+extern void predict_8x8_filter_ssse3   ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
 extern void predict_4x4_ddl_mmxext( uint8_t *src );
+extern void predict_4x4_ddr_mmxext( uint8_t *src );
 extern void predict_4x4_vl_mmxext( uint8_t *src );
+extern void predict_4x4_vr_mmxext( uint8_t *src );
+extern void predict_4x4_vr_ssse3( uint8_t *src );
+extern void predict_4x4_hd_mmxext( uint8_t *src );
+extern void predict_4x4_hd_ssse3( uint8_t *src );
 extern void predict_4x4_dc_mmxext( uint8_t *src );
+extern void predict_4x4_ddr_ssse3( uint8_t *src );
+extern void predict_4x4_hu_mmxext( uint8_t *src );
 extern void predict_16x16_dc_top_sse2( uint8_t *src );
 extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
 extern void predict_16x16_v_sse2( uint8_t *src );
 extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
 
+#define PREDICT_P_SUM(j,i) /* one weighted term of the H/V sums; needs H, V, src in scope; args are unparenthesised, pass plain integer constants */\
+    H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] ); /* row above: column j+i minus column j-i */\
+    V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); /* column to the left: row j+i minus row j-i */\
+
 #define PREDICT_16x16_P(name)\
 static void predict_16x16_p_##name( uint8_t *src )\
 {\
-    int a, b, c, i;\
+    int a, b, c;\
     int H = 0;\
     int V = 0;\
     int i00;\
-    for( i = 1; i <= 8; i++ )\
-    {\
-        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );\
-        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );\
-    }\
+    PREDICT_P_SUM(7,1) \
+    PREDICT_P_SUM(7,2) \
+    PREDICT_P_SUM(7,3) \
+    PREDICT_P_SUM(7,4) \
+    PREDICT_P_SUM(7,5) \
+    PREDICT_P_SUM(7,6) \
+    PREDICT_P_SUM(7,7) \
+    PREDICT_P_SUM(7,8) \
     a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
     b = ( 5 * H + 32 ) >> 6;\
     c = ( 5 * V + 32 ) >> 6;\
@@ -79,16 +99,15 @@ PREDICT_16x16_P( sse2   )
 
 static void predict_8x8c_p_mmxext( uint8_t *src )
 {
-    int a, b, c, i;
+    int a, b, c;
     int H = 0;
     int V = 0;
     int i00;
 
-    for( i = 1; i <= 4; i++ )
-    {
-        H += i * ( src[3+i - FDEC_STRIDE] - src[3-i - FDEC_STRIDE] );
-        V += i * ( src[(3+i)*FDEC_STRIDE -1] - src[(3-i)*FDEC_STRIDE -1] );
-    }
+    PREDICT_P_SUM(3,1)
+    PREDICT_P_SUM(3,2)
+    PREDICT_P_SUM(3,3)
+    PREDICT_P_SUM(3,4)
 
     a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
     b = ( 17 * H + 16 ) >> 5;
@@ -521,7 +540,7 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_CHROMA_H]       = predict_8x8c_h_ssse3;
 }
 
-void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
+void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -531,6 +550,9 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
     pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
     pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
     pf[I_PRED_8x8_VR]  = predict_8x8_vr_mmxext;
+    pf[I_PRED_8x8_HD]   = predict_8x8_hd_mmxext;
+    pf[I_PRED_8x8_HU]   = predict_8x8_hu_mmxext;
+    *predict_8x8_filter = predict_8x8_filter_mmxext;
 #ifdef ARCH_X86
     pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
     pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
@@ -540,6 +562,11 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
     pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
     pf[I_PRED_8x8_VL]  = predict_8x8_vl_sse2;
     pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
+    pf[I_PRED_8x8_HU]   = predict_8x8_hu_sse2;
+    if( !(cpu&X264_CPU_SSSE3) )
+        return;
+    pf[I_PRED_8x8_HD]   = predict_8x8_hd_ssse3;
+    *predict_8x8_filter = predict_8x8_filter_ssse3;
 }
 
 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
@@ -554,7 +581,16 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
 #endif
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+    pf[I_PRED_4x4_VR]  = predict_4x4_vr_mmxext;
     pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
     pf[I_PRED_4x4_VL]  = predict_4x4_vl_mmxext;
     pf[I_PRED_4x4_DC]  = predict_4x4_dc_mmxext;
+    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
+    pf[I_PRED_4x4_HD]  = predict_4x4_hd_mmxext;
+    pf[I_PRED_4x4_HU]  = predict_4x4_hu_mmxext;
+    if( !(cpu&X264_CPU_SSSE3) )
+        return;
+    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
+    pf[I_PRED_4x4_VR]  = predict_4x4_vr_ssse3;
+    pf[I_PRED_4x4_HD]  = predict_4x4_hd_ssse3;
 }
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 398f3076..70aff09f 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -27,6 +27,5 @@
 void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
 void x264_predict_8x8c_init_mmx  ( int cpu, x264_predict_t pf[7] );
 void x264_predict_4x4_init_mmx   ( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init_mmx   ( int cpu, x264_predict8x8_t pf[12] );
-
+void x264_predict_8x8_init_mmx   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
 #endif
diff --git a/encoder/analyse.c b/encoder/analyse.c
index b02b945f..62d51c5d 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -683,7 +683,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
-            x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
+            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 
             if( b_merged_satd && i_max == 9 )
             {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 3ef62f74..6aa69f12 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -727,7 +727,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     /* init CPU functions */
     x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
     x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
-    x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
+    x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
     if( !h->param.b_cabac );
         x264_init_vlc_tables();
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 29ddadd5..b88bf402 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -228,6 +228,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     x264_predict_t predict_8x8c[4+3];
     x264_predict_t predict_4x4[9+3];
     x264_predict8x8_t predict_8x8[9+3];
+    x264_predict_8x8_filter_t predict_8x8_filter;
     DECLARE_ALIGNED_16( uint8_t edge[33] );
     uint16_t cost_mv[32];
     int ret = 0, ok, used_asm;
@@ -238,9 +239,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
     x264_pixel_init( cpu_new, &pixel_asm );
     x264_predict_16x16_init( 0, predict_16x16 );
     x264_predict_8x8c_init( 0, predict_8x8c );
-    x264_predict_8x8_init( 0, predict_8x8 );
+    x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
     x264_predict_4x4_init( 0, predict_4x4 );
-    x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+    predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 
     // maximize sum
     for( i=0; i<256; i++ )
@@ -1294,30 +1295,32 @@ static int check_intra( int cpu_ref, int cpu_new )
     int ret = 0, ok = 1, used_asm = 0;
     int i;
     DECLARE_ALIGNED_16( uint8_t edge[33] );
+    DECLARE_ALIGNED_16( uint8_t edge2[33] );
     struct
     {
         x264_predict_t      predict_16x16[4+3];
         x264_predict_t      predict_8x8c[4+3];
         x264_predict8x8_t   predict_8x8[9+3];
         x264_predict_t      predict_4x4[9+3];
+        x264_predict_8x8_filter_t predict_8x8_filter;
     } ip_c, ip_ref, ip_a;
 
     x264_predict_16x16_init( 0, ip_c.predict_16x16 );
     x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
-    x264_predict_8x8_init( 0, ip_c.predict_8x8 );
+    x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
     x264_predict_4x4_init( 0, ip_c.predict_4x4 );
 
     x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
     x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
-    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8 );
+    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
     x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
 
     x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
     x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
-    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 );
+    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
 
-    x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+    ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 
 #define INTRA_TEST( name, dir, w, ... ) \
     if( ip_a.name[dir] != ip_ref.name[dir] )\
@@ -1361,6 +1364,23 @@ static int check_intra( int cpu_ref, int cpu_new )
     for( i = 0; i < 12; i++ )
         INTRA_TEST( predict_8x8, i, 8, edge );
 
+    set_func_name("intra_predict_8x8_filter");
+    if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
+    {
+        used_asm = 1; /* flag asm as exercised only when a different filter implementation is actually under test */
+        for( i = 0; i < 32; i++ ) /* bits 3-4 of i select i_neighbor (0x04/0x08), bits 0-2 select i_filters */
+        {
+            memcpy( edge2, edge, 33 ); /* start from identical buffers: the filter writes edge[] only for the requested parts */
+            call_c(ip_c.predict_8x8_filter, buf1+48, edge, (i&24)>>1, i&7);
+            call_a(ip_a.predict_8x8_filter, buf1+48, edge2, (i&24)>>1, i&7);
+            if( memcmp( edge, edge2, 33 ) )
+            {
+                fprintf( stderr, "predict_8x8_filter :  [FAILED] %d %d\n", (i&24)>>1, i&7);
+                ok = 0;
+            }
+        }
+    }
+
     report( "intra pred :" );
     return ret;
 }
-- 
2.40.0