From: George Stephanos <gaf.stephanos@gmail.com>
Date: Sun, 2 Jan 2011 16:26:10 +0000 (-0500)
Subject: SSE2 high bit depth intra_predict_(8x8c|16x16)_p
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=770718bc498bbc215c3f0876013de2b2b3c1db32;p=libx264

SSE2 high bit depth intra_predict_(8x8c|16x16)_p

Patch from Google Code-In.
---

diff --git a/common/predict.c b/common/predict.c
index 20a57e59..4e0a532a 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -119,7 +119,7 @@ void x264_predict_16x16_v_c( pixel *src )
         src += FDEC_STRIDE;
     }
 }
-static void x264_predict_16x16_p_c( pixel *src )
+void x264_predict_16x16_p_c( pixel *src )
 {
     int H = 0, V = 0;
 
@@ -269,7 +269,7 @@ void x264_predict_8x8c_v_c( pixel *src )
         src += FDEC_STRIDE;
     }
 }
-static void x264_predict_8x8c_p_c( pixel *src )
+void x264_predict_8x8c_p_c( pixel *src )
 {
     int H = 0, V = 0;
 
diff --git a/common/predict.h b/common/predict.h
index b6489cd0..c0543312 100644
--- a/common/predict.h
+++ b/common/predict.h
@@ -118,9 +118,11 @@ void x264_predict_4x4_v_c   ( pixel *src );
 void x264_predict_16x16_dc_c( pixel *src );
 void x264_predict_16x16_h_c ( pixel *src );
 void x264_predict_16x16_v_c ( pixel *src );
+void x264_predict_16x16_p_c ( pixel *src );
 void x264_predict_8x8c_dc_c ( pixel *src );
 void x264_predict_8x8c_h_c  ( pixel *src );
 void x264_predict_8x8c_v_c  ( pixel *src );
+void x264_predict_8x8c_p_c  ( pixel *src );
 
 void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
 void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 1f5fd7dd..a5eaaf73 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -47,6 +47,7 @@ cextern pw_4
 cextern pw_8
 cextern pw_ff00
 cextern pb_reverse
+cextern pw_pixel_max
 
 %macro STORE8x8 2
     add r0, 4*FDEC_STRIDEB
@@ -1010,11 +1011,11 @@ INIT_MMX
 PREDICT_8x8_VR mmxext, b, q , 8
 %endif
 
-%ifndef ARCH_X86_64
-INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
 cglobal predict_8x8c_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
     movq        mm1, mm2
@@ -1039,10 +1040,66 @@ ALIGN 4
     dec         r1d
     jg          .loop
     REP_RET
+%endif ; !ARCH_X86_64
+
+INIT_XMM
+cglobal predict_8x8c_p_core_sse2, 1,2
+    movd        m0, r1m       ; i00
+    movd        m2, r2m       ; b
+    movd        m4, r3m       ; c
+%ifdef HIGH_BIT_DEPTH
+    mova        m3, [pw_pixel_max] ; upper clip bound
+    pxor        m1, m1        ; lower clip bound (0)
+%endif
+    SPLATW      m0, m0, 0     ; broadcast i00 to all 8 words
+    SPLATW      m2, m2, 0     ; broadcast b
+    SPLATW      m4, m4, 0     ; broadcast c
+    pmullw      m2, [pw_76543210] ; m2 = {0*b, 1*b, ..., 7*b}
+%ifdef HIGH_BIT_DEPTH
+    mov        r1d, 8         ; row counter; r1 is the second GPR declared in cglobal
+.loop:
+    mova        m5, m0
+    paddsw      m5, m2        ; i00 + x*b + (accumulated) y*c for this row
+    psraw       m5, 5
+    CLIPW       m5, m1, m3    ; clip to [0, pw_pixel_max]
+    mova      [r0], m5        ; one row of 8 words
+    paddw       m2, m4        ; next row: += c (non-saturating)
+    add         r0, FDEC_STRIDEB
+    dec r1d
+    jg .loop
+%else ;!HIGH_BIT_DEPTH
+    paddsw      m0, m2        ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
+    mova        m3, m0
+    paddsw      m3, m4        ; m3 = m0 + c (the following row)
+    paddsw      m4, m4        ; m4 = 2*c (step between row pairs)
+call .loop                    ; first pass (4 rows) runs as a subroutine and returns here
+    add         r0, FDEC_STRIDE*4
+.loop:                        ; second pass falls through; its RET returns to the caller
+    mova        m5, m0
+    mova        m1, m3
+    psraw       m0, 5
+    psraw       m3, 5
+    packuswb    m0, m3        ; two rows of 8 bytes, unsigned-saturated
+    movq        [r0+FDEC_STRIDE*0], m0
+    movhps      [r0+FDEC_STRIDE*1], m0
+    paddsw      m5, m4
+    paddsw      m1, m4
+    mova        m0, m5
+    mova        m3, m1
+    psraw       m5, 5
+    psraw       m1, 5
+    packuswb    m5, m1
+    movq        [r0+FDEC_STRIDE*2], m5
+    movhps      [r0+FDEC_STRIDE*3], m5
+    paddsw      m0, m4        ; advance both row accumulators for the next pass
+    paddsw      m3, m4
+%endif ;!HIGH_BIT_DEPTH
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
 cglobal predict_16x16_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
     movq        mm5, mm2
@@ -1081,9 +1138,73 @@ ALIGN 4
     dec         r1d
     jg          .loop
     REP_RET
-
 %endif ; !ARCH_X86_64
 
+INIT_XMM
+cglobal predict_16x16_p_core_sse2, 1,2,8
+    movd     m0, r1m ; i00
+    movd     m1, r2m ; b
+    movd     m2, r3m ; c
+%ifdef HIGH_BIT_DEPTH
+    pxor     m6, m6  ; running row offset, starts at 0 and gains c per row
+    pxor     m7, m7  ; NOTE(review): m7 is reloaded from m3 at the top of .loop before it is read
+%endif
+    SPLATW   m0, m0, 0 ; broadcast i00 to all 8 words
+    SPLATW   m1, m1, 0 ; broadcast b
+    SPLATW   m2, m2, 0 ; broadcast c
+    mova     m3, m1
+    pmullw   m3, [pw_76543210] ; m3 = {0*b, 1*b, ..., 7*b}
+    psllw    m1, 3   ; m1 = 8*b, the offset from columns 0-7 to columns 8-15
+%ifdef HIGH_BIT_DEPTH
+    mov     r1d, 16  ; 16 rows, one per iteration
+.loop:
+    mova     m4, m0
+    mova     m5, m0
+    mova     m7, m3
+    paddsw   m7, m6  ; x*b + row offset for columns 0-7
+    paddsw   m4, m7  ; + i00
+    paddsw   m7, m1  ; same for columns 8-15
+    paddsw   m5, m7
+    psraw    m4, 5
+    psraw    m5, 5
+    CLIPW    m4, [pb_0], [pw_pixel_max] ; bounds come from the pb_0 and pw_pixel_max constants
+    CLIPW    m5, [pb_0], [pw_pixel_max]
+    mova   [r0], m4  ; first 8 words of the row
+    mova [r0+16], m5 ; next 8 words of the row
+    add      r0, FDEC_STRIDEB
+    paddw    m6, m2  ; row offset += c (non-saturating)
+    dec      r1d
+    jg       .loop
+%else ;!HIGH_BIT_DEPTH
+    paddsw   m0, m3  ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
+    paddsw   m1, m0  ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
+    mova     m7, m2
+    paddsw   m7, m7  ; m7 = 2*c, the step between row pairs
+    mov     r1d, 8   ; 8 iterations, two rows each
+ALIGN 4
+.loop:
+    mova     m3, m0
+    mova     m4, m1
+    mova     m5, m0
+    mova     m6, m1
+    psraw    m3, 5
+    psraw    m4, 5
+    paddsw   m5, m2  ; second row of the pair = first row + c
+    paddsw   m6, m2
+    psraw    m5, 5
+    psraw    m6, 5
+    packuswb m3, m4  ; 16 bytes for the first row, unsigned-saturated
+    packuswb m5, m6  ; 16 bytes for the second row
+    mova [r0+FDEC_STRIDE*0], m3
+    mova [r0+FDEC_STRIDE*1], m5
+    paddsw   m0, m7  ; advance both halves by two rows
+    paddsw   m1, m7
+    add      r0, FDEC_STRIDE*2
+    dec      r1d
+    jg       .loop
+%endif ;!HIGH_BIT_DEPTH
+    REP_RET
+
 INIT_XMM
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
@@ -1574,91 +1695,6 @@ cglobal predict_8x8c_dc_top_mmxext, 1,1
     RET
 
 %endif
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-
-cglobal predict_8x8c_p_core_sse2, 1,1
-    movd        xmm0, r1m
-    movd        xmm2, r2m
-    movd        xmm4, r3m
-    pshuflw     xmm0, xmm0, 0
-    pshuflw     xmm2, xmm2, 0
-    pshuflw     xmm4, xmm4, 0
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm4, xmm4
-    pmullw      xmm2, [pw_76543210]
-    paddsw      xmm0, xmm2        ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
-    movdqa      xmm3, xmm0
-    paddsw      xmm3, xmm4
-    paddsw      xmm4, xmm4
-call .loop
-    add           r0, FDEC_STRIDE*4
-.loop:
-    movdqa      xmm5, xmm0
-    movdqa      xmm1, xmm3
-    psraw       xmm0, 5
-    psraw       xmm3, 5
-    packuswb    xmm0, xmm3
-    movq        [r0+FDEC_STRIDE*0], xmm0
-    movhps      [r0+FDEC_STRIDE*1], xmm0
-    paddsw      xmm5, xmm4
-    paddsw      xmm1, xmm4
-    movdqa      xmm0, xmm5
-    movdqa      xmm3, xmm1
-    psraw       xmm5, 5
-    psraw       xmm1, 5
-    packuswb    xmm5, xmm1
-    movq        [r0+FDEC_STRIDE*2], xmm5
-    movhps      [r0+FDEC_STRIDE*3], xmm5
-    paddsw      xmm0, xmm4
-    paddsw      xmm3, xmm4
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2,8
-    movd        xmm0, r1m
-    movd        xmm1, r2m
-    movd        xmm2, r3m
-    pshuflw     xmm0, xmm0, 0
-    pshuflw     xmm1, xmm1, 0
-    pshuflw     xmm2, xmm2, 0
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    movdqa      xmm3, xmm1
-    pmullw      xmm3, [pw_76543210]
-    psllw       xmm1, 3
-    paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
-    paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
-    movdqa      xmm7, xmm2
-    paddsw      xmm7, xmm7
-    mov         r1d, 8
-ALIGN 4
-.loop:
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm1
-    movdqa      xmm5, xmm0
-    movdqa      xmm6, xmm1
-    psraw       xmm3, 5
-    psraw       xmm4, 5
-    paddsw      xmm5, xmm2
-    paddsw      xmm6, xmm2
-    psraw       xmm5, 5
-    psraw       xmm6, 5
-    packuswb    xmm3, xmm4
-    packuswb    xmm5, xmm6
-    movdqa      [r0+FDEC_STRIDE*0], xmm3
-    movdqa      [r0+FDEC_STRIDE*1], xmm5
-    paddsw      xmm0, xmm7
-    paddsw      xmm1, xmm7
-    add         r0, FDEC_STRIDE*2
-    dec         r1d
-    jg          .loop
-    REP_RET
 
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_v( pixel *src )
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 72b89a53..5de5598b 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -42,9 +42,9 @@
  void x264_predict_16x16_dc_top_sse2( pixel *src );
  void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
  void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
  void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
  void x264_predict_8x8c_dc_mmxext( pixel *src );
  void x264_predict_8x8c_dc_sse2( uint16_t *src );
  void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
@@ -127,17 +127,20 @@ static void x264_predict_16x16_dc_left_##name( pixel *src )\
 PREDICT_16x16_DC_LEFT( mmxext )
 PREDICT_16x16_DC_LEFT( sse2 )
 
-#if !HIGH_BIT_DEPTH
-ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
-
 #define PREDICT_P_SUM(j,i)\
     H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] );\
     V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
 
+ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+
+#if !HIGH_BIT_DEPTH
 #define PREDICT_16x16_P(name)\
-static void x264_predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( pixel *src )\
 {\
     int a, b, c;\
     int H = 0;\
@@ -157,17 +160,37 @@ static void x264_predict_16x16_p_##name( uint8_t *src )\
     i00 = a - b * 7 - c * 7 + 16;\
     x264_predict_16x16_p_core_##name( src, i00, b, c );\
 }
-
 #ifndef ARCH_X86_64
 PREDICT_16x16_P( mmxext )
 #endif
 PREDICT_16x16_P( sse2   )
+#endif //!HIGH_BIT_DEPTH
 
 #ifdef __GNUC__
+#if HIGH_BIT_DEPTH
+static void x264_predict_16x16_p_sse2( uint16_t *src )
+#else
 static void x264_predict_16x16_p_ssse3( uint8_t *src )
+#endif
 {
     int a, b, c, i00;
     int H, V;
+#if HIGH_BIT_DEPTH
+    asm (
+        "movdqu        -2+%1, %%xmm1 \n"
+        "movdqa        16+%1, %%xmm0 \n"
+        "pmaddwd          %2, %%xmm0 \n"
+        "pmaddwd          %3, %%xmm1 \n"
+        "paddd        %%xmm1, %%xmm0 \n"
+        "movhlps      %%xmm0, %%xmm1 \n"
+        "paddd        %%xmm1, %%xmm0 \n"
+        "pshuflw $14, %%xmm0, %%xmm1 \n"
+        "paddd        %%xmm1, %%xmm0 \n"
+        "movd         %%xmm0, %0     \n"
+        :"=r"(H)
+        :"m"(src[-FDEC_STRIDE]), "m"(*pw_12345678), "m"(*pw_m87654321)
+    );
+#else
     asm (
         "movq           %1, %%mm1 \n"
         "movq         8+%1, %%mm0 \n"
@@ -184,6 +207,7 @@ static void x264_predict_16x16_p_ssse3( uint8_t *src )
         :"=r"(H)
         :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
     );
+#endif
     V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
       + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
       + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
@@ -196,10 +220,17 @@ static void x264_predict_16x16_p_ssse3( uint8_t *src )
     b = ( 5 * H + 32 ) >> 6;
     c = ( 5 * V + 32 ) >> 6;
     i00 = a - b * 7 - c * 7 + 16;
-    x264_predict_16x16_p_core_sse2( src, i00, b, c );
+    /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case
+     * than to try to consider it in the asm. */
+    if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )
+        x264_predict_16x16_p_c( src );
+    else
+        x264_predict_16x16_p_core_sse2( src, i00, b, c );
 }
 #endif
 
+#if !HIGH_BIT_DEPTH
+
 #define PREDICT_8x8_P(name)\
 static void x264_predict_8x8c_p_##name( uint8_t *src )\
 {\
@@ -217,17 +248,35 @@ static void x264_predict_8x8c_p_##name( uint8_t *src )\
     i00 = a -3*b -3*c + 16;\
     x264_predict_8x8c_p_core_##name( src, i00, b, c );\
 }
-
 #ifndef ARCH_X86_64
 PREDICT_8x8_P( mmxext )
 #endif
 PREDICT_8x8_P( sse2   )
 
+#endif //!HIGH_BIT_DEPTH
+
 #ifdef __GNUC__
+#if HIGH_BIT_DEPTH
+static void x264_predict_8x8c_p_sse2( uint16_t *src )
+#else
 static void x264_predict_8x8c_p_ssse3( uint8_t *src )
+#endif
 {
     int a, b, c, i00;
     int H, V;
+#if HIGH_BIT_DEPTH
+    asm (
+        "movdqa           %1, %%xmm0 \n"
+        "pmaddwd          %2, %%xmm0 \n"
+        "movhlps      %%xmm0, %%xmm1 \n"
+        "paddd        %%xmm1, %%xmm0 \n"
+        "pshuflw $14, %%xmm0, %%xmm1 \n"
+        "paddd        %%xmm1, %%xmm0 \n"
+        "movd         %%xmm0, %0     \n"
+        :"=r"(H)
+        :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)
+    );
+#else
     asm (
         "movq           %1, %%mm0 \n"
         "pmaddubsw      %2, %%mm0 \n"
@@ -240,6 +289,7 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
         :"=r"(H)
         :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
     );
+#endif
     V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
       + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
       + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
@@ -249,10 +299,15 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
     b = ( 17 * H + 16 ) >> 5;
     c = ( 17 * V + 16 ) >> 5;
     i00 = a -3*b -3*c + 16;
-    x264_predict_8x8c_p_core_sse2( src, i00, b, c );
+    /* b*7 + c*7 can overflow: it's easier to just branch away in this rare case
+     * than to try to consider it in the asm. */
+    if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 2340 || abs(c) > 2340) )
+        x264_predict_8x8c_p_c( src );
+    else
+        x264_predict_8x8c_p_core_sse2( src, i00, b, c );
 }
 #endif
-
+#if !HIGH_BIT_DEPTH
 #if ARCH_X86_64
 static void x264_predict_8x8c_dc_left( uint8_t *src )
 {
@@ -360,6 +415,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
     pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse2;
     pf[I_PRED_16x16_H]       = x264_predict_16x16_h_sse2;
+    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
 #else
 #if !ARCH_X86_64
     pf[I_PRED_16x16_P]       = x264_predict_16x16_p_mmxext;
@@ -397,6 +453,7 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_sse2;
     pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_sse2;
     pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_sse2;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
 #else
 #if ARCH_X86_64
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index a0e74c75..c552ab9b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1694,6 +1694,7 @@ static int check_intra( int cpu_ref, int cpu_new )
     int ret = 0, ok = 1, used_asm = 0;
     ALIGNED_16( pixel edge[33] );
     ALIGNED_16( pixel edge2[33] );
+    ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
     struct
     {
         x264_predict_t      predict_16x16[4+3];
@@ -1718,18 +1719,20 @@ static int check_intra( int cpu_ref, int cpu_new )
     x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
 
-    ip_c.predict_8x8_filter( pbuf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+    memcpy( fdec, pbuf1, FDEC_STRIDE*20 * sizeof(pixel) ); /* fdec is declared with FDEC_STRIDE*20 pixels; copy exactly that many */
 
-#define INTRA_TEST( name, dir, w, ... )\
+    ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+
+#define INTRA_TEST( name, dir, w, bench, ... )\
     if( ip_a.name[dir] != ip_ref.name[dir] )\
     {\
         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
         used_asm = 1;\
-        memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\
-        memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\
-        call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
-        call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
-        if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\
+        memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
+        memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
+        call_c##bench( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
+        call_a##bench( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
+        if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
         {\
             fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
             ok = 0;\
@@ -1740,7 +1743,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "%2x ", edge[14-j] );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", pbuf4[48+k+j*32] );\
+                    printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
                 printf( "\n" );\
             }\
             printf( "\n" );\
@@ -1748,20 +1751,20 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "   " );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", pbuf3[48+k+j*32] );\
+                    printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
                 printf( "\n" );\
             }\
         }\
     }
 
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_4x4, i, 4 );
+        INTRA_TEST(   predict_4x4, i,  4, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_8x8c, i, 8 );
+        INTRA_TEST(  predict_8x8c, i,  8, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_16x16, i, 16 );
+        INTRA_TEST( predict_16x16, i, 16, );
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_8x8, i, 8, edge );
+        INTRA_TEST(   predict_8x8, i,  8, , edge );
 
     set_func_name("intra_predict_8x8_filter");
     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
@@ -1780,6 +1783,32 @@ static int check_intra( int cpu_ref, int cpu_new )
         }
     }
 
+#define EXTREMAL_PLANE(size) /* fill the top row and left column around fdec+48 in seven segments; reads `i` and `test` from the enclosing scope */ \
+    { \
+        int max[7]; /* one value per segment */ \
+        for( int j = 0; j < 7; j++ ) \
+            max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; /* test==0 uses PIXEL_MAX, later passes use random values */ \
+        fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; /* bit 0: top-left corner */ \
+        for( int j = 0; j < size/2; j++ ) \
+            fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; /* bit 1: first half of the top row */ \
+        for( int j = size/2; j < size-1; j++ ) \
+            fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; /* bit 2: second half of the top row, except its last pixel */ \
+        fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; /* bit 3: last pixel of the top row */ \
+        for( int j = 0; j < size/2; j++ ) \
+            fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; /* bit 4: first half of the left column */ \
+        for( int j = size/2; j < size-1; j++ ) \
+            fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; /* bit 5: second half of the left column, except its last pixel */ \
+        fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; /* bit 6: last pixel of the left column */ \
+    }
+    /* Extremal test case for planar prediction. */
+    for( int test = 0; test < 100 && ok; test++ )
+        for( int i = 0; i < 128 && ok; i++ )
+        {
+            EXTREMAL_PLANE(  8 );
+            INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8, 1 );
+            EXTREMAL_PLANE( 16 );
+            INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 1 );
+        }
     report( "intra pred :" );
     return ret;
 }