From 41227fa2531d9263e481b80237d2d9ef6f5a450f Mon Sep 17 00:00:00 2001
From: James Weaver <james.barrett@bbc.co.uk>
Date: Tue, 7 Jan 2014 10:31:58 +0000
Subject: [PATCH] v210 input support

Assembly based on code by Henrik Gramner and Loren Merritt.
---
 common/common.c        |  2 +-
 common/frame.c         | 18 ++++++++++-
 common/mc.c            | 29 +++++++++++++++++
 common/mc.h            |  3 ++
 common/x86/mc-a2.asm   | 70 ++++++++++++++++++++++++++++++++++++++++++
 common/x86/mc-c.c      | 14 +++++++++
 encoder/encoder.c      |  2 +-
 filters/video/resize.c | 16 +++++++---
 input/input.c          |  3 +-
 input/raw.c            |  7 +++--
 tools/checkasm.c       | 27 ++++++++++++++++
 x264.c                 | 11 ++++---
 x264.h                 | 15 ++++-----
 13 files changed, 195 insertions(+), 22 deletions(-)

diff --git a/common/common.c b/common/common.c
index 4cdde07c..76e80b94 100644
--- a/common/common.c
+++ b/common/common.c
@@ -1144,7 +1144,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
     };
 
     int csp = i_csp & X264_CSP_MASK;
-    if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
+    if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 )
         return -1;
     x264_picture_init( pic );
     pic->img.i_csp = i_csp;
diff --git a/common/frame.c b/common/frame.c
index 7e79fe9a..a8451810 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -53,6 +53,7 @@ static int x264_frame_internal_csp( int external_csp )
         case X264_CSP_NV16:
         case X264_CSP_I422:
         case X264_CSP_YV16:
+        case X264_CSP_V210:
             return X264_CSP_NV16;
         case X264_CSP_I444:
         case X264_CSP_YV24:
@@ -380,6 +381,12 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
     }
 #endif
 
+    if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
+    {
+        x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
+        return -1;
+    }
+
     dst->i_type     = src->i_type;
     dst->i_qpplus1  = src->i_qpplus1;
     dst->i_pts      = dst->i_reordered_pts = src->i_pts;
@@ -392,7 +399,16 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 
     uint8_t *pix[3];
     int stride[3];
-    if ( i_csp >= X264_CSP_BGR )
+    if( i_csp == X264_CSP_V210 )
+    {
+         stride[0] = src->img.i_stride[0];
+         pix[0] = src->img.plane[0];
+
+         h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
+                                             dst->plane[1], dst->i_stride[1],
+                                             (uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
+    }
+    else if( i_csp >= X264_CSP_BGR )
     {
          stride[0] = src->img.i_stride[0];
          pix[0] = src->img.plane[0];
diff --git a/common/mc.c b/common/mc.c
index 0fe0e61a..c7a544f6 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -336,6 +336,34 @@ static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
     }
 }
 
+void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
+                                          pixel *dstc, intptr_t i_dstc,
+                                          uint32_t *src, intptr_t i_src, int w, int h )
+{
+    for( int l = 0; l < h; l++ )
+    {
+        pixel *dsty0 = dsty;
+        pixel *dstc0 = dstc;
+        uint32_t *src0 = src;
+
+        for( int n = 0; n < w; n += 3 )
+        {
+            *(dstc0++) = *src0 & 0x03FF;
+            *(dsty0++) = ( *src0 >> 10 ) & 0x03FF;
+            *(dstc0++) = ( *src0 >> 20 ) & 0x03FF;
+            src0++;
+            *(dsty0++) = *src0 & 0x03FF;
+            *(dstc0++) = ( *src0 >> 10 ) & 0x03FF;
+            *(dsty0++) = ( *src0 >> 20 ) & 0x03FF;
+            src0++;
+        }
+
+        dsty += i_dsty;
+        dstc += i_dstc;
+        src  += i_src;
+    }
+}
+
 static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
 {
     for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
@@ -507,6 +535,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
     pf->plane_copy_interleave = x264_plane_copy_interleave_c;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
+    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
 
     pf->hpel_filter = hpel_filter;
 
diff --git a/common/mc.h b/common/mc.h
index edd3e327..054ba60e 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -93,6 +93,9 @@ typedef struct
                                      pixel *src,  intptr_t i_src, int w, int h );
     void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                          pixel *dstc, intptr_t i_dstc, pixel *src,  intptr_t i_src, int pw, int w, int h );
+    void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
+                                          pixel *dstc, intptr_t i_dstc,
+                                          uint32_t *src, intptr_t i_src, int w, int h );
     void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                          intptr_t i_stride, int i_width, int i_height, int16_t *buf );
 
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 39b83555..e12e19cf 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -38,6 +38,13 @@ filt_mul51: times 16 db -5, 1
 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
+v210_mask: times 4 dq 0xc00ffc003ff003ff
+v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
+v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
+; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
+v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
+           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
+
 %if HIGH_BIT_DEPTH
 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
@@ -1195,6 +1202,64 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     RET
 %endmacro ; PLANE_DEINTERLEAVE
 
+%macro PLANE_DEINTERLEAVE_V210 0
+;-----------------------------------------------------------------------------
+; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
+;                                         uint16_t *dstc, intptr_t i_dstc,
+;                                         uint32_t *src, intptr_t i_src, int w, int h )
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal plane_copy_deinterleave_v210, 8,10,7
+%define src   r8
+%define org_w r9
+%define h     r7d
+%else
+cglobal plane_copy_deinterleave_v210, 7,7,7
+%define src   r4m
+%define org_w r6m
+%define h     dword r7m
+%endif
+    FIX_STRIDES r1, r3, r6d
+    shl    r5, 2
+    add    r0, r6
+    add    r2, r6
+    neg    r6
+    mov   src, r4
+    mov org_w, r6
+    mova   m2, [v210_mask]
+    mova   m3, [v210_luma_shuf]
+    mova   m4, [v210_chroma_shuf]
+    mova   m5, [v210_mult] ; also functions as vpermd index for avx2
+    pshufd m6, m5, q1102
+
+ALIGN 16
+.loop:
+    movu   m1, [r4]
+    pandn  m0, m2, m1
+    pand   m1, m2
+    pshufb m0, m3
+    pshufb m1, m4
+    pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
+%if mmsize == 32
+    vpermd m0, m5, m0
+    vpermd m1, m5, m1
+%endif
+    movu [r0+r6], m0
+    movu [r2+r6], m1
+    add    r4, mmsize
+    add    r6, 3*mmsize/4
+    jl .loop
+    add    r0, r1
+    add    r2, r3
+    add   src, r5
+    mov    r4, src
+    mov    r6, org_w
+    dec     h
+    jg .loop
+    RET
+%endmacro ; PLANE_DEINTERLEAVE_V210
+
 %if HIGH_BIT_DEPTH
 INIT_MMX mmx2
 PLANE_INTERLEAVE
@@ -1203,9 +1268,14 @@ PLANE_DEINTERLEAVE
 INIT_XMM sse2
 PLANE_INTERLEAVE
 PLANE_DEINTERLEAVE
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE_V210
 INIT_XMM avx
 PLANE_INTERLEAVE
 PLANE_DEINTERLEAVE
+PLANE_DEINTERLEAVE_V210
+INIT_YMM avx2
+PLANE_DEINTERLEAVE_V210
 %else
 INIT_MMX mmx2
 PLANE_INTERLEAVE
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index cbe2d3ac..b2807302 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -116,6 +116,15 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
 void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
                                        uint16_t *dstv, intptr_t i_dstv,
                                        uint16_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
+                                              uint16_t *dstv, intptr_t i_dstv,
+                                              uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx  ( uint16_t *dstu, intptr_t i_dstu,
+                                              uint16_t *dstv, intptr_t i_dstv,
+                                              uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
+                                              uint16_t *dstv, intptr_t i_dstv,
+                                              uint32_t *src,  intptr_t i_src, int w, int h );
 void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@@ -627,6 +636,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
 
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
 
     if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -639,6 +649,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
     pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
     pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
+    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
     pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;
     pf->copy[PIXEL_16x16]            = x264_mc_copy_w16_aligned_avx;
 
@@ -649,7 +660,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
 
     if( cpu&X264_CPU_AVX2 )
+    {
         pf->mc_luma = mc_luma_avx2;
+        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
+    }
 #else // !HIGH_BIT_DEPTH
 
 #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2c7438ea..d7302963 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -467,7 +467,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
         x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
         return -1;
     }
-    else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
+    else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 )
     {
         x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
         return -1;
diff --git a/filters/video/resize.c b/filters/video/resize.c
index 7480a149..79fc89ad 100644
--- a/filters/video/resize.c
+++ b/filters/video/resize.c
@@ -94,9 +94,12 @@ static void help( int longhelp )
 
     for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
     {
-        printf( "%s", x264_cli_csps[i].name );
-        if( i+1 < X264_CSP_CLI_MAX )
-            printf( ", " );
+        if( x264_cli_csps[i].name )
+        {
+            printf( "%s", x264_cli_csps[i].name );
+            if( i+1 < X264_CSP_CLI_MAX )
+                printf( ", " );
+        }
     }
     printf( "\n"
             "               - depth: 8 or 16 bits per pixel [keep current]\n"
@@ -243,8 +246,11 @@ static int handle_opts( const char **optlist, char **opts, video_info_t *info, r
         if( strlen( str_csp ) == 0 )
             csp = info->csp & X264_CSP_MASK;
         else
-            for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); )
-                csp--;
+            for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- )
+            {
+                if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) )
+                    break;
+            }
         FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
         h->dst_csp = csp;
         if( depth == 16 )
diff --git a/input/input.c b/input/input.c
index 9a99fa36..c6bb5ac2 100644
--- a/input/input.c
+++ b/input/input.c
@@ -42,7 +42,8 @@ const x264_cli_csp_t x264_cli_csps[] = {
 int x264_cli_csp_is_invalid( int csp )
 {
     int csp_mask = csp & X264_CSP_MASK;
-    return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp & X264_CSP_OTHER;
+    return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX ||
+           csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER;
 }
 
 int x264_cli_csp_depth_factor( int csp )
diff --git a/input/raw.c b/input/raw.c
index 0caf6f02..c4f14744 100644
--- a/input/raw.c
+++ b/input/raw.c
@@ -55,8 +55,11 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" )
     if( opt->colorspace )
     {
-        for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); )
-            info->csp--;
+        for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- )
+        {
+            if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) )
+                break;
+        }
         FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
     }
     else /* default */
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2f6f2860..f181ff54 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1453,6 +1453,33 @@ static int check_mc( int cpu_ref, int cpu_new )
     }
     report( "plane_copy :" );
 
+    if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
+    {
+        set_func_name( "plane_copy_deinterleave_v210" );
+        used_asm = 1;
+        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+        {
+            int w = (plane_specs[i].w + 1) >> 1;
+            int h = plane_specs[i].h;
+            intptr_t dst_stride = ALIGN( w, 16 );
+            intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
+            intptr_t offv = dst_stride*h + 32;
+            memset( pbuf3, 0, 0x1000 );
+            memset( pbuf4, 0, 0x1000 );
+            call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
+            call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
+            for( int y = 0; y < h; y++ )
+                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(uint16_t) ) ||
+                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
+                {
+                    ok = 0;
+                    fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
+                    break;
+                }
+        }
+    }
+    report( "v210 :" );
+
     if( mc_a.hpel_filter != mc_ref.hpel_filter )
     {
         pixel *srchpel = pbuf1+8+2*64;
diff --git a/x264.c b/x264.c
index a32e43d9..79d14108 100644
--- a/x264.c
+++ b/x264.c
@@ -420,9 +420,12 @@ static void print_csp_names( int longhelp )
     printf( INDENT );
     for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
     {
-        printf( "%s", x264_cli_csps[i].name );
-        if( i+1 < X264_CSP_CLI_MAX )
-            printf( ", " );
+        if( x264_cli_csps[i].name )
+        {
+            printf( "%s", x264_cli_csps[i].name );
+            if( i+1 < X264_CSP_CLI_MAX )
+                printf( ", " );
+        }
     }
 #if HAVE_LAVF
     printf( "\n" );
@@ -1282,7 +1285,7 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
     int csp = info->csp & X264_CSP_MASK;
     if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
         param->i_csp = X264_CSP_I420;
-    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) )
         param->i_csp = X264_CSP_I422;
     else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
         param->i_csp = X264_CSP_I444;
diff --git a/x264.h b/x264.h
index 796fb530..625a2d9e 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 140
+#define X264_BUILD 141
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -215,12 +215,13 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
 #define X264_CSP_I422           0x0004  /* yuv 4:2:2 planar */
 #define X264_CSP_YV16           0x0005  /* yvu 4:2:2 planar */
 #define X264_CSP_NV16           0x0006  /* yuv 4:2:2, with one y plane and one packed u+v */
-#define X264_CSP_I444           0x0007  /* yuv 4:4:4 planar */
-#define X264_CSP_YV24           0x0008  /* yvu 4:4:4 planar */
-#define X264_CSP_BGR            0x0009  /* packed bgr 24bits   */
-#define X264_CSP_BGRA           0x000a  /* packed bgr 32bits   */
-#define X264_CSP_RGB            0x000b  /* packed rgb 24bits   */
-#define X264_CSP_MAX            0x000c  /* end of list */
+#define X264_CSP_V210           0x0007  /* 10-bit yuv 4:2:2 packed in 32 */
+#define X264_CSP_I444           0x0008  /* yuv 4:4:4 planar */
+#define X264_CSP_YV24           0x0009  /* yvu 4:4:4 planar */
+#define X264_CSP_BGR            0x000a  /* packed bgr 24bits   */
+#define X264_CSP_BGRA           0x000b  /* packed bgr 32bits   */
+#define X264_CSP_RGB            0x000c  /* packed rgb 24bits   */
+#define X264_CSP_MAX            0x000d  /* end of list */
 #define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
 #define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
 
-- 
2.50.1