From 41227fa2531d9263e481b80237d2d9ef6f5a450f Mon Sep 17 00:00:00 2001 From: James Weaver Date: Tue, 7 Jan 2014 10:31:58 +0000 Subject: [PATCH] v210 input support Assembly based on code by Henrik Gramner and Loren Merritt. --- common/common.c | 2 +- common/frame.c | 18 ++++++++++- common/mc.c | 29 +++++++++++++++++ common/mc.h | 3 ++ common/x86/mc-a2.asm | 70 ++++++++++++++++++++++++++++++++++++++++++ common/x86/mc-c.c | 14 +++++++++ encoder/encoder.c | 2 +- filters/video/resize.c | 16 +++++++--- input/input.c | 3 +- input/raw.c | 7 +++-- tools/checkasm.c | 27 ++++++++++++++++ x264.c | 11 ++++--- x264.h | 15 ++++----- 13 files changed, 195 insertions(+), 22 deletions(-) diff --git a/common/common.c b/common/common.c index 4cdde07c..76e80b94 100644 --- a/common/common.c +++ b/common/common.c @@ -1144,7 +1144,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh }; int csp = i_csp & X264_CSP_MASK; - if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX ) + if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 ) return -1; x264_picture_init( pic ); pic->img.i_csp = i_csp; diff --git a/common/frame.c b/common/frame.c index 7e79fe9a..a8451810 100644 --- a/common/frame.c +++ b/common/frame.c @@ -53,6 +53,7 @@ static int x264_frame_internal_csp( int external_csp ) case X264_CSP_NV16: case X264_CSP_I422: case X264_CSP_YV16: + case X264_CSP_V210: return X264_CSP_NV16; case X264_CSP_I444: case X264_CSP_YV24: @@ -380,6 +381,12 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) } #endif + if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 ) + { + x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" ); + return -1; + } + dst->i_type = src->i_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = dst->i_reordered_pts = src->i_pts; @@ -392,7 +399,16 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) uint8_t *pix[3]; int stride[3]; - if ( i_csp 
>= X264_CSP_BGR ) + if( i_csp == X264_CSP_V210 ) + { + stride[0] = src->img.i_stride[0]; + pix[0] = src->img.plane[0]; + + h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0], + dst->plane[1], dst->i_stride[1], + (uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height ); + } + else if( i_csp >= X264_CSP_BGR ) { stride[0] = src->img.i_stride[0]; pix[0] = src->img.plane[0]; diff --git a/common/mc.c b/common/mc.c index 0fe0e61a..c7a544f6 100644 --- a/common/mc.c +++ b/common/mc.c @@ -336,6 +336,34 @@ static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta, } } +void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty, + pixel *dstc, intptr_t i_dstc, + uint32_t *src, intptr_t i_src, int w, int h ) +{ + for( int l = 0; l < h; l++ ) + { + pixel *dsty0 = dsty; + pixel *dstc0 = dstc; + uint32_t *src0 = src; + + for( int n = 0; n < w; n += 3 ) + { + *(dstc0++) = *src0 & 0x03FF; + *(dsty0++) = ( *src0 >> 10 ) & 0x03FF; + *(dstc0++) = ( *src0 >> 20 ) & 0x03FF; + src0++; + *(dsty0++) = *src0 & 0x03FF; + *(dstc0++) = ( *src0 >> 10 ) & 0x03FF; + *(dsty0++) = ( *src0 >> 20 ) & 0x03FF; + src0++; + } + + dsty += i_dsty; + dstc += i_dstc; + src += i_src; + } +} + static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ) { for( int y=0; yplane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c; pf->hpel_filter = hpel_filter; diff --git a/common/mc.h b/common/mc.h index edd3e327..054ba60e 100644 --- a/common/mc.h +++ b/common/mc.h @@ -93,6 +93,9 @@ typedef struct pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int 
pw, int w, int h ); + void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty, + pixel *dstc, intptr_t i_dstc, + uint32_t *src, intptr_t i_src, int w, int h ); void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, intptr_t i_stride, int i_width, int i_height, int16_t *buf ); diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 39b83555..e12e19cf 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -38,6 +38,13 @@ filt_mul51: times 16 db -5, 1 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 +v210_mask: times 4 dq 0xc00ffc003ff003ff +v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 +v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 +; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register +v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 + dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 + %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 @@ -1195,6 +1202,64 @@ cglobal load_deinterleave_chroma_fdec, 4,4 RET %endmacro ; PLANE_DEINTERLEAVE +%macro PLANE_DEINTERLEAVE_V210 0 +;----------------------------------------------------------------------------- +; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty, +; uint16_t *dstc, intptr_t i_dstc, +; uint32_t *src, intptr_t i_src, int w, int h ) +;----------------------------------------------------------------------------- +%if ARCH_X86_64 +cglobal plane_copy_deinterleave_v210, 8,10,7 +%define src r8 +%define org_w r9 +%define h r7d +%else +cglobal plane_copy_deinterleave_v210, 7,7,7 +%define src r4m +%define org_w r6m +%define h dword r7m +%endif + FIX_STRIDES r1, r3, r6d + shl r5, 2 + add r0, r6 + add r2, r6 + neg r6 + mov src, r4 + mov org_w, r6 + mova m2, [v210_mask] + mova m3, [v210_luma_shuf] 
+ mova m4, [v210_chroma_shuf] + mova m5, [v210_mult] ; also functions as vpermd index for avx2 + pshufd m6, m5, q1102 + +ALIGN 16 +.loop: + movu m1, [r4] + pandn m0, m2, m1 + pand m1, m2 + pshufb m0, m3 + pshufb m1, m4 + pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ + pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ +%if mmsize == 32 + vpermd m0, m5, m0 + vpermd m1, m5, m1 +%endif + movu [r0+r6], m0 + movu [r2+r6], m1 + add r4, mmsize + add r6, 3*mmsize/4 + jl .loop + add r0, r1 + add r2, r3 + add src, r5 + mov r4, src + mov r6, org_w + dec h + jg .loop + RET +%endmacro ; PLANE_DEINTERLEAVE_V210 + %if HIGH_BIT_DEPTH INIT_MMX mmx2 PLANE_INTERLEAVE @@ -1203,9 +1268,14 @@ PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE +INIT_XMM ssse3 +PLANE_DEINTERLEAVE_V210 INIT_XMM avx PLANE_INTERLEAVE PLANE_DEINTERLEAVE +PLANE_DEINTERLEAVE_V210 +INIT_YMM avx2 +PLANE_DEINTERLEAVE_V210 %else INIT_MMX mmx2 PLANE_INTERLEAVE diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index cbe2d3ac..b2807302 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -116,6 +116,15 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu, void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint16_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, 
int height ); void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); @@ -627,6 +636,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; @@ -639,6 +649,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; @@ -649,7 +660,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; if( cpu&X264_CPU_AVX2 ) + { pf->mc_luma = mc_luma_avx2; + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2; + } #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead diff --git a/encoder/encoder.c b/encoder/encoder.c index 2c7438ea..d7302963 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -467,7 +467,7 @@ static int x264_validate_parameters( x264_t *h, int b_open ) x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" ); return -1; } - else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 ) + else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" ); return -1; diff --git a/filters/video/resize.c b/filters/video/resize.c index 7480a149..79fc89ad 100644 --- 
a/filters/video/resize.c +++ b/filters/video/resize.c @@ -94,9 +94,12 @@ static void help( int longhelp ) for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) { - printf( "%s", x264_cli_csps[i].name ); - if( i+1 < X264_CSP_CLI_MAX ) - printf( ", " ); + if( x264_cli_csps[i].name ) + { + printf( "%s", x264_cli_csps[i].name ); + if( i+1 < X264_CSP_CLI_MAX ) + printf( ", " ); + } } printf( "\n" " - depth: 8 or 16 bits per pixel [keep current]\n" @@ -243,8 +246,11 @@ static int handle_opts( const char **optlist, char **opts, video_info_t *info, r if( strlen( str_csp ) == 0 ) csp = info->csp & X264_CSP_MASK; else - for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); ) - csp--; + for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- ) + { + if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) ) + break; + } FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp ); h->dst_csp = csp; if( depth == 16 ) diff --git a/input/input.c b/input/input.c index 9a99fa36..c6bb5ac2 100644 --- a/input/input.c +++ b/input/input.c @@ -42,7 +42,8 @@ const x264_cli_csp_t x264_cli_csps[] = { int x264_cli_csp_is_invalid( int csp ) { int csp_mask = csp & X264_CSP_MASK; - return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp & X264_CSP_OTHER; + return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || + csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER; } int x264_cli_csp_depth_factor( int csp ) diff --git a/input/raw.c b/input/raw.c index 0caf6f02..c4f14744 100644 --- a/input/raw.c +++ b/input/raw.c @@ -55,8 +55,11 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" ) if( opt->colorspace ) { - for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); ) - info->csp--; + 
for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- ) + { + if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) ) + break; + } FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace ); } else /* default */ diff --git a/tools/checkasm.c b/tools/checkasm.c index 2f6f2860..f181ff54 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1453,6 +1453,33 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "plane_copy :" ); + if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) + { + set_func_name( "plane_copy_deinterleave_v210" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t dst_stride = ALIGN( w, 16 ); + intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t); + intptr_t offv = dst_stride*h + 32; + memset( pbuf3, 0, 0x1000 ); + memset( pbuf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h ); + call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(uint16_t) ) || + memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + report( "v210 :" ); + if( mc_a.hpel_filter != mc_ref.hpel_filter ) { pixel *srchpel = pbuf1+8+2*64; diff --git a/x264.c b/x264.c index a32e43d9..79d14108 100644 --- a/x264.c +++ b/x264.c @@ -420,9 +420,12 @@ static void print_csp_names( int longhelp ) printf( INDENT ); for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) { - printf( "%s", x264_cli_csps[i].name ); - if( 
i+1 < X264_CSP_CLI_MAX ) - printf( ", " ); + if( x264_cli_csps[i].name ) + { + printf( "%s", x264_cli_csps[i].name ); + if( i+1 < X264_CSP_CLI_MAX ) + printf( ", " ); + } } #if HAVE_LAVF printf( "\n" ); @@ -1282,7 +1285,7 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info, int csp = info->csp & X264_CSP_MASK; if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) ) param->i_csp = X264_CSP_I420; - else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) ) + else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) ) param->i_csp = X264_CSP_I422; else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) ) param->i_csp = X264_CSP_I444; diff --git a/x264.h b/x264.h index 796fb530..625a2d9e 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 140 +#define X264_BUILD 141 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -215,12 +215,13 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; #define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */ #define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */ #define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_I444 0x0007 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x0008 /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x0009 /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000a /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000b /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000c /* end of list */ +#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2, three components packed per 32-bit word */ +#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000a /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */ +#define 
X264_CSP_RGB 0x000c /* packed rgb 24bits */ +#define X264_CSP_MAX 0x000d /* end of list */ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ -- 2.50.1