From 627f891c571cacb51deb5e211b23c309b14a6587 Mon Sep 17 00:00:00 2001 From: Yu Xiaolei Date: Fri, 6 Jun 2014 16:05:27 +0800 Subject: [PATCH] NV21 input support Eliminates an extra copy when encoding Android camera preview images. Checkasm test by Janne Grunau. ARM assembly with improvements from Janne Grunau. --- common/arm/mc-a.S | 24 ++++++++++++++++++++++++ common/arm/mc-c.c | 3 +++ common/common.c | 1 + common/frame.c | 7 +++++++ common/mc.c | 12 ++++++++++++ common/mc.h | 1 + encoder/encoder.c | 2 +- filters/video/depth.c | 3 ++- filters/video/resize.c | 1 + input/input.c | 1 + tools/checkasm.c | 26 ++++++++++++++++++++++++++ x264.h | 23 ++++++++++++----------- 12 files changed, 91 insertions(+), 13 deletions(-) diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index 0a949b8f..695a6ca8 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -1566,6 +1566,30 @@ blocki: pop {r4-r7, pc} endfunc +function x264_plane_copy_swap_neon + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + add lr, r4, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr, lsl #1 +1: + vld1.8 {q0, q1}, [r2]! + subs lr, lr, #16 + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vst1.8 {q0, q1}, [r0]! 
+ bgt 1b + + subs r5, r5, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r4 + bgt 1b + + pop {r4-r5, pc} +endfunc + function x264_store_interleave_chroma_neon push {lr} ldr lr, [sp, #4] diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index e1097be3..2633772e 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -57,6 +57,8 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -240,6 +242,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + pf->plane_copy_swap = x264_plane_copy_swap_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; diff --git a/common/common.c b/common/common.c index 28eec1f8..3153c7eb 100644 --- a/common/common.c +++ b/common/common.c @@ -1142,6 +1142,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_NV21] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, 
[X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, diff --git a/common/frame.c b/common/frame.c index 2e93c53e..203f48e6 100644 --- a/common/frame.c +++ b/common/frame.c @@ -47,6 +47,7 @@ static int x264_frame_internal_csp( int external_csp ) switch( external_csp & X264_CSP_MASK ) { case X264_CSP_NV12: + case X264_CSP_NV21: case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; @@ -435,6 +436,12 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } + else if( i_csp == X264_CSP_NV21 ) + { + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); + h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], + stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift ); + } else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16; diff --git a/common/mc.c b/common/mc.c index 4bb0bf32..e0483609 100644 --- a/common/mc.c +++ b/common/mc.c @@ -299,6 +299,17 @@ void x264_plane_copy_c( pixel *dst, intptr_t i_dst, } } +void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) +{ + for( int y = 0; y < h; y++, dst += i_dst, src += i_src ) + for( int x = 0; x < 2*w; x += 2 ) + { + dst[x] = src[x+1]; + dst[x+1] = src[x]; + } +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ) @@ -562,6 +573,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent ) pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; diff --git a/common/mc.h b/common/mc.h index a03a76b8..53aab379 100644 --- a/common/mc.h +++ b/common/mc.h @@ -88,6 +88,7 @@ typedef struct void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*plane_copy)( pixel *dst, intptr_t
i_dst, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); /* may write up to 15 pixels off the end of each plane */ diff --git a/encoder/encoder.c b/encoder/encoder.c index 9d25a77e..4450d709 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -492,7 +492,7 @@ static int x264_validate_parameters( x264_t *h, int b_open ) #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } diff --git a/filters/video/depth.c b/filters/video/depth.c index 88de95a8..c8001ce1 100644 --- a/filters/video/depth.c +++ b/filters/video/depth.c @@ -50,6 +50,7 @@ static int depth_filter_csp_is_supported( int csp ) csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16 || csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB || @@ -59,7 +60,7 @@ static int depth_filter_csp_is_supported( int csp ) static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : + return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : csp_mask == X264_CSP_BGRA ? 
4 : 1; diff --git a/filters/video/resize.c b/filters/video/resize.c index 958ed072..f348c5bb 100644 --- a/filters/video/resize.c +++ b/filters/video/resize.c @@ -156,6 +156,7 @@ static int convert_csp_to_pix_fmt( int csp ) case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA; /* the next csp has no equivalent 16bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12; + case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21; /* the next csp is no supported by swscale at all */ case X264_CSP_NV16: default: return AV_PIX_FMT_NONE; diff --git a/input/input.c b/input/input.c index 209028b6..fa7f296a 100644 --- a/input/input.c +++ b/input/input.c @@ -33,6 +33,7 @@ const x264_cli_csp_t x264_cli_csps[] = { [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, + [X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, diff --git a/tools/checkasm.c b/tools/checkasm.c index 0ef7d1d1..d620276d 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1413,6 +1413,32 @@ static int check_mc( int cpu_ref, int cpu_new ) } } + if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap ) + { + set_func_name( "plane_copy_swap" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t src_stride = plane_specs[i].src_stride; + intptr_t dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( 
mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h ); + call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) { set_func_name( "plane_copy_interleave" ); diff --git a/x264.h b/x264.h index 17495511..3d574ecb 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 146 +#define X264_BUILD 147 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -214,16 +214,17 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; #define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */ #define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */ #define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */ -#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */ -#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */ -#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000a /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000c /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000d /* end of list */ +#define X264_CSP_NV21 0x0004 /* yuv 4:2:0, with one y plane and one packed v+u */ +#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ +#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ +#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */ +#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 
packed in 32 */ +#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ +#define X264_CSP_MAX 0x000e /* end of list */ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ -- 2.40.0