From 5caef139cf7d6b41a95ee9568625d36d1ae1c107 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 26 Aug 2016 20:26:55 +0300 Subject: [PATCH] arm/aarch64: use plane_copy wrapper macros Move the macros to common/mc.h to share them across all architectures. Fixes possible buffer overreads if the width of the user supplied frames is not a multiple of 16. Reported-by: Kirill Batuzov --- common/aarch64/mc-a.S | 4 +- common/aarch64/mc-c.c | 13 +++--- common/arm/mc-a.S | 6 +-- common/arm/mc-c.c | 18 +++++---- common/mc.h | 92 +++++++++++++++++++++++++++++++++++++++++++ common/x86/mc-c.c | 89 ----------------------------------------- 6 files changed, 116 insertions(+), 106 deletions(-) diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S index 915d8c01..fe0f8704 100644 --- a/common/aarch64/mc-a.S +++ b/common/aarch64/mc-a.S @@ -1253,7 +1253,7 @@ load_deinterleave_chroma: ret endfunc -function x264_plane_copy_neon, export=1 +function x264_plane_copy_core_neon, export=1 add x8, x4, #15 and x4, x8, #~15 sub x1, x1, x4 @@ -1352,7 +1352,7 @@ function x264_plane_copy_deinterleave_rgb_neon, export=1 ret endfunc -function x264_plane_copy_interleave_neon, export=1 +function x264_plane_copy_interleave_core_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 sub x1, x1, x9, lsl #1 diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c index 717820f0..4f93965e 100644 --- a/common/aarch64/mc-c.c +++ b/common/aarch64/mc-c.c @@ -49,8 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); -void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, - pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -58,9 +58,9 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); -void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, - pixel *srcu, intptr_t i_srcu, - pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -206,6 +206,9 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); + +PLANE_COPY(16, neon) +PLANE_INTERLEAVE(neon) #endif // !HIGH_BIT_DEPTH PROPAGATE_LIST(neon) diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index 76295cd4..165c1fa9 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -1468,7 +1468,7 @@ function x264_load_deinterleave_chroma_fenc_neon bx lr endfunc -function x264_plane_copy_neon +function x264_plane_copy_core_neon push {r4,lr} ldr r4, [sp, #8] ldr lr, [sp, #12] @@ -1577,7 +1577,7 @@ block4: pop {r4-r8, r10, r11, pc} endfunc -function x264_plane_copy_interleave_neon +function x264_plane_copy_interleave_core_neon push {r4-r7, lr} ldrd r6, r7, [sp, #28] ldrd r4, r5, [sp, #20] @@ -1604,7 +1604,7 @@ blocki: pop {r4-r7, pc} endfunc -function x264_plane_copy_swap_neon +function x264_plane_copy_swap_core_neon push {r4-r5, lr} ldrd r4, r5, [sp, #12] add lr, r4, #15 diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index d330bc30..ae1a6861 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -48,8 +48,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); -void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, - pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -57,11 +57,11 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); -void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, - pixel *srcu, intptr_t i_srcu, - pixel *srcv, intptr_t i_srcv, int w, int h ); -void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, - pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -232,6 +232,10 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8 src += stride; } } + +PLANE_COPY(16, neon) +PLANE_COPY_SWAP(16, neon) +PLANE_INTERLEAVE(neon) #endif // !HIGH_BIT_DEPTH PROPAGATE_LIST(neon) diff --git a/common/mc.h b/common/mc.h index cebdb557..5a83ec22 100644 --- a/common/mc.h +++ b/common/mc.h @@ -100,6 +100,98 @@ static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, in }\ } +void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); + +#define PLANE_COPY(align, cpu)\ +static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align) / sizeof(pixel) - 1;\ + if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ + x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ + else if( !(w&c_w) )\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ + memcpy( dst, src, w*sizeof(pixel) );\ + }\ +} + +void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); + +#define PLANE_COPY_SWAP(align, cpu)\ +static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ + for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ + {\ + dst[x] = src[x+1];\ + dst[x+1] = src[x];\ + }\ + }\ + else\ + x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ +} + +void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); + +#define PLANE_INTERLEAVE(cpu) \ +static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ + pixel *srcu, intptr_t i_srcu,\ + pixel *srcv, intptr_t i_srcv, int w, int h )\ +{\ + int c_w = 16 / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ + {\ + if( --h > 0 )\ + {\ + if( i_srcu > 0 )\ + {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + srcu += i_srcu * h;\ + srcv += i_srcv * h;\ + }\ + else\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ + }\ + else\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ +} + struct x264_weight_t; typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int ); typedef struct x264_weight_t diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 11e481e1..21acdebd 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -88,10 +88,8 @@ void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); -void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); -void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -101,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst, void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); -void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, - pixel *srcu, intptr_t i_srcu, - pixel *srcv, intptr_t i_srcv, int w, int h ); void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -493,96 +488,12 @@ HPEL(32, avx2, avx2, avx2, avx2) #endif #endif // HIGH_BIT_DEPTH -#define PLANE_COPY(align, cpu)\ -static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ -{\ - int c_w = (align) / sizeof(pixel) - 1;\ - if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ - x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ - else if( !(w&c_w) )\ - x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ - else\ - {\ - if( --h > 0 )\ - {\ - if( i_src > 0 )\ - {\ - x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ - dst += i_dst * h;\ - src += i_src * h;\ - }\ - else\ - x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ - }\ - /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ - memcpy( dst, src, w*sizeof(pixel) );\ - }\ -} - PLANE_COPY(16, sse) PLANE_COPY(32, avx) -#define PLANE_COPY_SWAP(align, cpu)\ -static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ -{\ - int c_w = (align>>1) / sizeof(pixel) - 1;\ - if( !(w&c_w) )\ - x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ - else if( w > c_w )\ - {\ - if( --h > 0 )\ - {\ - if( i_src > 0 )\ - {\ - x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ - dst += i_dst * h;\ - src += i_src * h;\ - }\ - else\ - x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ - }\ - x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ - for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ - {\ - dst[x] = src[x+1];\ - dst[x+1] = src[x];\ - }\ - }\ - else\ - x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ -} - PLANE_COPY_SWAP(16, ssse3) PLANE_COPY_SWAP(32, avx2) -#define PLANE_INTERLEAVE(cpu) \ -static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ - pixel *srcu, intptr_t i_srcu,\ - pixel *srcv, intptr_t i_srcv, int w, int h )\ -{\ - int c_w = 16 / sizeof(pixel) - 1;\ - if( !(w&c_w) )\ - x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ - {\ - if( --h > 0 )\ - {\ - if( i_srcu > 0 )\ - {\ - x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ - dst += i_dst * h;\ - srcu += i_srcu * h;\ - srcv += i_srcv * h;\ - }\ - else\ - x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ - }\ - x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ - }\ - else\ - x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ -} - PLANE_INTERLEAVE(mmx2) PLANE_INTERLEAVE(sse2) #if HIGH_BIT_DEPTH -- 2.40.0