From: Janne Grunau
Date: Fri, 25 Jul 2014 10:53:17 +0000 (+0100)
Subject: aarch64: implement x264_pixel_sa8d_satd_16x16_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d040d28514db7d1fbd5c3f06c37a77de14b15e5b;p=libx264

aarch64: implement x264_pixel_sa8d_satd_16x16_neon

~20% faster than calling pixel_sa8d_16x16 and pixel_satd_16x16
separately.
---

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 07e9a610..8c7b9279 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -803,7 +803,7 @@ endfunc
 
 function x264_pixel_sa8d_8x8_neon, export=1
     mov         x4,  x30
-    bl          x264_sa8d_8x8_neon
+    bl          pixel_sa8d_8x8_neon             // helper emitted by the sa8d_satd_8x8 macro with an empty satd argument
     add         v0.8h,  v0.8h,  v1.8h
     uaddlv      s0,  v0.8h
     mov         w0,  v0.s[0]
@@ -814,20 +814,20 @@ endfunc
 
 function x264_pixel_sa8d_16x16_neon, export=1
     mov         x4,  x30
-    bl          x264_sa8d_8x8_neon
+    bl          pixel_sa8d_8x8_neon             // sa8d-only variant of the shared 8x8 helper (formerly x264_sa8d_8x8_neon)
     uaddlp      v30.4s, v0.8h
     uaddlp      v31.4s, v1.8h
-    bl          x264_sa8d_8x8_neon
+    bl          pixel_sa8d_8x8_neon
     uadalp      v30.4s, v0.8h
     uadalp      v31.4s, v1.8h
     sub         x0,  x0,  x1,  lsl #4
     sub         x2,  x2,  x3,  lsl #4
     add         x0,  x0,  #8
     add         x2,  x2,  #8
-    bl          x264_sa8d_8x8_neon
+    bl          pixel_sa8d_8x8_neon
     uadalp      v30.4s, v0.8h
     uadalp      v31.4s, v1.8h
-    bl          x264_sa8d_8x8_neon
+    bl          pixel_sa8d_8x8_neon
     uadalp      v30.4s, v0.8h
     uadalp      v31.4s, v1.8h
     add         v0.4s,  v30.4s, v31.4s
@@ -838,13 +838,48 @@ function x264_pixel_sa8d_16x16_neon, export=1
     ret         x4
 endfunc
 
-function x264_sa8d_8x8_neon
+.macro sa8d_satd_8x8 satd=
+function pixel_sa8d_\satd\()8x8_neon            // expands to pixel_sa8d_8x8_neon (satd empty) or pixel_sa8d_satd_8x8_neon (satd=satd_)
     load_diff_fly_8x8
 
     SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
     SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
 
     HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h,  v1.8h, v2.8h, v3.8h
+.ifc \satd, satd_
+    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h  // satd_ variant only: work on copies in v0-v7 so v16-v23 stay intact for the sa8d code after .endif
+    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
+    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
+    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h
+
+    SUMSUB_AB   v24.8h, v25.8h, v0.8h,  v1.8h   // NOTE(review): SUMSUB_AB is defined elsewhere; presumably sum/difference butterflies - confirm
+    SUMSUB_AB   v26.8h, v27.8h, v2.8h,  v3.8h
+    SUMSUB_AB   v0.8h,  v1.8h,  v4.8h,  v5.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v6.8h,  v7.8h
+
+    transpose   v4.4s,  v6.4s,  v24.4s, v26.4s
+    transpose   v5.4s,  v7.4s,  v25.4s, v27.4s
+    transpose   v24.4s, v26.4s, v0.4s,  v2.4s
+    transpose   v25.4s, v27.4s, v1.4s,  v3.4s
+
+    abs         v0.8h,  v4.8h                   // per-lane absolute values of the eight transformed rows
+    abs         v1.8h,  v5.8h
+    abs         v2.8h,  v6.8h
+    abs         v3.8h,  v7.8h
+    abs         v4.8h,  v24.8h
+    abs         v5.8h,  v25.8h
+    abs         v6.8h,  v26.8h
+    abs         v7.8h,  v27.8h
+
+    umax        v0.8h,  v0.8h,  v2.8h           // keep the per-lane maximum of each pair of absolute values
+    umax        v1.8h,  v1.8h,  v3.8h
+    umax        v2.8h,  v4.8h,  v6.8h
+    umax        v3.8h,  v5.8h,  v7.8h
+
+    add         v26.8h, v0.8h,  v1.8h           // satd partial sums are left in v26/v27;
+    add         v27.8h, v2.8h,  v3.8h           // x264_pixel_sa8d_satd_16x16_neon widens them into v28/v29
+.endif
+
     SUMSUB_AB   v0.8h,  v16.8h, v16.8h, v20.8h
     SUMSUB_AB   v1.8h,  v17.8h, v17.8h, v21.8h
     SUMSUB_AB   v2.8h,  v18.8h, v18.8h, v22.8h
@@ -855,20 +890,20 @@ function x264_sa8d_8x8_neon
     transpose   v22.8h, v23.8h, v18.8h, v19.8h
     transpose   v6.8h,  v7.8h,  v2.8h,  v3.8h
 
-    SUMSUB_AB   v28.8h, v29.8h, v20.8h, v21.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v20.8h, v21.8h  // v2/v3 replace v28/v29 as temporaries: v28/v29 are the satd accumulators of the 16x16 caller
     SUMSUB_AB   v24.8h, v25.8h, v4.8h,  v5.8h
     SUMSUB_AB   v0.8h,  v1.8h,  v22.8h, v23.8h
-    SUMSUB_AB   v26.8h, v27.8h, v6.8h,  v7.8h
+    SUMSUB_AB   v4.8h,  v5.8h,  v6.8h,  v7.8h   // v4/v5 replace v26/v27, which carry the satd result in the satd_ variant
 
-    transpose   v20.4s, v22.4s, v28.4s, v0.4s
-    transpose   v21.4s, v23.4s, v29.4s, v1.4s
-    transpose   v16.4s, v18.4s, v24.4s, v26.4s
-    transpose   v17.4s, v19.4s, v25.4s, v27.4s
+    transpose   v20.4s, v22.4s, v2.4s,  v0.4s   // same transposes as before, reading the renamed temporaries
+    transpose   v21.4s, v23.4s, v3.4s,  v1.4s
+    transpose   v16.4s, v18.4s, v24.4s, v4.4s
+    transpose   v17.4s, v19.4s, v25.4s, v5.4s
 
     SUMSUB_AB   v0.8h,  v2.8h,  v20.8h, v22.8h
     SUMSUB_AB   v1.8h,  v3.8h,  v21.8h, v23.8h
-    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h,  v18.8h
-    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h,  v19.8h
+    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h, v18.8h  // operands identical to the removed line; only whitespace changes
+    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h, v19.8h  // operands identical to the removed line; only whitespace changes
 
     transpose   v16.2d, v20.2d, v0.2d,  v4.2d
     transpose   v17.2d, v21.2d, v1.2d,  v5.2d
@@ -894,7 +929,47 @@ function x264_sa8d_8x8_neon
 
     ret
 endfunc
+.endm
+
+sa8d_satd_8x8
+sa8d_satd_8x8 satd_
 
+function x264_pixel_sa8d_satd_16x16_neon, export=1 // returns one 64-bit value: rounded-halved sa8d sum in bits 0-31, satd sum in bits 32-63
+    mov         x4,  x30                        // bl overwrites x30, so keep the return address in x4 (used by ret x4)
+    bl          pixel_sa8d_satd_8x8_neon        // 1st 8x8 block: sa8d partials are read from v0/v1, satd partials from v26/v27
+    uaddlp      v30.4s, v0.8h                   // start the 32-bit sa8d accumulators (pairwise widening add of 16-bit lanes)
+    uaddlp      v31.4s, v1.8h
+    uaddlp      v28.4s, v26.8h                  // start the 32-bit satd accumulators
+    uaddlp      v29.4s, v27.8h
+    bl          pixel_sa8d_satd_8x8_neon        // 2nd 8x8 block; NOTE(review): presumably the callee advanced x0/x2 by 8 rows - confirm in load_diff_fly_8x8
+    uadalp      v30.4s, v0.8h                   // accumulate sa8d partials
+    uadalp      v31.4s, v1.8h
+    uadalp      v28.4s, v26.8h                  // accumulate satd partials
+    uadalp      v29.4s, v27.8h
+    sub         x0,  x0,  x1,  lsl #4           // x0 -= 16 * x1; assumes x1 is the row stride of the first block (prototype: uint8_t *, intptr_t) - TODO confirm
+    sub         x2,  x2,  x3,  lsl #4           // x2 -= 16 * x3; same for the second pointer/stride pair
+    add         x0,  x0,  #8                    // step both pointers 8 bytes to the right
+    add         x2,  x2,  #8
+    bl          pixel_sa8d_satd_8x8_neon        // 3rd 8x8 block
+    uadalp      v30.4s, v0.8h
+    uadalp      v31.4s, v1.8h
+    uadalp      v28.4s, v26.8h
+    uadalp      v29.4s, v27.8h
+    bl          pixel_sa8d_satd_8x8_neon        // 4th 8x8 block
+    uadalp      v30.4s, v0.8h
+    uadalp      v31.4s, v1.8h
+    uadalp      v28.4s, v26.8h
+    uadalp      v29.4s, v27.8h
+    add         v0.4s,  v30.4s, v31.4s          // sa8d
+    add         v1.4s,  v28.4s, v29.4s          // satd
+    addv        s0,  v0.4s                      // horizontal sum of the four 32-bit sa8d lanes
+    addv        s1,  v1.4s                      // horizontal sum of the four 32-bit satd lanes
+    urshr       v0.4s,  v0.4s,  #1              // rounding halve of the sa8d total: (sum + 1) >> 1
+    fmov        w0,  s0                         // w0 = sa8d result (a write to w0 clears the upper half of x0)
+    fmov        w1,  s1                         // w1 = satd result
+    add         x0,  x0,  x1,  lsl #32          // pack: x0 = sa8d + (satd << 32)
+    ret         x4
+endfunc
 
 .macro HADAMARD_AC w h
 function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 9c7768c4..d4097edd 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -49,6 +50,7 @@ DECL_X1( ssd, neon )
 
 int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); // sa8d in bits 0-31, satd in bits 32-63 (packed in pixel-a.S)
 
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index de79152f..421a67c9 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1422,6 +1422,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; // combined sa8d+satd routine for 16x16 blocks
 
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;