]> granicus.if.org Git - libx264/commitdiff
aarch64: pixel: add 10bits sad functions
author Hubert Mazur <hum@semihalf.com>
Thu, 6 Oct 2022 11:37:53 +0000 (11:37 +0000)
committer Hubert Mazur <hum@semihalf.com>
Fri, 28 Oct 2022 07:11:57 +0000 (07:11 +0000)
Provide routines for sad functions for high bit depth, i.e. 10 bits.
Benchmarks run on AWS Graviton 2 instances.

sad_4x4_c: 583
sad_4x4_neon: 273
sad_4x8_c: 1179
sad_4x8_neon: 366
sad_4x16_c: 2121
sad_4x16_neon: 550
sad_8x4_c: 924
sad_8x4_neon: 213
sad_8x8_c: 1711
sad_8x8_neon: 316
sad_8x16_c: 3505
sad_8x16_neon: 497
sad_16x8_c: 3070
sad_16x8_neon: 635
sad_16x16_c: 6113
sad_16x16_neon: 1118

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
common/aarch64/pixel-a.S
common/aarch64/pixel.h
common/pixel.c

index e3bc12d4ed9f68a96fbb579a59f46dc2c7a3c3e9..b75a81f7a321731bd98b9bb7cbefbd29019731f1 100644 (file)
@@ -40,6 +40,7 @@ const mask_ac_4_8
 .short 0, -1, -1, -1, -1, -1, -1, -1
 endconst
 
+#if BIT_DEPTH == 8
 .macro SAD_START_4
     ld1        {v1.s}[0], [x2], x3
     ld1        {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
 endfunc
 .endm
 
+#else /* BIT_DEPTH == 8 */
+
+.macro SAD_START_4                         // 4 halfwords per row: start the 32-bit SAD sums in v16/v18 from two rows
+    lsl        x1, x1, #1                  // x1 *= 2; presumably turns a stride counted in 16-bit pixels into bytes (confirm with callers)
+    lsl        x3, x3, #1                  // same scaling for x3
+    ld1        {v1.d}[0], [x2], x3         // row 0 at x2 -> low half of v1, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1         // row 0 at x0 -> low half of v0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3         // row 1 at x2 -> high half of v1
+    ld1        {v0.d}[1], [x0], x1         // row 1 at x0 -> high half of v0
+    uabdl       v16.4s,  v0.4h,  v1.4h     // v16 = |v0 - v1| over the low halves (row 0), widened to 32 bit
+    uabdl2      v18.4s,  v0.8h,  v1.8h     // v18 = |v0 - v1| over the high halves (row 1), widened to 32 bit
+.endm
+
+.macro SAD_4                               // 4 halfwords per row: add two more rows to v16/v18; x1/x3 are used as-is (SAD_START_4 already doubled them)
+    ld1        {v1.d}[0], [x2], x3         // next row at x2 -> low half of v1, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1         // next row at x0 -> low half of v0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3         // following row at x2 -> high half of v1
+    ld1        {v0.d}[1], [x0], x1         // following row at x0 -> high half of v0
+    uabal       v16.4s,  v0.4h,  v1.4h     // v16 += |v0 - v1| over the low halves, widened to 32 bit
+    uabal2      v18.4s,  v0.8h,  v1.8h     // v18 += |v0 - v1| over the high halves, widened to 32 bit
+.endm
+
+.macro SAD_START_8                         // 8 halfwords per row: start the 32-bit SAD sums in v16-v19 from two rows
+    lsl         x1, x1, #1                 // x1 *= 2; presumably turns a stride counted in 16-bit pixels into bytes (confirm with callers)
+    lsl         x3, x3, #1                 // same scaling for x3
+    ld1         {v1.8h}, [x2], x3          // row 0 at x2 -> v1, then x2 += x3
+    ld1         {v0.8h}, [x0], x1          // row 0 at x0 -> v0, then x0 += x1
+    ld1         {v3.8h}, [x2], x3          // row 1 at x2 -> v3
+    ld1         {v2.8h}, [x0], x1          // row 1 at x0 -> v2
+    uabdl       v16.4s,  v0.4h,  v1.4h     // v16 = |row 0 difference|, lanes 0-3, widened to 32 bit
+    uabdl2      v17.4s,  v0.8h,  v1.8h     // v17 = |row 0 difference|, lanes 4-7
+    uabdl       v18.4s,  v2.4h,  v3.4h     // v18 = |row 1 difference|, lanes 0-3
+    uabdl2      v19.4s,  v2.8h,  v3.8h     // v19 = |row 1 difference|, lanes 4-7
+.endm
+
+.macro SAD_8                               // 8 halfwords per row: add two more rows to v16-v19; x1/x3 are used as-is (SAD_START_8 already doubled them)
+    ld1         {v1.8h}, [x2], x3          // next row at x2 -> v1, then x2 += x3
+    ld1         {v0.8h}, [x0], x1          // next row at x0 -> v0, then x0 += x1
+    ld1         {v3.8h}, [x2], x3          // following row at x2 -> v3
+    ld1         {v2.8h}, [x0], x1          // following row at x0 -> v2
+    uabal       v16.4s,  v0.4h,  v1.4h     // v16 += |v0 - v1|, lanes 0-3, widened to 32 bit
+    uabal2      v17.4s,  v0.8h,  v1.8h     // v17 += |v0 - v1|, lanes 4-7
+    uabal       v18.4s,  v2.4h,  v3.4h     // v18 += |v2 - v3|, lanes 0-3
+    uabal2      v19.4s,  v2.8h,  v3.8h     // v19 += |v2 - v3|, lanes 4-7
+.endm
+
+.macro SAD_START_16                        // 16 halfwords per row: start the 32-bit SAD sums in v16-v23 from two rows
+    lsl         x1, x1, #1                 // x1 *= 2; presumably turns a stride counted in 16-bit pixels into bytes (confirm with callers)
+    lsl         x3, x3, #1                 // same scaling for x3
+    ld2         {v0.8h, v1.8h}, [x2], x3   // row 0 at x2, de-interleaved: even halfwords -> v0, odd -> v1; x2 += x3
+    ld2         {v2.8h, v3.8h}, [x0], x1   // row 0 at x0, split the same way, so lanes still pair matching positions; x0 += x1
+    ld2         {v4.8h, v5.8h}, [x2], x3   // row 1 at x2 -> v4 (even) / v5 (odd)
+    ld2         {v6.8h, v7.8h}, [x0], x1   // row 1 at x0 -> v6 (even) / v7 (odd)
+    uabdl       v16.4s,  v0.4h,  v2.4h     // row 0, even positions, low half
+    uabdl2      v17.4s,  v0.8h,  v2.8h     // row 0, even positions, high half
+    uabdl       v20.4s,  v1.4h,  v3.4h     // row 0, odd positions, low half
+    uabdl2      v21.4s,  v1.8h,  v3.8h     // row 0, odd positions, high half
+    uabdl       v18.4s,  v4.4h,  v6.4h     // row 1, even positions, low half
+    uabdl2      v19.4s,  v4.8h,  v6.8h     // row 1, even positions, high half
+    uabdl       v22.4s,  v5.4h,  v7.4h     // row 1, odd positions, low half
+    uabdl2      v23.4s,  v5.8h,  v7.8h     // row 1, odd positions, high half
+.endm
+
+.macro SAD_16                              // 16 halfwords per row: add two more rows to v16-v23; x1/x3 are used as-is (SAD_START_16 already doubled them)
+    ld2         {v0.8h, v1.8h}, [x2], x3   // next row at x2, de-interleaved: even halfwords -> v0, odd -> v1; x2 += x3
+    ld2         {v2.8h, v3.8h}, [x0], x1   // next row at x0, split the same way; x0 += x1
+    ld2         {v4.8h, v5.8h}, [x2], x3   // following row at x2 -> v4 (even) / v5 (odd)
+    ld2         {v6.8h, v7.8h}, [x0], x1   // following row at x0 -> v6 (even) / v7 (odd)
+    uabal       v16.4s,  v0.4h,  v2.4h     // v16 += |v0 - v2|, low half, widened to 32 bit
+    uabal2      v17.4s,  v0.8h,  v2.8h     // v17 += |v0 - v2|, high half
+    uabal       v20.4s,  v1.4h,  v3.4h     // v20 += |v1 - v3|, low half
+    uabal2      v21.4s,  v1.8h,  v3.8h     // v21 += |v1 - v3|, high half
+    uabal       v18.4s,  v4.4h,  v6.4h     // v18 += |v4 - v6|, low half
+    uabal2      v19.4s,  v4.8h,  v6.8h     // v19 += |v4 - v6|, high half
+    uabal       v22.4s,  v5.4h,  v7.4h     // v22 += |v5 - v7|, low half
+    uabal2      v23.4s,  v5.8h,  v7.8h     // v23 += |v5 - v7|, high half
+.endm
+
+.macro SAD_FUNC w, h, name                 // emits one exported SAD function per (w, h) for 16-bit samples; the sum is returned in w0
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w                           // rows 0-1: doubles x1/x3 and initialises the 32-bit accumulators
+
+.rept \h / 2 - 1                           // remaining rows, two per expansion
+    SAD_\w
+.endr
+.if \w > 8                                 // 16-wide only: fold v20-v23 into v16/v18
+    add         v20.4s,  v20.4s,  v21.4s
+    add         v16.4s,  v16.4s,  v20.4s
+    add         v22.4s,  v22.4s,  v23.4s
+    add         v18.4s,  v18.4s,  v22.4s
+.endif
+.if \w > 4                                 // 8- and 16-wide: fold v17/v19 into v16/v18
+    add         v16.4s,  v16.4s,  v17.4s
+    add         v18.4s,  v18.4s,  v19.4s
+.endif
+    add         v16.4s,  v16.4s,  v18.4s   // v16 = four 32-bit partial sums
+    addv        s0,  v16.4s                // reduce at the lanes' real width; uaddlv on .8h was only right while every lane stayed below 65536
+    fmov        w0,  s0                    // move the scalar result to the integer return register
+    ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
 SAD_FUNC  4,  4
 SAD_FUNC  4,  8
 SAD_FUNC  4,  16
index 02c969c1f6191a3b240076a56ba11889baebd534..d1e51269b1e8e5541833547535509c64181a7555 100644 (file)
     ret x264_pixel_##name##_4x4_##suffix args;\
 
 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) /* int-returning prototypes taking two (pointer, intptr_t) pairs; pixel is declared elsewhere, presumably the bit-depth dependent sample type */
 
 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) /* _x3 prototypes: four pixel pointers, one intptr_t, one int pointer */\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) /* _x4 prototypes: five pixel pointers, one intptr_t, one int pointer */
 
 DECL_X1( sad, neon )
 DECL_X4( sad, neon )
index 113df307e68bf8db3a4a05833d5b8f6084bdbf31..6080bb5d0a49078d53f64b55b33a380b3d94fb84 100644 (file)
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
     }
 #endif // HAVE_MMX
+#if HAVE_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT8( sad, _neon );
+    }
+#endif // HAVE_AARCH64
+
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )