From: Janne Grunau
Date: Fri, 18 Jul 2014 13:49:10 +0000 (+0100)
Subject: aarch64: deblocking NEON asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1343db872b1d7d43dc7fb431a8207efb5ca31e2e;p=libx264

aarch64: deblocking NEON asm

The chroma and luma deblocking functions are based on libav's h264
aarch64 NEON deblocking filter, which I ported from the existing ARM
NEON asm, so there are no additional persons to ask for a relicense.
---

diff --git a/Makefile b/Makefile
index 397b54de..171b46d9 100644
--- a/Makefile
+++ b/Makefile
@@ -127,6 +127,7 @@ endif
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
 ASMSRC += common/aarch64/dct-a.S \
+          common/aarch64/deblock-a.S \
           common/aarch64/mc-a.S \
           common/aarch64/pixel-a.S \
           common/aarch64/predict-a.S \
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index bf3169fe..e689a42c 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -107,6 +107,11 @@ MACH .const_data
     sub     \sub, \a, \b
 .endm
 
+.macro unzip t1, t2, s1, s2
+    uzp1    \t1, \s1, \s2
+    uzp2    \t2, \s1, \s2
+.endm
+
 .macro transpose t1, t2, s1, s2
     trn1    \t1, \s1, \s2
     trn2    \t2, \s1, \s2
@@ -158,3 +163,59 @@ MACH .const_data
     trn1    \r3\().2D, \r9\().2D, \r7\().2D
     trn2    \r7\().2D, \r9\().2D, \r7\().2D
 .endm
+
+.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+    trn1    \t0\().16b, \r0\().16b, \r1\().16b
+    trn2    \t1\().16b, \r0\().16b, \r1\().16b
+    trn1    \r1\().16b, \r2\().16b, \r3\().16b
+    trn2    \r3\().16b, \r2\().16b, \r3\().16b
+    trn1    \r0\().16b, \r4\().16b, \r5\().16b
+    trn2    \r5\().16b, \r4\().16b, \r5\().16b
+    trn1    \r2\().16b, \r6\().16b, \r7\().16b
+    trn2    \r7\().16b, \r6\().16b, \r7\().16b
+
+    trn1    \r4\().8h, \r0\().8h, \r2\().8h
+    trn2    \r2\().8h, \r0\().8h, \r2\().8h
+    trn1    \r6\().8h, \r5\().8h, \r7\().8h
+    trn2    \r7\().8h, \r5\().8h, \r7\().8h
+    trn1    \r5\().8h, \t1\().8h, \r3\().8h
+    trn2    \t1\().8h, \t1\().8h, \r3\().8h
+    trn1    \r3\().8h, \t0\().8h, \r1\().8h
+    trn2    \t0\().8h, \t0\().8h, \r1\().8h
+
+    trn1    \r0\().4s, \r3\().4s, \r4\().4s
+    trn2    \r4\().4s, \r3\().4s, \r4\().4s
+
+    trn1    \r1\().4s, \r5\().4s, \r6\().4s
+    trn2    \r5\().4s, \r5\().4s, \r6\().4s
+
+    trn2    \r6\().4s, \t0\().4s, \r2\().4s
+    trn1    \r2\().4s, \t0\().4s, \r2\().4s
+
+    trn1    \r3\().4s, \t1\().4s, \r7\().4s
+    trn2    \r7\().4s, \t1\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
+    trn1    \t4\().16b, \r0\().16b, \r1\().16b
+    trn2    \t5\().16b, \r0\().16b, \r1\().16b
+    trn1    \t6\().16b, \r2\().16b, \r3\().16b
+    trn2    \t7\().16b, \r2\().16b, \r3\().16b
+
+    trn1    \r0\().8h, \t4\().8h, \t6\().8h
+    trn2    \r2\().8h, \t4\().8h, \t6\().8h
+    trn1    \r1\().8h, \t5\().8h, \t7\().8h
+    trn2    \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
+    trn1    \t4\().8b, \r0\().8b, \r1\().8b
+    trn2    \t5\().8b, \r0\().8b, \r1\().8b
+    trn1    \t6\().8b, \r2\().8b, \r3\().8b
+    trn2    \t7\().8b, \r2\().8b, \r3\().8b
+
+    trn1    \r0\().4h, \t4\().4h, \t6\().4h
+    trn2    \r2\().4h, \t4\().4h, \t6\().4h
+    trn1    \r1\().4h, \t5\().4h, \t7\().4h
+    trn2    \r3\().4h, \t5\().4h, \t7\().4h
+.endm
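
The unzip macro added above pairs uzp1/uzp2, which de-interleave two
source vectors into their even- and odd-indexed elements; the
deblock_strength code below uses it with .4s lanes to pull interleaved
4-byte groups of the ref/nnz arrays apart. A minimal scalar C model of
its effect on byte lanes -- illustrative only, not part of the patch,
and unzip_16b is a made-up name:

    #include <stdint.h>

    /* uzp1 gathers the even-indexed elements of the concatenation
     * {s1, s2}, uzp2 the odd-indexed ones; shown for 16x8-bit lanes. */
    static void unzip_16b( uint8_t t1[16], uint8_t t2[16],
                           const uint8_t s1[16], const uint8_t s2[16] )
    {
        for( int i = 0; i < 8; i++ )
        {
            t1[i]   = s1[2*i];      /* even elements of s1 ...        */
            t1[8+i] = s2[2*i];      /* ... then even elements of s2   */
            t2[i]   = s1[2*i+1];    /* odd elements of s1 ...         */
            t2[8+i] = s2[2*i+1];    /* ... then odd elements of s2    */
        }
    }
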
modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +.macro h264_loop_filter_start + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + mov v24.s[0], w6 + and w6, w6, w6, lsl #16 + b.eq 1f + ands w6, w6, w6, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_luma + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v24.4s, v24.4h + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + sli v24.8h, v24.8h, #8 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + sli v24.4s, v24.4s, #16 + cmhi v21.16b, v22.16b, v21.16b // < alpha + dup v22.16b, w3 // beta + cmlt v23.16b, v24.16b, #0 + cmhi v28.16b, v22.16b, v28.16b // < beta + cmhi v30.16b, v22.16b, v30.16b // < beta + bic v21.16b, v21.16b, v23.16b + uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0) + and v21.16b, v21.16b, v28.16b + uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v22.16b, v17.16b // < beta + and v21.16b, v21.16b, v30.16b + cmhi v19.16b, v22.16b, v19.16b // < beta + and v17.16b, v17.16b, v21.16b + and v19.16b, v19.16b, v21.16b + and v24.16b, v24.16b, v21.16b + urhadd v28.16b, v16.16b, v0.16b + sub v21.16b, v24.16b, v17.16b + uqadd v23.16b, v18.16b, v24.16b + uhadd v20.16b, v20.16b, v28.16b + sub v21.16b, v21.16b, v19.16b + uhadd v28.16b, v4.16b, v28.16b + umin v23.16b, v23.16b, v20.16b + uqsub v22.16b, v18.16b, v24.16b + uqadd v4.16b, v2.16b, v24.16b + umax v23.16b, v23.16b, v22.16b + uqsub v22.16b, v2.16b, v24.16b + umin v28.16b, v4.16b, v28.16b + uxtl v4.8h, v0.8b + umax v28.16b, v28.16b, v22.16b + uxtl2 v20.8h, v0.16b + usubw v4.8h, v4.8h, v16.8b + usubw2 v20.8h, v20.8h, v16.16b + shl v4.8h, v4.8h, #2 + shl v20.8h, v20.8h, #2 + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v20.8h, v20.8h, v18.16b + usubw v4.8h, v4.8h, v2.8b + usubw2 v20.8h, v20.8h, v2.16b + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v20.8h, #3 + bsl v17.16b, v23.16b, v18.16b + bsl v19.16b, v28.16b, v2.16b + neg v23.16b, v21.16b + uxtl v28.8h, v16.8b + smin v4.16b, v4.16b, v21.16b + uxtl2 v21.8h, v16.16b + smax v4.16b, v4.16b, v23.16b + uxtl v22.8h, v0.8b + uxtl2 v24.8h, v0.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v21.8h, v21.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v24.8h, v24.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun2 v16.16b, v21.8h + sqxtun v0.8b, v22.8h + sqxtun2 v0.16b, v24.8h +.endm + +function x264_deblock_v_luma_neon, export=1 + h264_loop_filter_start + + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ld1 {v20.16b}, [x0], x1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + + h264_loop_filter_luma + + sub x0, x0, x1, lsl #1 + st1 {v17.16b}, [x0], x1 + st1 {v16.16b}, [x0], 
+
+.macro h264_loop_filter_luma
+    dup     v22.16b, w2                     // alpha
+    uxtl    v24.8h,  v24.8b
+    uabd    v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
+    uxtl    v24.4s,  v24.4h
+    uabd    v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
+    sli     v24.8h,  v24.8h,  #8
+    uabd    v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
+    sli     v24.4s,  v24.4s,  #16
+    cmhi    v21.16b, v22.16b, v21.16b       // < alpha
+    dup     v22.16b, w3                     // beta
+    cmlt    v23.16b, v24.16b, #0
+    cmhi    v28.16b, v22.16b, v28.16b       // < beta
+    cmhi    v30.16b, v22.16b, v30.16b       // < beta
+    bic     v21.16b, v21.16b, v23.16b
+    uabd    v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
+    and     v21.16b, v21.16b, v28.16b
+    uabd    v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
+    cmhi    v17.16b, v22.16b, v17.16b       // < beta
+    and     v21.16b, v21.16b, v30.16b
+    cmhi    v19.16b, v22.16b, v19.16b       // < beta
+    and     v17.16b, v17.16b, v21.16b
+    and     v19.16b, v19.16b, v21.16b
+    and     v24.16b, v24.16b, v21.16b
+    urhadd  v28.16b, v16.16b, v0.16b
+    sub     v21.16b, v24.16b, v17.16b
+    uqadd   v23.16b, v18.16b, v24.16b
+    uhadd   v20.16b, v20.16b, v28.16b
+    sub     v21.16b, v21.16b, v19.16b
+    uhadd   v28.16b, v4.16b,  v28.16b
+    umin    v23.16b, v23.16b, v20.16b
+    uqsub   v22.16b, v18.16b, v24.16b
+    uqadd   v4.16b,  v2.16b,  v24.16b
+    umax    v23.16b, v23.16b, v22.16b
+    uqsub   v22.16b, v2.16b,  v24.16b
+    umin    v28.16b, v4.16b,  v28.16b
+    uxtl    v4.8h,   v0.8b
+    umax    v28.16b, v28.16b, v22.16b
+    uxtl2   v20.8h,  v0.16b
+    usubw   v4.8h,   v4.8h,   v16.8b
+    usubw2  v20.8h,  v20.8h,  v16.16b
+    shl     v4.8h,   v4.8h,   #2
+    shl     v20.8h,  v20.8h,  #2
+    uaddw   v4.8h,   v4.8h,   v18.8b
+    uaddw2  v20.8h,  v20.8h,  v18.16b
+    usubw   v4.8h,   v4.8h,   v2.8b
+    usubw2  v20.8h,  v20.8h,  v2.16b
+    rshrn   v4.8b,   v4.8h,   #3
+    rshrn2  v4.16b,  v20.8h,  #3
+    bsl     v17.16b, v23.16b, v18.16b
+    bsl     v19.16b, v28.16b, v2.16b
+    neg     v23.16b, v21.16b
+    uxtl    v28.8h,  v16.8b
+    smin    v4.16b,  v4.16b,  v21.16b
+    uxtl2   v21.8h,  v16.16b
+    smax    v4.16b,  v4.16b,  v23.16b
+    uxtl    v22.8h,  v0.8b
+    uxtl2   v24.8h,  v0.16b
+    saddw   v28.8h,  v28.8h,  v4.8b
+    saddw2  v21.8h,  v21.8h,  v4.16b
+    ssubw   v22.8h,  v22.8h,  v4.8b
+    ssubw2  v24.8h,  v24.8h,  v4.16b
+    sqxtun  v16.8b,  v28.8h
+    sqxtun2 v16.16b, v21.8h
+    sqxtun  v0.8b,   v22.8h
+    sqxtun2 v0.16b,  v24.8h
+.endm
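
Per pixel across the edge, the macro above computes the standard H.264
bS < 4 luma filter. A scalar sketch of the same arithmetic, mirroring
the C reference in common/deblock.c (luma_filter_px and the clip
helpers are hypothetical names; in the asm the tc0 >= 0 and threshold
tests become the v21/v17/v19 masks rather than branches):

    #include <stdint.h>
    #include <stdlib.h>

    static inline int clip3( int x, int lo, int hi )
    {
        return x < lo ? lo : x > hi ? hi : x;
    }
    static inline uint8_t clip_u8( int x ) { return clip3( x, 0, 255 ); }

    /* pix points at q0; xstride crosses the edge. */
    static void luma_filter_px( uint8_t *pix, intptr_t xstride,
                                int alpha, int beta, int tc0 )
    {
        int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride], q2 = pix[ 2*xstride];
        int tc = tc0, delta;

        if( tc0 < 0 || abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
            return;
        if( abs(p2-p0) < beta )     /* also filter p1 */
        {
            pix[-2*xstride] = p1 + clip3( ((p2 + ((p0+q0+1)>>1)) >> 1) - p1, -tc0, tc0 );
            tc++;
        }
        if( abs(q2-q0) < beta )     /* also filter q1 */
        {
            pix[ 1*xstride] = q1 + clip3( ((q2 + ((p0+q0+1)>>1)) >> 1) - q1, -tc0, tc0 );
            tc++;
        }
        delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = clip_u8( p0 + delta );    /* p0' */
        pix[ 0*xstride] = clip_u8( q0 - delta );    /* q0' */
    }

In the vector code, urhadd/uhadd form the (pX + ((p0+q0+1)>>1)) >> 1
terms, the uqadd/uqsub plus umin/umax sequences implement the +-tc0
clamps on p1'/q1', and rshrn #3 is the rounding shift of the delta.
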
+
+function x264_deblock_v_luma_neon, export=1
+    h264_loop_filter_start
+
+    ld1     {v0.16b},  [x0], x1
+    ld1     {v2.16b},  [x0], x1
+    ld1     {v4.16b},  [x0], x1
+    sub     x0, x0, x1, lsl #2
+    sub     x0, x0, x1, lsl #1
+    ld1     {v20.16b}, [x0], x1
+    ld1     {v18.16b}, [x0], x1
+    ld1     {v16.16b}, [x0], x1
+
+    h264_loop_filter_luma
+
+    sub     x0, x0, x1, lsl #1
+    st1     {v17.16b}, [x0], x1
+    st1     {v16.16b}, [x0], x1
+    st1     {v0.16b},  [x0], x1
+    st1     {v19.16b}, [x0]
+
+    ret
+endfunc
+
+function x264_deblock_h_luma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, #4
+    ld1     {v6.8b},  [x0], x1
+    ld1     {v20.8b}, [x0], x1
+    ld1     {v18.8b}, [x0], x1
+    ld1     {v16.8b}, [x0], x1
+    ld1     {v0.8b},  [x0], x1
+    ld1     {v2.8b},  [x0], x1
+    ld1     {v4.8b},  [x0], x1
+    ld1     {v26.8b}, [x0], x1
+    ld1     {v6.d}[1],  [x0], x1
+    ld1     {v20.d}[1], [x0], x1
+    ld1     {v18.d}[1], [x0], x1
+    ld1     {v16.d}[1], [x0], x1
+    ld1     {v0.d}[1],  [x0], x1
+    ld1     {v2.d}[1],  [x0], x1
+    ld1     {v4.d}[1],  [x0], x1
+    ld1     {v26.d}[1], [x0], x1
+
+    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+    h264_loop_filter_luma
+
+    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
+
+    sub     x0, x0, x1, lsl #4
+    add     x0, x0, #2
+    st1     {v17.s}[0], [x0], x1
+    st1     {v16.s}[0], [x0], x1
+    st1     {v0.s}[0],  [x0], x1
+    st1     {v19.s}[0], [x0], x1
+    st1     {v17.s}[1], [x0], x1
+    st1     {v16.s}[1], [x0], x1
+    st1     {v0.s}[1],  [x0], x1
+    st1     {v19.s}[1], [x0], x1
+    st1     {v17.s}[2], [x0], x1
+    st1     {v16.s}[2], [x0], x1
+    st1     {v0.s}[2],  [x0], x1
+    st1     {v19.s}[2], [x0], x1
+    st1     {v17.s}[3], [x0], x1
+    st1     {v16.s}[3], [x0], x1
+    st1     {v0.s}[3],  [x0], x1
+    st1     {v19.s}[3], [x0], x1
+
+    ret
+endfunc
+
+.macro h264_loop_filter_chroma
+    dup     v22.16b, w2                     // alpha
+    uxtl    v24.8h,  v24.8b
+    uabd    v26.16b, v16.16b, v0.16b        // abs(p0 - q0)
+    uxtl    v4.8h,   v0.8b
+    uxtl2   v5.8h,   v0.16b
+    uabd    v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
+    usubw   v4.8h,   v4.8h,   v16.8b
+    usubw2  v5.8h,   v5.8h,   v16.16b
+    sli     v24.8h,  v24.8h,  #8
+    shl     v4.8h,   v4.8h,   #2
+    shl     v5.8h,   v5.8h,   #2
+    uabd    v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
+    uxtl    v24.4s,  v24.4h
+    uaddw   v4.8h,   v4.8h,   v18.8b
+    uaddw2  v5.8h,   v5.8h,   v18.16b
+    cmhi    v26.16b, v22.16b, v26.16b       // < alpha
+    usubw   v4.8h,   v4.8h,   v2.8b
+    usubw2  v5.8h,   v5.8h,   v2.16b
+    sli     v24.4s,  v24.4s,  #16
+    dup     v22.16b, w3                     // beta
+    rshrn   v4.8b,   v4.8h,   #3
+    rshrn2  v4.16b,  v5.8h,   #3
+    cmhi    v28.16b, v22.16b, v28.16b       // < beta
+    cmhi    v30.16b, v22.16b, v30.16b       // < beta
+    smin    v4.16b,  v4.16b,  v24.16b
+    neg     v25.16b, v24.16b
+    and     v26.16b, v26.16b, v28.16b
+    smax    v4.16b,  v4.16b,  v25.16b
+    and     v26.16b, v26.16b, v30.16b
+    uxtl    v22.8h,  v0.8b
+    uxtl2   v23.8h,  v0.16b
+    and     v4.16b,  v4.16b,  v26.16b
+    uxtl    v28.8h,  v16.8b
+    uxtl2   v29.8h,  v16.16b
+    saddw   v28.8h,  v28.8h,  v4.8b
+    saddw2  v29.8h,  v29.8h,  v4.16b
+    ssubw   v22.8h,  v22.8h,  v4.8b
+    ssubw2  v23.8h,  v23.8h,  v4.16b
+    sqxtun  v16.8b,  v28.8h
+    sqxtun  v0.8b,   v22.8h
+    sqxtun2 v16.16b, v29.8h
+    sqxtun2 v0.16b,  v23.8h
+.endm
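
The chroma filter is the same delta computation with p1/q1 never
modified, hence no p2/q2 loads and no tc adjustment in the macro. The
spec's chroma rule tC = tC0 + 1 is, as far as I can tell, already
folded into the tc0 values by the caller in common/deblock.c, which is
also why negative (skip) tc0 entries need no special handling here: a
skipped group arrives as tc = 0 and the clamp zeroes its delta. A
scalar sketch along the same lines as the luma one (hypothetical names;
clip3/clip_u8 as defined in the luma sketch above):

    /* pix points at q0; xstride crosses the edge; tc already
     * includes the chroma +1. */
    static void chroma_filter_px( uint8_t *pix, intptr_t xstride,
                                  int alpha, int beta, int tc )
    {
        int p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride];

        if( tc <= 0 || abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
            return;
        int delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = clip_u8( p0 + delta );    /* p0' */
        pix[ 0*xstride] = clip_u8( q0 - delta );    /* q0' */
    }
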
+
+function x264_deblock_v_chroma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, x1, lsl #1
+    ld1     {v18.16b}, [x0], x1
+    ld1     {v16.16b}, [x0], x1
+    ld1     {v0.16b},  [x0], x1
+    ld1     {v2.16b},  [x0]
+
+    h264_loop_filter_chroma
+
+    sub     x0, x0, x1, lsl #1
+    st1     {v16.16b}, [x0], x1
+    st1     {v0.16b},  [x0], x1
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, #4
+    ld1     {v18.d}[0], [x0], x1
+    ld1     {v16.d}[0], [x0], x1
+    ld1     {v0.d}[0],  [x0], x1
+    ld1     {v2.d}[0],  [x0], x1
+    ld1     {v18.d}[1], [x0], x1
+    ld1     {v16.d}[1], [x0], x1
+    ld1     {v0.d}[1],  [x0], x1
+    ld1     {v2.d}[1],  [x0], x1
+
+    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
+
+    h264_loop_filter_chroma
+
+    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
+
+    sub     x0, x0, x1, lsl #3
+    st1     {v18.d}[0], [x0], x1
+    st1     {v16.d}[0], [x0], x1
+    st1     {v0.d}[0],  [x0], x1
+    st1     {v2.d}[0],  [x0], x1
+    st1     {v18.d}[1], [x0], x1
+    st1     {v16.d}[1], [x0], x1
+    st1     {v0.d}[1],  [x0], x1
+    st1     {v2.d}[1],  [x0], x1
+
+    ret
+endfunc
+
+
+//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
+//                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+//                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
+//                                uint8_t bs[2][8][4], int mvy_limit,
+//                                int bframe )
+function x264_deblock_strength_neon, export=1
+    movi    v4.16b, #0
+    lsl     w4, w4, #8
+    add     x3, x3, #32
+    sub     w4, w4, #(1<<8)-3
+    movi    v5.16b, #0
+    dup     v6.8h, w4
+    mov     x6, #-32
+
+bframe:
+    // load bytes ref
+    add     x2, x2, #16
+    ld1     {v31.d}[1], [x1], #8
+    ld1     {v1.16b},   [x1], #16
+    movi    v0.16b, #0
+    ld1     {v2.16b},   [x1], #16
+    ext     v3.16b, v0.16b, v1.16b, #15
+    ext     v0.16b, v0.16b, v2.16b, #15
+    unzip   v21.4s, v22.4s, v1.4s, v2.4s
+    unzip   v23.4s, v20.4s, v3.4s, v0.4s
+    ext     v21.16b, v31.16b, v22.16b, #12
+
+    eor     v0.16b, v20.16b, v22.16b
+    eor     v1.16b, v21.16b, v22.16b
+    orr     v4.16b, v4.16b, v0.16b
+    orr     v5.16b, v5.16b, v1.16b
+
+    ld1     {v21.8h}, [x2], #16             // mv + 0x10
+    ld1     {v19.8h}, [x2], #16             // mv + 0x20
+    ld1     {v22.8h}, [x2], #16             // mv + 0x30
+    ld1     {v18.8h}, [x2], #16             // mv + 0x40
+    ld1     {v23.8h}, [x2], #16             // mv + 0x50
+    ext     v19.16b, v19.16b, v22.16b, #12
+    ext     v18.16b, v18.16b, v23.16b, #12
+    sabd    v0.8h, v22.8h, v19.8h
+    ld1     {v19.8h}, [x2], #16             // mv + 0x60
+    sabd    v1.8h, v23.8h, v18.8h
+    ld1     {v24.8h}, [x2], #16             // mv + 0x70
+    uqxtn   v0.8b, v0.8h
+    ld1     {v18.8h}, [x2], #16             // mv + 0x80
+    ld1     {v25.8h}, [x2], #16             // mv + 0x90
+    uqxtn2  v0.16b, v1.8h
+    ext     v19.16b, v19.16b, v24.16b, #12
+    ext     v18.16b, v18.16b, v25.16b, #12
+    sabd    v1.8h, v24.8h, v19.8h
+    sabd    v2.8h, v25.8h, v18.8h
+    uqxtn   v1.8b, v1.8h
+    uqxtn2  v1.16b, v2.8h
+
+    uqsub   v0.16b, v0.16b, v6.16b
+    uqsub   v1.16b, v1.16b, v6.16b
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+
+    sabd    v1.8h, v22.8h, v23.8h
+    orr     v4.16b, v4.16b, v0.16b
+
+    sabd    v0.8h, v21.8h, v22.8h
+    sabd    v2.8h, v23.8h, v24.8h
+    sabd    v3.8h, v24.8h, v25.8h
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+    uqxtn   v1.8b, v2.8h
+    uqxtn2  v1.16b, v3.8h
+
+    uqsub   v0.16b, v0.16b, v6.16b
+    uqsub   v1.16b, v1.16b, v6.16b
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+    subs    w5, w5, #1
+    orr     v5.16b, v5.16b, v0.16b
+    b.eq    bframe
+
+    movi    v6.16b, #1
+    // load bytes nnz
+    ld1     {v31.d}[1], [x0], #8
+    ld1     {v1.16b},   [x0], #16
+    movi    v0.16b, #0
+    ld1     {v2.16b},   [x0], #16
+    ext     v3.16b, v0.16b, v1.16b, #15
+    ext     v0.16b, v0.16b, v2.16b, #15
+    unzip   v21.4s, v22.4s, v1.4s, v2.4s
+    unzip   v23.4s, v20.4s, v3.4s, v0.4s
+    ext     v21.16b, v31.16b, v22.16b, #12
+
+    movrel  x7, transpose_table
+    ld1     {v7.16b}, [x7]
+    orr     v0.16b, v20.16b, v22.16b
+    orr     v1.16b, v21.16b, v22.16b
+    umin    v0.16b, v0.16b, v6.16b
+    umin    v1.16b, v1.16b, v6.16b
+    umin    v4.16b, v4.16b, v6.16b          // mv ? 1 : 0
+    umin    v5.16b, v5.16b, v6.16b
+    add     v0.16b, v0.16b, v0.16b          // nnz ? 2 : 0
+    add     v1.16b, v1.16b, v1.16b
+    umax    v4.16b, v4.16b, v0.16b
+    umax    v5.16b, v5.16b, v1.16b
+    tbl     v6.16b, {v4.16b}, v7.16b
+    st1     {v5.16b}, [x3], x6              // bs[1]
+    st1     {v6.16b}, [x3]                  // bs[0]
+    ret
+endfunc
+
+const transpose_table
+    .byte 0, 4, 8, 12
+    .byte 1, 5, 9, 13
+    .byte 2, 6, 10, 14
+    .byte 3, 7, 11, 15
+endconst
diff --git a/common/deblock.c b/common/deblock.c
index 6b369f2a..382eb721 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -729,7 +729,7 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int
 void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #endif // ARCH_PPC
 
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
 void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
@@ -838,7 +838,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
 }
 #endif // HAVE_ALTIVEC
 
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
         pf->deblock_luma[1] = x264_deblock_v_luma_neon;
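
For reference, the boundary-strength rule that x264_deblock_strength_neon
vectorizes (see the deblock_strength_c prototype quoted above) reduces,
per 4x4 block pair across an edge, to roughly the following scalar
model; x264's scan8 indexing is elided and bs_for_pair is a made-up
name:

    #include <stdint.h>
    #include <stdlib.h>

    /* One 4x4 block pair across an edge: index 0/1 = the two blocks. */
    static uint8_t bs_for_pair( const uint8_t nnz[2], const int8_t ref[2][2],
                                const int16_t mv[2][2][2],
                                int mvy_limit, int bframe )
    {
        if( nnz[0] || nnz[1] )                  /* residual coded -> bs = 2 */
            return 2;
        for( int l = 0; l <= bframe; l++ )      /* list 0, plus list 1 for B-frames */
            if( ref[l][0] != ref[l][1] ||
                abs( mv[l][0][0] - mv[l][1][0] ) >= 4 ||    /* >= 1 luma sample */
                abs( mv[l][0][1] - mv[l][1][1] ) >= mvy_limit )
                return 1;                       /* motion discontinuity -> bs = 1 */
        return 0;
    }

The asm does this branchlessly: each 16-bit lane of v6 packs the two
limits as 3 | (mvy_limit - 1) << 8, so after sabd the saturating
uqsub/uqxtn chain leaves a nonzero byte exactly where an mv component
exceeded its limit; nnz hits are doubled to 2 and merged with umax, and
the final tbl against transpose_table rearranges bs[0] into the layout
the caller expects.
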