From 50d7fb80d8cb773cd6d495e083867c3685726352 Mon Sep 17 00:00:00 2001
From: David Conrad <lessen42@gmail.com>
Date: Mon, 24 Aug 2009 01:38:42 -0700
Subject: [PATCH] GSOC merge part 8: ARM NEON intra prediction assembly
 functions (partial) 4x4 dc/h/ddr/ddl, 8x8 dc/h, 8x8c h/v, 16x16 dc/h/v

---
 Makefile               |   5 +-
 common/arm/predict-a.S | 272 +++++++++++++++++++++++++++++++++++++++++
 common/arm/predict-c.c |  83 +++++++++++++
 common/arm/predict.h   |  31 +++++
 common/predict.c       |  19 +++
 5 files changed, 408 insertions(+), 2 deletions(-)
 create mode 100644 common/arm/predict-a.S
 create mode 100644 common/arm/predict-c.c
 create mode 100644 common/arm/predict.h

diff --git a/Makefile b/Makefile
index 5a0938d2..2e5e2bd6 100644
--- a/Makefile
+++ b/Makefile
@@ -59,8 +59,9 @@ endif
 ifeq ($(ARCH),ARM)
 ifneq ($(AS),)
 ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
-          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S
-SRCS   += common/arm/mc-c.c
+          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
+          common/arm/predict-a.S
+SRCS   += common/arm/mc-c.c common/arm/predict-c.c
 OBJASM  = $(ASMSRC:%.S=%.o)
 endif
 endif
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
new file mode 100644
index 00000000..46e687b3
--- /dev/null
+++ b/common/arm/predict-a.S
@@ -0,0 +1,272 @@
+/*****************************************************************************
+ * predict-a.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4  // NOTE(review): on ARM gas this is the power-of-two form, i.e. 16-byte alignment -- confirm
+// eight 16-bit values counting down from 7 to 0; no function added by this patch reads it
+pw_76543210: .short 7,6,5,4,3,2,1,0
+
+.text
+
+// because gcc doesn't believe in using the free shift in add
+function x264_predict_4x4_h_armv6, export=1  // 4x4 horizontal prediction; r0 = block pointer (uint8_t *src)
+    ldrb    r1, [r0, #0*FDEC_STRIDE-1]  // r1, r2, r3, ip = the byte just left of rows 0..3
+    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
+    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
+    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
+    add     r1, r1, r1, lsl #8  // x * 0x0101: copy the byte into bits 8-15
+    add     r2, r2, r2, lsl #8
+    add     r3, r3, r3, lsl #8
+    add     ip, ip, ip, lsl #8
+    add     r1, r1, r1, lsl #16  // x * 0x00010001: all four bytes of the word now equal
+    str     r1, [r0, #0*FDEC_STRIDE]  // write row 0 as a single word
+    add     r2, r2, r2, lsl #16
+    str     r2, [r0, #1*FDEC_STRIDE]  // row 1
+    add     r3, r3, r3, lsl #16
+    str     r3, [r0, #2*FDEC_STRIDE]  // row 2
+    add     ip, ip, ip, lsl #16
+    str     ip, [r0, #3*FDEC_STRIDE]  // row 3
+    bx      lr
+.endfunc
+
+function x264_predict_4x4_dc_armv6, export=1  // 4x4 DC prediction; r0 = block pointer (uint8_t *src)
+    mov     ip, #0  // zero operand for the usad8 below
+    ldr     r1, [r0, #-FDEC_STRIDE]  // r1 = the four bytes of the row above the block
+    ldrb    r2, [r0, #0*FDEC_STRIDE-1]  // byte left of row 0
+    ldrb    r3, [r0, #1*FDEC_STRIDE-1]  // byte left of row 1
+    usad8   r1, r1, ip  // sum of |byte - 0| = sum of the four top bytes
+    add     r2, r2, #4  // rounding term for the >>3 below
+    ldrb    ip, [r0, #2*FDEC_STRIDE-1]  // byte left of row 2 (ip is free again)
+    add     r2, r2, r3
+    ldrb    r3, [r0, #3*FDEC_STRIDE-1]  // byte left of row 3
+    add     r2, r2, ip
+    add     r2, r2, r3  // r2 = sum of the four left bytes + 4
+    add     r1, r1, r2  // r1 = top sum + left sum + 4
+    lsr     r1, r1, #3  // dc = (sum of 8 neighbours + 4) >> 3
+    add     r1, r1, r1, lsl #8  // replicate dc into all four bytes of the word
+    add     r1, r1, r1, lsl #16
+    str     r1, [r0, #0*FDEC_STRIDE]  // fill rows 0..3 with dc
+    str     r1, [r0, #1*FDEC_STRIDE]
+    str     r1, [r0, #2*FDEC_STRIDE]
+    str     r1, [r0, #3*FDEC_STRIDE]
+    bx      lr
+.endfunc
+
+// per byte lane: a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2; c1 and c2 are overwritten; pb_1 = register holding 0x01010101
+.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
+    uhadd8  \a1, \a1, \c1  // a = (a + c) >> 1 per byte, truncating
+    uhadd8  \a2, \a2, \c2
+    uhadd8  \c1, \a1, \b1  // c = (a + b) >> 1 per byte, truncating
+    uhadd8  \c2, \a2, \b2
+    eor     \a1, \a1, \b1  // bit 0 of (a ^ b) is the bit the halving add above dropped
+    eor     \a2, \a2, \b2
+    and     \a1, \a1, \pb_1  // keep only that rounding bit in each byte
+    and     \a2, \a2, \pb_1
+    uadd8   \a1, \a1, \c1  // add it back: rounded average of a and b
+    uadd8   \a2, \a2, \c2
+.endm
+
+function x264_predict_4x4_ddr_armv6, export=1  // 4x4 diagonal down-right prediction; r0 = block pointer; byte lists below run from least significant byte up
+    ldr     r1, [r0, # -FDEC_STRIDE]  // r1 = t0,t1,t2,t3: row above the block (assumes little-endian word loads)
+    ldrb    r2, [r0, # -FDEC_STRIDE-1]  // lt: byte above-left of the block
+    ldrb    r3, [r0, #0*FDEC_STRIDE-1]  // l0: byte left of row 0
+    push    {r4-r6,lr}
+    add     r2, r2, r1, lsl #8  // r2 = lt,t0,t1,t2
+    ldrb    r4, [r0, #1*FDEC_STRIDE-1]  // l1
+    add     r3, r3, r2, lsl #8  // r3 = l0,lt,t0,t1
+    ldrb    r5, [r0, #2*FDEC_STRIDE-1]  // l2
+    ldrb    r6, [r0, #3*FDEC_STRIDE-1]  // l3
+    add     r4, r4, r3, lsl #8  // r4 = l1,l0,lt,t0
+    add     r5, r5, r4, lsl #8  // r5 = l2,l1,l0,lt
+    add     r6, r6, r5, lsl #8  // r6 = l3,l2,l1,l0
+    ldr     ip, pb_1  // pc-relative load of the 0x01010101 mask the macro needs
+    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip  // r1 = filtered row 0; r4 = filtered pixels that enter rows 1..3 from the left
+    str     r1, [r0, #0*FDEC_STRIDE]  // row 0
+    lsl     r2, r1, #8  // row 0 moved one pixel to the right
+    lsl     r3, r1, #16  // row 0 moved two pixels to the right
+    lsl     r4, r4, #8  // discard the top byte of the second filter result
+    lsl     r5, r1, #24  // row 0 moved three pixels to the right
+    add     r2, r2, r4, lsr #24  // one new pixel enters on the left
+    str     r2, [r0, #1*FDEC_STRIDE]  // row 1
+    add     r3, r3, r4, lsr #16  // two new pixels
+    str     r3, [r0, #2*FDEC_STRIDE]  // row 2
+    add     r5, r5, r4, lsr #8  // three new pixels
+    str     r5, [r0, #3*FDEC_STRIDE]  // row 3
+    pop     {r4-r6,pc}  // restore r4-r6 and return
+.endfunc
+
+pb_1: .word 0x01010101  // 0x01 in every byte; read by the ldr in x264_predict_4x4_ddr_armv6
+
+function x264_predict_4x4_ddl_neon, export=1  // 4x4 diagonal down-left prediction; r0 = block pointer (uint8_t *src)
+    sub         r0, #FDEC_STRIDE  // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0], ip  // d0 = t0..t7 from the row above; r0 advances to row 0
+    vdup.8      d3, d0[7]  // t7 in every lane, used as padding past the last byte
+    vext.8      d1, d0, d0, #1  // d1 = t1..t7,t0
+    vext.8      d2, d0, d3, #2  // d2 = t2..t7,t7,t7
+    vhadd.u8    d0, d0, d2  // (t[i] + t[i+2]) >> 1
+    vrhadd.u8   d0, d0, d1  // rounding average with t[i+1]; lane 7 mixes in the wrapped t0 but is never stored
+    vst1.32     {d0[0]}, [r0,:32], ip  // row 0 = filtered lanes 0..3
+    vext.8      d1, d0, d0, #1  // filtered lanes starting at 1
+    vext.8      d2, d0, d0, #2  // filtered lanes starting at 2
+    vst1.32     {d1[0]}, [r0,:32], ip  // row 1 = lanes 1..4
+    vext.8      d3, d0, d0, #3  // filtered lanes starting at 3
+    vst1.32     {d2[0]}, [r0,:32], ip  // row 2 = lanes 2..5
+    vst1.32     {d3[0]}, [r0,:32], ip  // row 3 = lanes 3..6
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_dc_neon, export=1  // 8x8 DC prediction; r0 = block pointer, r1 = edge array
+    mov     ip, #0  // zero operand for the usad8/usada8 byte sums
+    ldrd    r2, [r1, #8]  // r2 = edge[8..11], r3 = edge[12..15]
+    push    {r4-r5,lr}
+    ldrd    r4, [r1, #16]  // r4 = edge[16..19], r5 = edge[20..23]
+    lsl     r3, r3, #8  // shift out the top byte (edge[15] with little-endian loads) so it is not summed
+    ldrb    lr, [r1, #7]  // lr = edge[7]
+    usad8   r2, r2, ip  // r2 = edge[8] + ... + edge[11]
+    usad8   r3, r3, ip  // r3 = edge[12] + edge[13] + edge[14]
+    usada8  r2, r4, ip, r2  // r2 += edge[16] + ... + edge[19]
+    add     lr, lr, #8  // rounding term for the >>4 below
+    usada8  r3, r5, ip, r3  // r3 += edge[20] + ... + edge[23]
+    add     r2, r2, lr
+    mov     ip, #FDEC_STRIDE  // ip is now the row stride for the stores
+    add     r2, r2, r3  // r2 = sum of edge[7..14] and edge[16..23], plus 8
+    lsr     r2, r2, #4  // dc = (16-byte sum + 8) >> 4
+
+    vdup.8   d0, r2  // dc in all eight lanes
+.rept 8
+    vst1.64 {d0}, [r0,:64], ip  // one 8-byte row per store
+.endr
+    pop    {r4-r5,pc}  // restore r4-r5 and return
+.endfunc
+
+
+function x264_predict_8x8_h_neon, export=1  // 8x8 horizontal prediction; r0 = block pointer, r1 = edge array
+    add         r1, r1, #7  // r1 -> edge[7]
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d16}, [r1]  // d16 = edge[7..14]
+    vdup.8      d0, d16[7]  // edge[14] fills row 0
+    vdup.8      d1, d16[6]  // edge[13] fills row 1
+    vst1.64     {d0}, [r0,:64], ip  // stores are interleaved with the remaining broadcasts
+    vdup.8      d2, d16[5]  // edge[12] fills row 2
+    vst1.64     {d1}, [r0,:64], ip
+    vdup.8      d3, d16[4]  // edge[11] fills row 3
+    vst1.64     {d2}, [r0,:64], ip
+    vdup.8      d4, d16[3]  // edge[10] fills row 4
+    vst1.64     {d3}, [r0,:64], ip
+    vdup.8      d5, d16[2]  // edge[9] fills row 5
+    vst1.64     {d4}, [r0,:64], ip
+    vdup.8      d6, d16[1]  // edge[8] fills row 6
+    vst1.64     {d5}, [r0,:64], ip
+    vdup.8      d7, d16[0]  // edge[7] fills row 7
+    vst1.64     {d6}, [r0,:64], ip
+    vst1.64     {d7}, [r0,:64], ip
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_h_neon, export=1  // 8x8 chroma horizontal prediction; r0 = block pointer (uint8_t *src)
+    sub         r1, r0, #1  // r1 -> byte left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 4  // two rows per repetition, eight rows in total
+    vld1.8      {d0[]}, [r1], ip  // load one left byte into every lane of d0, then step r1 down a row
+    vld1.8      {d2[]}, [r1], ip  // same for the next row
+    vst1.64     {d0}, [r0,:64], ip  // write the 8-byte row
+    vst1.64     {d2}, [r0,:64], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_v_neon, export=1  // 8x8 chroma vertical prediction; r0 = block pointer (uint8_t *src)
+    sub         r0, r0, #FDEC_STRIDE  // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0,:64], ip  // d0 = the 8 bytes above; r0 advances to row 0
+.rept 8
+    vst1.64     {d0}, [r0,:64], ip  // copy them into each of the 8 rows
+.endr
+    bx          lr
+.endfunc
+
+
+function x264_predict_16x16_dc_neon, export=1  // 16x16 DC prediction; r0 = block pointer (uint8_t *src)
+    sub         r3, r0, #FDEC_STRIDE  // r3 -> row above the block
+    sub         r0, r0, #1  // r0 -> byte left of row 0
+    vld1.64     {d0-d1}, [r3,:128]  // q0 = the 16 bytes above the block
+    ldrb        ip, [r0], #FDEC_STRIDE  // ip = left byte of row 0; every ldrb steps r0 down one row
+    vaddl.u8    q0, d0, d1  // widen to eight 16-bit partial sums
+    ldrb        r1, [r0], #FDEC_STRIDE  // left byte of row 1
+    vadd.u16    d0, d0, d1  // four partial sums
+    vpadd.u16   d0, d0, d0  // two partial sums, duplicated
+    vpadd.u16   d0, d0, d0  // every lane of d0 = sum of the 16 top bytes
+.rept 4  // left bytes of rows 2..13, three per repetition; each add consumes the previous load
+    ldrb        r2, [r0], #FDEC_STRIDE
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE
+    add         ip, ip, r2
+    ldrb        r1, [r0], #FDEC_STRIDE
+    add         ip, ip, r3
+.endr
+    ldrb        r2, [r0], #FDEC_STRIDE  // left byte of row 14
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE  // left byte of row 15
+    add         ip, ip, r2
+
+    sub         r0, r0, #FDEC_STRIDE*16  // rewind the 16 post-increments
+    add         ip, ip, r3  // ip = sum of the 16 left bytes
+    vdup.16     d1, ip
+    vadd.u16    d0, d0, d1  // top sum + left sum
+    mov         ip, #FDEC_STRIDE  // ip is now the row stride for the stores
+    add         r0, r0, #1  // r0 = block pointer again
+    vrshr.u16   d0, d0, #5  // dc = (sum of 32 neighbours + 16) >> 5
+    vdup.8      q0, d0[0]  // dc in all 16 byte lanes
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], ip  // 16-byte row store; :128 matches the hint the 16x16 h/v predictors use on the same pointer
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_h_neon, export=1  // 16x16 horizontal prediction; r0 = block pointer (uint8_t *src)
+    sub         r1, r0, #1  // r1 -> byte left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 8  // two rows per repetition, sixteen rows in total
+    vld1.8      {d0[]}, [r1], ip  // load one left byte into every lane of d0, then step r1 down a row
+    vmov        d1, d0  // duplicate so that q0 holds 16 copies
+    vld1.8      {d2[]}, [r1], ip  // same for the next row
+    vmov        d3, d2
+    vst1.64     {d0-d1}, [r0,:128], ip  // write the 16-byte row
+    vst1.64     {d2-d3}, [r0,:128], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_v_neon, export=1  // 16x16 vertical prediction; r0 = block pointer (uint8_t *src)
+    sub         r0, r0, #FDEC_STRIDE  // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0-d1}, [r0,:128], ip  // q0 = the 16 bytes above; r0 advances to row 0
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], ip  // copy them into each of the 16 rows
+.endr
+    bx          lr
+.endfunc
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
new file mode 100644
index 00000000..1f2cd52a
--- /dev/null
+++ b/common/arm/predict-c.c
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ * predict-c.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_armv6( uint8_t *src );   /* 4x4 predictors, implemented in predict-a.S */
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+/* 8x8 chroma predictors (predict-a.S) */
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+/* 8x8 luma predictors (predict-a.S); these read their neighbours from edge[] rather than from src */
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[33] );
+/* 16x16 predictors (predict-a.S) */
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+
+/* Hook up the ARM 4x4 intra predictors permitted by the flags in cpu. */
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
+{
+    if( cpu & X264_CPU_ARMV6 )
+    {
+        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
+        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
+
+        /* the NEON entry is only installed when ARMv6 is reported as well */
+        if( cpu & X264_CPU_NEON )
+            pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+    }
+}
+
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON chroma H/V predictors; without NEON pf is left untouched */
+        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
+        pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
+    }
+}
+
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON 8x8 DC/H predictors; predict_filter is accepted but not modified */
+        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
+    }
+}
+
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON 16x16 V/H/DC predictors; without NEON every entry keeps its value */
+        pf[I_PRED_16x16_V]  = x264_predict_16x16_v_neon;
+        pf[I_PRED_16x16_H]  = x264_predict_16x16_h_neon;
+        pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_neon;
+    }
+}
diff --git a/common/arm/predict.h b/common/arm/predict.h
new file mode 100644
index 00000000..fe5ccda8
--- /dev/null
+++ b/common/arm/predict.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PREDICT_H
+#define X264_ARM_PREDICT_H
+/* Each init function replaces entries of pf with ARM versions, depending on the flags in cpu. */
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
+/* This header includes nothing itself: the x264_predict*_t types must be declared before it is included. */
+#endif
diff --git a/common/predict.c b/common/predict.c
index fd07ae2e..385eb5ce 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -33,6 +33,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/predict.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -770,6 +773,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
         x264_predict_16x16_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_16x16_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -792,6 +799,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
         x264_predict_8x8c_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8c_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -813,6 +824,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
 #ifdef HAVE_MMX
     x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8_init_arm( cpu, pf, predict_filter );
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -833,5 +848,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
 #ifdef HAVE_MMX
     x264_predict_4x4_init_mmx( cpu, pf );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_4x4_init_arm( cpu, pf );
+#endif
 }
 
-- 
2.40.0