From: David Conrad
Date: Sat, 7 Nov 2009 17:25:18 +0000 (-0800)
Subject: Various ARM-related fixes
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=53a5772a35451c897366adda72d3a44c13103c38;p=libx264

Various ARM-related fixes

Fix comment for mc_copy_neon.
Fix memzero_aligned_neon prototype.
Update NEON (i)dct_dc prototypes.
Duplicate x86 behavior for global+hidden functions.
---

diff --git a/Makefile b/Makefile
index 3de0b616..8ba8d815 100644
--- a/Makefile
+++ b/Makefile
@@ -119,6 +119,7 @@ checkasm: tools/checkasm.o libx264.a
 
 %.o: %.S
 	$(AS) $(ASFLAGS) -o $@ $<
+	-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
 
 .depend: config.mak
 	rm -f .depend
diff --git a/common/arm/asm.S b/common/arm/asm.S
index f7b9f141..d1631656 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -20,19 +20,24 @@
 
 #include "config.h"
 
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+
 .macro require8, val=1
-    .eabi_attribute 24, \val
+ELF .eabi_attribute 24, \val
 .endm
 
 .macro preserve8, val=1
-    .eabi_attribute 25, \val
+ELF .eabi_attribute 25, \val
 .endm
 
-.macro function name, export=0
-.if \export
+.macro function name
     .global \name
-.endif
-    .type \name, %function
+ELF .hidden \name
+ELF .type \name, %function
     .func \name
 \name:
 .endm
diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
index ccde3bb4..40eff039 100644
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -27,7 +27,7 @@
 
 // done in gas because .fpu neon overrides the refusal to assemble
 // instructions the selected -march/-mcpu doesn't support
-function x264_cpu_neon_test, export=1
+function x264_cpu_neon_test
     vadd.i16 q0, q0, q0
     bx lr
 .endfunc
@@ -62,7 +62,7 @@ function x264_cpu_disable_armv7_counter
 
 // return: 0 if transfers neon -> arm transfers take more than 10 cycles
 //         nonzero otherwise
-function x264_cpu_fast_neon_mrc_test, export=1
+function x264_cpu_fast_neon_mrc_test
     // check for user access to performance counters
     mrc p15, 0, r0, c9, c14, 0
     cmp r0, #0
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index 7a7b0381..0ed72384 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -62,7 +62,7 @@ scan4x4_frame:
 .endm
 
-function x264_dct4x4dc_neon, export=1
+function x264_dct4x4dc_neon
     vld1.64 {d0-d3}, [r0,:128]
     SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
     SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -81,7 +81,7 @@ function x264_dct4x4dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_idct4x4dc_neon, export=1
+function x264_idct4x4dc_neon
     vld1.64 {d0-d3}, [r0,:128]
     SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
     SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -105,7 +105,7 @@ function x264_idct4x4dc_neon, export=1
     vsub.s16 \d3, \d7, \d5
 .endm
 
-function x264_sub4x4_dct_neon, export=1
+function x264_sub4x4_dct_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.32 {d0[]}, [r1,:32], r3
@@ -128,7 +128,7 @@ function x264_sub4x4_dct_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x4_dct_neon, export=1
+function x264_sub8x4_dct_neon
     vld1.64 {d0}, [r1,:64], r3
     vld1.64 {d1}, [r2,:64], ip
     vsubl.u8 q8, d0, d1
@@ -164,7 +164,7 @@ function x264_sub8x4_dct_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x8_dct_neon, export=1
+function x264_sub8x8_dct_neon
     push {lr}
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
@@ -173,7 +173,7 @@ function x264_sub8x8_dct_neon, export=1
     b x264_sub8x4_dct_neon
 .endfunc
 
-function x264_sub16x16_dct_neon, export=1
+function x264_sub16x16_dct_neon
     push {lr}
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
@@ -226,7 +226,7 @@ function x264_sub16x16_dct_neon, export=1
     SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
 .endm
 
-function x264_sub8x8_dct8_neon, export=1
+function x264_sub8x8_dct8_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64], r3
@@ -278,7 +278,7 @@ function x264_sub8x8_dct8_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub16x16_dct8_neon, export=1
+function x264_sub16x16_dct8_neon
     push {lr}
     bl x264_sub8x8_dct8_neon
     sub r1, r1, #FENC_STRIDE*8 - 8
@@ -303,7 +303,7 @@ function x264_sub16x16_dct8_neon, export=1
     vadd.s16 \d6, \d6, \d1
 .endm
 
-function x264_add4x4_idct_neon, export=1
+function x264_add4x4_idct_neon
     mov r2, #FDEC_STRIDE
 
     vld1.64 {d0-d3}, [r1,:128]
@@ -335,7 +335,7 @@ function x264_add4x4_idct_neon, export=1
     bx lr
 .endfunc
 
-function x264_add8x4_idct_neon, export=1
+function x264_add8x4_idct_neon
     vld1.64 {d0-d3}, [r1,:128]!
     IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
     vld1.64 {d4-d7}, [r1,:128]!
@@ -375,7 +375,7 @@ function x264_add8x4_idct_neon, export=1
     bx lr
 .endfunc
 
-function x264_add8x8_idct_neon, export=1
+function x264_add8x8_idct_neon
     mov r2, #FDEC_STRIDE
     mov ip, lr
     bl x264_add8x4_idct_neon
@@ -383,7 +383,7 @@ function x264_add8x8_idct_neon, export=1
     b x264_add8x4_idct_neon
 .endfunc
 
-function x264_add16x16_idct_neon, export=1
+function x264_add16x16_idct_neon
     mov r2, #FDEC_STRIDE
     mov ip, lr
     bl x264_add8x4_idct_neon
@@ -435,7 +435,7 @@ function x264_add16x16_idct_neon, export=1
     SUMSUB_AB q11, q12, q2, q12
 .endm
 
-function x264_add8x8_idct8_neon, export=1
+function x264_add8x8_idct8_neon
     mov r2, #FDEC_STRIDE
     vld1.64 {d16-d19}, [r1,:128]!
     vld1.64 {d20-d23}, [r1,:128]!
@@ -497,7 +497,7 @@ function x264_add8x8_idct8_neon, export=1
     bx lr
 .endfunc
 
-function x264_add16x16_idct8_neon, export=1
+function x264_add16x16_idct8_neon
     mov ip, lr
     bl x264_add8x8_idct8_neon
     sub r0, r0, #8*FDEC_STRIDE-8
@@ -510,7 +510,7 @@ function x264_add16x16_idct8_neon, export=1
 .endfunc
 
 
-function x264_add8x8_idct_dc_neon, export=1
+function x264_add8x8_idct_dc_neon
     mov r2, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64]
     vrshr.s16 d16, d16, #6
@@ -593,7 +593,7 @@ function x264_add8x8_idct_dc_neon, export=1
     vst1.64 {d22-d23}, [r2,:128], r3
 .endm
 
-function x264_add16x16_idct_dc_neon, export=1
+function x264_add16x16_idct_dc_neon
     mov r2, r0
     mov r3, #FDEC_STRIDE
     vmov.i16 q15, #0
@@ -609,7 +609,7 @@ function x264_add16x16_idct_dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x8_dct_dc_neon, export=1
+function x264_sub8x8_dct_dc_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64], r3
@@ -650,7 +650,7 @@ function x264_sub8x8_dct_dc_neon, export=1
 .endfunc
 
 
-function x264_zigzag_scan_4x4_frame_neon, export=1
+function x264_zigzag_scan_4x4_frame_neon
     movrel r2, scan4x4_frame
     vld1.64 {d0-d3}, [r1,:128]
     vld1.64 {d16-d19}, [r2,:128]
diff --git a/common/arm/dct.h b/common/arm/dct.h
index b8cb4a12..55f53ce9 100644
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -34,9 +34,9 @@ void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
 void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
 void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
 
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
 void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_neon( int16_t dct[4][64],
                             uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 6d60242c..f124b556 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -115,7 +115,7 @@
     vqmovun.s16 d1, q12
 .endm
 
-function x264_deblock_v_luma_neon, export=1
+function x264_deblock_v_luma_neon
     h264_loop_filter_start
 
     vld1.64 {d0, d1}, [r0,:128], r1
@@ -141,7 +141,7 @@ function x264_deblock_v_luma_neon, export=1
     bx lr
 .endfunc
 
-function x264_deblock_h_luma_neon, export=1
+function x264_deblock_h_luma_neon
     h264_loop_filter_start
 
     sub r0, r0, #4
@@ -226,7 +226,7 @@ function x264_deblock_h_luma_neon, export=1
     vqmovun.s16 d0, q11
 .endm
 
-function x264_deblock_v_chroma_neon, export=1
+function x264_deblock_v_chroma_neon
     h264_loop_filter_start
 
     sub r0, r0, r1, lsl #1
@@ -244,7 +244,7 @@ function x264_deblock_v_chroma_neon, export=1
     bx lr
 .endfunc
 
-function x264_deblock_h_chroma_neon, export=1
+function x264_deblock_h_chroma_neon
     h264_loop_filter_start
 
     sub r0, r0, #2
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index afd881c7..a62af393 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -30,7 +30,7 @@
 // They also use nothing above armv5te, but we don't care about pre-armv6
 
 // void prefetch_ref( uint8_t *pix, int stride, int parity )
-function x264_prefetch_ref_arm, export=1
+function x264_prefetch_ref_arm
     sub r2, r2, #1
     add r0, r0, #64
     and r2, r2, r1
@@ -50,7 +50,7 @@ function x264_prefetch_ref_arm, export=1
 
 // void prefetch_fenc( uint8_t *pix_y, int stride_y,
 //                     uint8_t *pix_uv, int stride_uv, int mb_x )
-function x264_prefetch_fenc_arm, export=1
+function x264_prefetch_fenc_arm
     ldr ip, [sp]
     push {lr}
     and lr, ip, #3
@@ -76,7 +76,7 @@ function x264_prefetch_fenc_arm, export=1
 
 
 // void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
-function x264_memcpy_aligned_neon, export=1
+function x264_memcpy_aligned_neon
     orr r3, r0, r1, lsr #1
     movrel ip, memcpy_table
     and r3, r3, #0xc
@@ -138,7 +138,7 @@ memcpy_table:
 .ltorg
 
 // void x264_memzero_aligned( void *dst, size_t n )
-function x264_memzero_aligned_neon, export=1
+function x264_memzero_aligned_neon
     vmov.i8 q0, #0
     vmov.i8 q1, #0
 memzero_loop:
@@ -155,7 +155,7 @@ memzero_loop:
 //                 uint8_t *src1, int src1_stride,
 //                 uint8_t *src2, int src2_stride, int weight );
 .macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon, export=1
+function x264_pixel_avg_\w\()x\h\()_neon
     ldr ip, [sp, #8]
     push {r4-r6,lr}
     cmp ip, #32
@@ -230,7 +230,7 @@ AVGH 16, 16
 .endm
 
 .macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w4_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #2
@@ -246,7 +246,7 @@ function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w8_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #4
@@ -270,7 +270,7 @@ function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w16_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #2
@@ -295,7 +295,7 @@ AVG_WEIGHT add_add
 AVG_WEIGHT add_sub
 AVG_WEIGHT sub_add
 
-function x264_pixel_avg_w4_neon, export=1
+function x264_pixel_avg_w4_neon
     subs lr, lr, #2
     vld1.32 {d0[]}, [r2], r3
     vld1.32 {d2[]}, [r4], r5
@@ -309,7 +309,7 @@ function x264_pixel_avg_w4_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w8_neon, export=1
+function x264_pixel_avg_w8_neon
     subs lr, lr, #4
     vld1.64 {d0}, [r2], r3
     vld1.64 {d2}, [r4], r5
@@ -331,7 +331,7 @@ function x264_pixel_avg_w8_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w16_neon, export=1
+function x264_pixel_avg_w16_neon
     subs lr, lr, #4
     vld1.64 {d0-d1}, [r2], r3
     vld1.64 {d2-d3}, [r4], r5
@@ -354,7 +354,7 @@ function x264_pixel_avg_w16_neon, export=1
 .endfunc
 
 
-function x264_pixel_avg2_w4_neon, export=1
+function x264_pixel_avg2_w4_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -372,7 +372,7 @@ avg2_w4_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w8_neon, export=1
+function x264_pixel_avg2_w8_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -390,7 +390,7 @@ avg2_w8_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w16_neon, export=1
+function x264_pixel_avg2_w16_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -408,7 +408,7 @@ avg2_w16_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w20_neon, export=1
+function x264_pixel_avg2_w20_neon
     ldr ip, [sp, #4]
     push {lr}
     sub r1, r1, #16
@@ -432,8 +432,8 @@ avg2_w20_loop:
 .endfunc
 
 
-// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height )
-function x264_mc_copy_w4_neon, export=1
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon
     ldr ip, [sp]
copy_w4_loop:
     subs ip, ip, #4
@@ -449,7 +449,7 @@ copy_w4_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w8_neon, export=1
+function x264_mc_copy_w8_neon
     ldr ip, [sp]
copy_w8_loop:
     subs ip, ip, #4
@@ -465,7 +465,7 @@ copy_w8_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w16_neon, export=1
+function x264_mc_copy_w16_neon
     ldr ip, [sp]
copy_w16_loop:
     subs ip, ip, #4
@@ -481,7 +481,7 @@ copy_w16_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w16_aligned_neon, export=1
+function x264_mc_copy_w16_aligned_neon
     ldr ip, [sp]
copy_w16_aligned_loop:
     subs ip, ip, #4
@@ -501,7 +501,7 @@ copy_w16_aligned_loop:
 // void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
 //                           uint8_t *src, int i_src_stride,
 //                           int dx, int dy, int i_width, int i_height );
-function x264_mc_chroma_neon, export=1
+function x264_mc_chroma_neon
     push {r4-r6, lr}
     ldrd r4, [sp, #16]
     ldr r6, [sp, #24]
@@ -741,7 +741,7 @@ mc_chroma_w8:
 
 
 // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
-function x264_hpel_filter_v_neon, export=1
+function x264_hpel_filter_v_neon
     ldr ip, [sp]
     sub r1, r1, r3, lsl #1
     push {lr}
@@ -781,7 +781,7 @@ filter_v_loop:
 .endfunc
 
 // hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
-function x264_hpel_filter_c_neon, export=1
+function x264_hpel_filter_c_neon
     sub r1, #16
     vld1.64 {d0-d3}, [r1,:128]!
@@ -866,7 +866,7 @@ filter_c_loop:
 .endfunc
 
 // hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
-function x264_hpel_filter_h_neon, export=1
+function x264_hpel_filter_h_neon
     sub r1, #16
     vmov.u8 d30, #5
     vld1.64 {d0-d3}, [r1,:128]!
@@ -956,7 +956,7 @@ filter_h_loop:
 // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
 //                         uint8_t *dstc, int src_stride, int dst_stride, int width,
 //                         int height )
-function x264_frame_init_lowres_core_neon, export=1
+function x264_frame_init_lowres_core_neon
     push {r4-r10,lr}
     vpush {d8-d15}
     ldrd r4, [sp, #96]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 201dc6b0..c6aaeb0e 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -27,7 +27,7 @@
 void x264_prefetch_ref_arm( uint8_t *, int, int );
 void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
 void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
 void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index effe9395..ca406acd 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -40,7 +40,7 @@ mask_ac8:
 .text
 
 .macro SAD4_ARMV6 h
-function x264_pixel_sad_4x\h\()_armv6, export=1
+function x264_pixel_sad_4x\h\()_armv6
     push {r4-r6,lr}
     ldr r4, [r2], r3
     ldr r5, [r0], r1
@@ -109,7 +109,7 @@ SAD4_ARMV6 8
 .endm
 
 .macro SAD_FUNC w, h, name, align:vararg
-function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
 .if \w == 16
     .set r, \h / 2 - 1
 .else
@@ -199,7 +199,7 @@ SAD_FUNC 16, 16, _aligned, ,:128
 .endm
 
 .macro SAD_FUNC_DUAL w, h
-function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
     SAD_DUAL_START_\w
 .rept \h / 2 - \w / 8
     SAD_DUAL_\w
@@ -321,7 +321,7 @@ SAD_FUNC_DUAL 16, 16
 .endm
 
 .macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
     push {r6-r7,lr}
 .if \x == 3
     ldrd r6, [sp, #12]
@@ -463,7 +463,7 @@ SAD_X_FUNC 4, 16, 16
 .endm
 
 .macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon, export=1
+function x264_pixel_ssd_\w\()x\h\()_neon
     SSD_START_\w
 .rept \h-2
     SSD_\w
@@ -491,7 +491,7 @@ SSD_FUNC 16, 16
     \vpadal \qsqr_sum, \qsqr_last
 .endm
 
-function x264_pixel_var_8x8_neon, export=1
+function x264_pixel_var_8x8_neon
     vld1.64 {d16}, [r0,:64], r1
     vmull.u8 q1, d16, d16
     vmovl.u8 q0, d16
@@ -517,7 +517,7 @@ function x264_pixel_var_8x8_neon, export=1
     b x264_var_end
 .endfunc
 
-function x264_pixel_var_16x16_neon, export=1
+function x264_pixel_var_16x16_neon
     vld1.64 {d16-d17}, [r0,:128], r1
     vmull.u8 q12, d16, d16
     vmovl.u8 q0, d16
@@ -573,7 +573,7 @@ function x264_var_end
     vmlal.s16 \acc, \d1, \d1
 .endm
 
-function x264_pixel_var2_8x8_neon, export=1
+function x264_pixel_var2_8x8_neon
     DIFF_SUM q0, d0, d1
     DIFF_SUM q8, d16, d17
     SQR_ACC q1, d0, d1, vmull.s16
@@ -620,7 +620,7 @@ function x264_pixel_var2_8x8_neon, export=1
     vsubl.u8 \q3, d6, d7
 .endm
 
-function x264_pixel_satd_4x4_neon, export=1
+function x264_pixel_satd_4x4_neon
     vld1.32 {d1[]}, [r2], r3
     vld1.32 {d0[]}, [r0,:32], r1
     vld1.32 {d3[]}, [r2], r3
@@ -642,7 +642,7 @@ function x264_pixel_satd_4x4_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_4x8_neon, export=1
+function x264_pixel_satd_4x8_neon
     vld1.32 {d1[]}, [r2], r3
     vld1.32 {d0[]}, [r0,:32], r1
     vld1.32 {d3[]}, [r2], r3
@@ -669,7 +669,7 @@ function x264_pixel_satd_4x8_neon, export=1
     b x264_satd_4x8_8x4_end_neon
 .endfunc
 
-function x264_pixel_satd_8x4_neon, export=1
+function x264_pixel_satd_8x4_neon
     vld1.64 {d1}, [r2], r3
     vld1.64 {d0}, [r0,:64], r1
     vsubl.u8 q0, d0, d1
@@ -713,7 +713,7 @@ function x264_satd_4x8_8x4_end_neon
     bx lr
 .endfunc
 
-function x264_pixel_satd_8x8_neon, export=1
+function x264_pixel_satd_8x8_neon
     mov ip, lr
 
     bl x264_satd_8x8_neon
@@ -727,7 +727,7 @@ function x264_pixel_satd_8x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_8x16_neon, export=1
+function x264_pixel_satd_8x16_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -798,7 +798,7 @@ function x264_satd_8x4v_8x8h_neon
     bx lr
 .endfunc
 
-function x264_pixel_satd_16x8_neon, export=1
+function x264_pixel_satd_16x8_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -820,7 +820,7 @@ function x264_pixel_satd_16x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_16x16_neon, export=1
+function x264_pixel_satd_16x16_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -879,7 +879,7 @@ function x264_satd_16x4_neon
 .endfunc
 
 
-function x264_pixel_sa8d_8x8_neon, export=1
+function x264_pixel_sa8d_8x8_neon
     mov ip, lr
     bl x264_sa8d_8x8_neon
     vadd.u16 q0, q8, q9
@@ -891,7 +891,7 @@ function x264_pixel_sa8d_8x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_sa8d_16x16_neon, export=1
+function x264_pixel_sa8d_16x16_neon
     vpush {d8-d11}
     mov ip, lr
     bl x264_sa8d_8x8_neon
@@ -988,7 +988,7 @@ function x264_sa8d_8x8_neon
 
 
 .macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
     vpush {d8-d15}
     movrel ip, mask_ac4
     vmov.i8 q4, #0
@@ -1143,7 +1143,7 @@ function x264_hadamard_ac_8x8_neon
     vmull.u8 \ssb, \db, \db
 .endm
 
-function x264_pixel_ssim_4x4x2_core_neon, export=1
+function x264_pixel_ssim_4x4x2_core_neon
     ldr ip, [sp]
     vld1.64 {d0}, [r0], r1
     vld1.64 {d2}, [r2], r3
@@ -1172,7 +1172,7 @@ function x264_pixel_ssim_4x4x2_core_neon, export=1
 .endfunc
 
 // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
-function x264_pixel_ssim_end4_neon, export=1
+function x264_pixel_ssim_end4_neon
     vld1.32 {d16-d19}, [r0,:128]!
     vld1.32 {d20-d23}, [r1,:128]!
     vadd.s32 q0, q8, q10
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 8ff61a23..9a914784 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -32,7 +32,7 @@ pw_76543210: .short 7,6,5,4,3,2,1,0
 .text
 
 // because gcc doesn't believe in using the free shift in add
-function x264_predict_4x4_h_armv6, export=1
+function x264_predict_4x4_h_armv6
     ldrb r1, [r0, #0*FDEC_STRIDE-1]
     ldrb r2, [r0, #1*FDEC_STRIDE-1]
    ldrb r3, [r0, #2*FDEC_STRIDE-1]
@@ -52,7 +52,7 @@ function x264_predict_4x4_h_armv6, export=1
     bx lr
 .endfunc
 
-function x264_predict_4x4_dc_armv6, export=1
+function x264_predict_4x4_dc_armv6
     mov ip, #0
     ldr r1, [r0, #-FDEC_STRIDE]
     ldrb r2, [r0, #0*FDEC_STRIDE-1]
@@ -89,7 +89,7 @@ function x264_predict_4x4_dc_armv6, export=1
     uadd8 \a2, \a2, \c2
 .endm
 
-function x264_predict_4x4_ddr_armv6, export=1
+function x264_predict_4x4_ddr_armv6
     ldr r1, [r0, # -FDEC_STRIDE]
     ldrb r2, [r0, # -FDEC_STRIDE-1]
     ldrb r3, [r0, #0*FDEC_STRIDE-1]
@@ -118,7 +118,7 @@ function x264_predict_4x4_ddr_armv6, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_predict_4x4_ddl_neon, export=1
+function x264_predict_4x4_ddl_neon
     sub r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0}, [r0], ip
@@ -137,7 +137,7 @@ function x264_predict_4x4_ddl_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8_dc_neon, export=1
+function x264_predict_8x8_dc_neon
     mov ip, #0
     ldrd r2, [r1, #8]
     push {r4-r5,lr}
@@ -162,7 +162,7 @@ function x264_predict_8x8_dc_neon, export=1
 .endfunc
 
 
-function x264_predict_8x8_h_neon, export=1
+function x264_predict_8x8_h_neon
     add r1, r1, #7
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1]
@@ -185,7 +185,7 @@ function x264_predict_8x8_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8c_h_neon, export=1
+function x264_predict_8x8c_h_neon
     sub r1, r0, #1
     mov ip, #FDEC_STRIDE
 .rept 4
@@ -197,7 +197,7 @@ function x264_predict_8x8c_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8c_v_neon, export=1
+function x264_predict_8x8c_v_neon
     sub r0, r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0}, [r0,:64], ip
@@ -208,7 +208,7 @@ function x264_predict_8x8c_v_neon, export=1
 .endfunc
 
 
-function x264_predict_16x16_dc_neon, export=1
+function x264_predict_16x16_dc_neon
     sub r3, r0, #FDEC_STRIDE
     sub r0, r0, #1
     vld1.64 {d0-d1}, [r3,:128]
@@ -245,7 +245,7 @@ function x264_predict_16x16_dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_16x16_h_neon, export=1
+function x264_predict_16x16_h_neon
     sub r1, r0, #1
     mov ip, #FDEC_STRIDE
 .rept 8
@@ -259,7 +259,7 @@ function x264_predict_16x16_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_16x16_v_neon, export=1
+function x264_predict_16x16_v_neon
     sub r0, r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0-d1}, [r0,:128], ip
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 4dd71829..0b49eb47 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -64,7 +64,7 @@ pmovmskb_byte:
 .endm
 
 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
-function x264_quant_2x2_dc_neon, export=1
+function x264_quant_2x2_dc_neon
     vld1.64 {d0}, [r0,:64]
     vabs.s16 d3, d0
     vdup.16 d2, r2
@@ -80,7 +80,7 @@ function x264_quant_2x2_dc_neon, export=1
 .endfunc
 
 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
-function x264_quant_4x4_dc_neon, export=1
+function x264_quant_4x4_dc_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -92,7 +92,7 @@ function x264_quant_4x4_dc_neon, export=1
 .endfunc
 
 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon, export=1
+function x264_quant_4x4_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -104,7 +104,7 @@ function x264_quant_4x4_neon, export=1
 .endfunc
 
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon, export=1
+function x264_quant_8x8_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -139,7 +139,7 @@ function x264_quant_8x8_neon, export=1
 
 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 .macro DEQUANT size bits
-function x264_dequant_\size\()_neon, export=1
+function x264_dequant_\size\()_neon
     DEQUANT_START \bits+2, \bits
 .ifc \size, 8x8
     mov r2, #4
@@ -220,7 +220,7 @@ DEQUANT 4x4, 4
 DEQUANT 8x8, 6
 
 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-function x264_dequant_4x4_dc_neon, export=1
+function x264_dequant_4x4_dc_neon
     DEQUANT_START 6, 6, yes
 
     blt dequant_4x4_dc_rshift
@@ -267,7 +267,7 @@ dequant_4x4_dc_rshift:
 
 
 // int coeff_last( int16_t *l )
-function x264_coeff_last4_arm, export=1
+function x264_coeff_last4_arm
     ldrd r2, [r0]
     subs r0, r3, #0
     movne r0, #2
@@ -278,7 +278,7 @@ function x264_coeff_last4_arm, export=1
 .endfunc
 
 .macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon, export=1
+function x264_coeff_last\size\()_neon
 .if \size == 15
     sub r0, r0, #2
     vld1.64 {d0-d3}, [r0]
@@ -306,7 +306,7 @@ function x264_coeff_last\size\()_neon, export=1
 COEFF_LAST_1x 15
 COEFF_LAST_1x 16
 
-function x264_coeff_last64_neon, export=1
+function x264_coeff_last64_neon
     vld1.64 {d16-d19}, [r0,:128]!
     vqmovn.u16 d16, q8
     vqmovn.u16 d17, q9