From: David Conrad
Date: Sat, 7 Nov 2009 17:25:18 +0000 (-0800)
Subject: Various ARM-related fixes
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=53a5772a35451c897366adda72d3a44c13103c38;p=libx264

Various ARM-related fixes

Fix comment for mc_copy_neon.
Fix memzero_aligned_neon prototype.
Update NEON (i)dct_dc prototypes.
Duplicate x86 behavior for global+hidden functions.
---

diff --git a/Makefile b/Makefile
index 3de0b616..8ba8d815 100644
--- a/Makefile
+++ b/Makefile
@@ -119,6 +119,7 @@ checkasm: tools/checkasm.o libx264.a
 
 %.o: %.S
 	$(AS) $(ASFLAGS) -o $@ $<
+	-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
 
 .depend: config.mak
 	rm -f .depend
diff --git a/common/arm/asm.S b/common/arm/asm.S
index f7b9f141..d1631656 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -20,19 +20,24 @@
 
 #include "config.h"
 
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+
 .macro require8, val=1
-    .eabi_attribute 24, \val
+ELF .eabi_attribute 24, \val
 .endm
 
 .macro preserve8, val=1
-    .eabi_attribute 25, \val
+ELF .eabi_attribute 25, \val
 .endm
 
-.macro function name, export=0
-.if \export
+.macro function name
     .global \name
-.endif
-    .type \name, %function
+ELF .hidden \name
+ELF .type \name, %function
     .func \name
 \name:
 .endm
diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
index ccde3bb4..40eff039 100644
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -27,7 +27,7 @@
 
 // done in gas because .fpu neon overrides the refusal to assemble
 // instructions the selected -march/-mcpu doesn't support
-function x264_cpu_neon_test, export=1
+function x264_cpu_neon_test
     vadd.i16 q0, q0, q0
     bx lr
 .endfunc
@@ -62,7 +62,7 @@ function x264_cpu_disable_armv7_counter
 
 // return: 0 if transfers neon -> arm transfers take more than 10 cycles
 //         nonzero otherwise
-function x264_cpu_fast_neon_mrc_test, export=1
+function x264_cpu_fast_neon_mrc_test
     // check for user access to performance counters
     mrc p15, 0, r0, c9, c14, 0
     cmp r0, #0
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index 7a7b0381..0ed72384 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -62,7 +62,7 @@ scan4x4_frame:
 .endm
 
-function x264_dct4x4dc_neon, export=1
+function x264_dct4x4dc_neon
     vld1.64 {d0-d3}, [r0,:128]
     SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
     SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -81,7 +81,7 @@ function x264_dct4x4dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_idct4x4dc_neon, export=1
+function x264_idct4x4dc_neon
     vld1.64 {d0-d3}, [r0,:128]
     SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
     SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -105,7 +105,7 @@ function x264_idct4x4dc_neon, export=1
     vsub.s16 \d3, \d7, \d5
 .endm
 
-function x264_sub4x4_dct_neon, export=1
+function x264_sub4x4_dct_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.32 {d0[]}, [r1,:32], r3
@@ -128,7 +128,7 @@ function x264_sub4x4_dct_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x4_dct_neon, export=1
+function x264_sub8x4_dct_neon
     vld1.64 {d0}, [r1,:64], r3
     vld1.64 {d1}, [r2,:64], ip
     vsubl.u8 q8, d0, d1
@@ -164,7 +164,7 @@ function x264_sub8x4_dct_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x8_dct_neon, export=1
+function x264_sub8x8_dct_neon
     push {lr}
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
@@ -173,7 +173,7 @@ function x264_sub8x8_dct_neon, export=1
     b x264_sub8x4_dct_neon
 .endfunc
 
-function x264_sub16x16_dct_neon, export=1
+function x264_sub16x16_dct_neon
     push {lr}
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
@@ -226,7 +226,7 @@ function x264_sub16x16_dct_neon, export=1
     SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
 .endm
 
-function x264_sub8x8_dct8_neon, export=1
+function x264_sub8x8_dct8_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64], r3
@@ -278,7 +278,7 @@ function x264_sub8x8_dct8_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub16x16_dct8_neon, export=1
+function x264_sub16x16_dct8_neon
     push {lr}
     bl x264_sub8x8_dct8_neon
     sub r1, r1, #FENC_STRIDE*8 - 8
@@ -303,7 +303,7 @@ function x264_sub16x16_dct8_neon, export=1
     vadd.s16 \d6, \d6, \d1
 .endm
 
-function x264_add4x4_idct_neon, export=1
+function x264_add4x4_idct_neon
     mov r2, #FDEC_STRIDE
 
     vld1.64 {d0-d3}, [r1,:128]
@@ -335,7 +335,7 @@ function x264_add4x4_idct_neon, export=1
     bx lr
 .endfunc
 
-function x264_add8x4_idct_neon, export=1
+function x264_add8x4_idct_neon
     vld1.64 {d0-d3}, [r1,:128]!
     IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
     vld1.64 {d4-d7}, [r1,:128]!
@@ -375,7 +375,7 @@ function x264_add8x4_idct_neon, export=1
     bx lr
 .endfunc
 
-function x264_add8x8_idct_neon, export=1
+function x264_add8x8_idct_neon
     mov r2, #FDEC_STRIDE
     mov ip, lr
     bl x264_add8x4_idct_neon
@@ -383,7 +383,7 @@ function x264_add8x8_idct_neon, export=1
     b x264_add8x4_idct_neon
 .endfunc
 
-function x264_add16x16_idct_neon, export=1
+function x264_add16x16_idct_neon
     mov r2, #FDEC_STRIDE
     mov ip, lr
     bl x264_add8x4_idct_neon
@@ -435,7 +435,7 @@ function x264_add16x16_idct_neon, export=1
     SUMSUB_AB q11, q12, q2, q12
 .endm
 
-function x264_add8x8_idct8_neon, export=1
+function x264_add8x8_idct8_neon
     mov r2, #FDEC_STRIDE
     vld1.64 {d16-d19}, [r1,:128]!
     vld1.64 {d20-d23}, [r1,:128]!
@@ -497,7 +497,7 @@ function x264_add8x8_idct8_neon, export=1
     bx lr
 .endfunc
 
-function x264_add16x16_idct8_neon, export=1
+function x264_add16x16_idct8_neon
     mov ip, lr
     bl x264_add8x8_idct8_neon
     sub r0, r0, #8*FDEC_STRIDE-8
@@ -510,7 +510,7 @@ function x264_add16x16_idct8_neon, export=1
 .endfunc
 
 
-function x264_add8x8_idct_dc_neon, export=1
+function x264_add8x8_idct_dc_neon
     mov r2, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64]
     vrshr.s16 d16, d16, #6
@@ -593,7 +593,7 @@ function x264_add8x8_idct_dc_neon, export=1
     vst1.64 {d22-d23}, [r2,:128], r3
 .endm
 
-function x264_add16x16_idct_dc_neon, export=1
+function x264_add16x16_idct_dc_neon
     mov r2, r0
     mov r3, #FDEC_STRIDE
     vmov.i16 q15, #0
@@ -609,7 +609,7 @@ function x264_add16x16_idct_dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_sub8x8_dct_dc_neon, export=1
+function x264_sub8x8_dct_dc_neon
     mov r3, #FENC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1,:64], r3
@@ -650,7 +650,7 @@ function x264_sub8x8_dct_dc_neon, export=1
 .endfunc
 
 
-function x264_zigzag_scan_4x4_frame_neon, export=1
+function x264_zigzag_scan_4x4_frame_neon
     movrel r2, scan4x4_frame
     vld1.64 {d0-d3}, [r1,:128]
     vld1.64 {d16-d19}, [r2,:128]
diff --git a/common/arm/dct.h b/common/arm/dct.h
index b8cb4a12..55f53ce9 100644
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -34,9 +34,9 @@ void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
 void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
 void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
 
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
 void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_neon( int16_t dct[4][64],
                             uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 6d60242c..f124b556 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -115,7 +115,7 @@
     vqmovun.s16 d1, q12
 .endm
 
-function x264_deblock_v_luma_neon, export=1
+function x264_deblock_v_luma_neon
     h264_loop_filter_start
 
     vld1.64 {d0, d1}, [r0,:128], r1
@@ -141,7 +141,7 @@ function x264_deblock_v_luma_neon, export=1
     bx lr
 .endfunc
 
-function x264_deblock_h_luma_neon, export=1
+function x264_deblock_h_luma_neon
     h264_loop_filter_start
 
     sub r0, r0, #4
@@ -226,7 +226,7 @@ function x264_deblock_h_luma_neon, export=1
     vqmovun.s16 d0, q11
 .endm
 
-function x264_deblock_v_chroma_neon, export=1
+function x264_deblock_v_chroma_neon
     h264_loop_filter_start
 
     sub r0, r0, r1, lsl #1
@@ -244,7 +244,7 @@ function x264_deblock_v_chroma_neon, export=1
     bx lr
 .endfunc
 
-function x264_deblock_h_chroma_neon, export=1
+function x264_deblock_h_chroma_neon
     h264_loop_filter_start
 
     sub r0, r0, #2
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index afd881c7..a62af393 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -30,7 +30,7 @@
 // They also use nothing above armv5te, but we don't care about pre-armv6
 
 // void prefetch_ref( uint8_t *pix, int stride, int parity )
-function x264_prefetch_ref_arm, export=1
+function x264_prefetch_ref_arm
     sub r2, r2, #1
     add r0, r0, #64
     and r2, r2, r1
@@ -50,7 +50,7 @@ function x264_prefetch_ref_arm, export=1
 
 // void prefetch_fenc( uint8_t *pix_y, int stride_y,
 //                     uint8_t *pix_uv, int stride_uv, int mb_x )
-function x264_prefetch_fenc_arm, export=1
+function x264_prefetch_fenc_arm
     ldr ip, [sp]
     push {lr}
     and lr, ip, #3
@@ -76,7 +76,7 @@ function x264_prefetch_fenc_arm, export=1
 
 
 // void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
-function x264_memcpy_aligned_neon, export=1
+function x264_memcpy_aligned_neon
     orr r3, r0, r1, lsr #1
     movrel ip, memcpy_table
     and r3, r3, #0xc
@@ -138,7 +138,7 @@ memcpy_table:
 .ltorg
 
 // void x264_memzero_aligned( void *dst, size_t n )
-function x264_memzero_aligned_neon, export=1
+function x264_memzero_aligned_neon
     vmov.i8 q0, #0
     vmov.i8 q1, #0
 memzero_loop:
@@ -155,7 +155,7 @@ memzero_loop:
 //                 uint8_t *src1, int src1_stride,
 //                 uint8_t *src2, int src2_stride, int weight );
 .macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon, export=1
+function x264_pixel_avg_\w\()x\h\()_neon
     ldr ip, [sp, #8]
     push {r4-r6,lr}
     cmp ip, #32
@@ -230,7 +230,7 @@ AVGH 16, 16
 .endm
 
 .macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w4_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #2
@@ -246,7 +246,7 @@ function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w8_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #4
@@ -270,7 +270,7 @@ function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w16_\ext\()_neon
     load_weights_\ext
 1:  // height loop
     subs lr, lr, #2
@@ -295,7 +295,7 @@ AVG_WEIGHT add_add
 AVG_WEIGHT add_sub
 AVG_WEIGHT sub_add
 
-function x264_pixel_avg_w4_neon, export=1
+function x264_pixel_avg_w4_neon
     subs lr, lr, #2
     vld1.32 {d0[]}, [r2], r3
     vld1.32 {d2[]}, [r4], r5
@@ -309,7 +309,7 @@ function x264_pixel_avg_w4_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w8_neon, export=1
+function x264_pixel_avg_w8_neon
     subs lr, lr, #4
     vld1.64 {d0}, [r2], r3
     vld1.64 {d2}, [r4], r5
@@ -331,7 +331,7 @@ function x264_pixel_avg_w8_neon, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w16_neon, export=1
+function x264_pixel_avg_w16_neon
     subs lr, lr, #4
     vld1.64 {d0-d1}, [r2], r3
     vld1.64 {d2-d3}, [r4], r5
@@ -354,7 +354,7 @@ function x264_pixel_avg_w16_neon, export=1
 .endfunc
 
 
-function x264_pixel_avg2_w4_neon, export=1
+function x264_pixel_avg2_w4_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -372,7 +372,7 @@ avg2_w4_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w8_neon, export=1
+function x264_pixel_avg2_w8_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -390,7 +390,7 @@ avg2_w8_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w16_neon, export=1
+function x264_pixel_avg2_w16_neon
     ldr ip, [sp, #4]
     push {lr}
     ldr lr, [sp, #4]
@@ -408,7 +408,7 @@ avg2_w16_loop:
     pop {pc}
 .endfunc
 
-function x264_pixel_avg2_w20_neon, export=1
+function x264_pixel_avg2_w20_neon
     ldr ip, [sp, #4]
     push {lr}
     sub r1, r1, #16
@@ -432,8 +432,8 @@ avg2_w20_loop:
 .endfunc
 
 
-// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height )
-function x264_mc_copy_w4_neon, export=1
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon
     ldr ip, [sp]
copy_w4_loop:
     subs ip, ip, #4
@@ -449,7 +449,7 @@ copy_w4_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w8_neon, export=1
+function x264_mc_copy_w8_neon
     ldr ip, [sp]
copy_w8_loop:
     subs ip, ip, #4
@@ -465,7 +465,7 @@ copy_w8_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w16_neon, export=1
+function x264_mc_copy_w16_neon
     ldr ip, [sp]
copy_w16_loop:
     subs ip, ip, #4
@@ -481,7 +481,7 @@ copy_w16_loop:
     bx lr
 .endfunc
 
-function x264_mc_copy_w16_aligned_neon, export=1
+function x264_mc_copy_w16_aligned_neon
     ldr ip, [sp]
copy_w16_aligned_loop:
     subs ip, ip, #4
@@ -501,7 +501,7 @@ copy_w16_aligned_loop:
 // void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
 //                           uint8_t *src, int i_src_stride,
 //                           int dx, int dy, int i_width, int i_height );
-function x264_mc_chroma_neon, export=1
+function x264_mc_chroma_neon
     push {r4-r6, lr}
     ldrd r4, [sp, #16]
     ldr r6, [sp, #24]
@@ -741,7 +741,7 @@ mc_chroma_w8:
 
 
 // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
-function x264_hpel_filter_v_neon, export=1
+function x264_hpel_filter_v_neon
     ldr ip, [sp]
     sub r1, r1, r3, lsl #1
     push {lr}
@@ -781,7 +781,7 @@ filter_v_loop:
 .endfunc
 
 // hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
-function x264_hpel_filter_c_neon, export=1
+function x264_hpel_filter_c_neon
     sub r1, #16
     vld1.64 {d0-d3}, [r1,:128]!
@@ -866,7 +866,7 @@ filter_c_loop:
 .endfunc
 
 // hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
-function x264_hpel_filter_h_neon, export=1
+function x264_hpel_filter_h_neon
     sub r1, #16
     vmov.u8 d30, #5
     vld1.64 {d0-d3}, [r1,:128]!
@@ -956,7 +956,7 @@ filter_h_loop:
 // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
 //                         uint8_t *dstc, int src_stride, int dst_stride, int width,
 //                         int height )
-function x264_frame_init_lowres_core_neon, export=1
+function x264_frame_init_lowres_core_neon
     push {r4-r10,lr}
     vpush {d8-d15}
     ldrd r4, [sp, #96]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 201dc6b0..c6aaeb0e 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -27,7 +27,7 @@
 void x264_prefetch_ref_arm( uint8_t *, int, int );
 void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
 void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
 void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index effe9395..ca406acd 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -40,7 +40,7 @@ mask_ac8:
 .text
 
 .macro SAD4_ARMV6 h
-function x264_pixel_sad_4x\h\()_armv6, export=1
+function x264_pixel_sad_4x\h\()_armv6
     push {r4-r6,lr}
     ldr r4, [r2], r3
     ldr r5, [r0], r1
@@ -109,7 +109,7 @@ SAD4_ARMV6 8
 .endm
 
 .macro SAD_FUNC w, h, name, align:vararg
-function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
 .if \w == 16
     .set r, \h / 2 - 1
 .else
@@ -199,7 +199,7 @@ SAD_FUNC 16, 16, _aligned, ,:128
 .endm
 
 .macro SAD_FUNC_DUAL w, h
-function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
     SAD_DUAL_START_\w
 .rept \h / 2 - \w / 8
     SAD_DUAL_\w
@@ -321,7 +321,7 @@ SAD_FUNC_DUAL 16, 16
 .endm
 
 .macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
     push {r6-r7,lr}
 .if \x == 3
     ldrd r6, [sp, #12]
@@ -463,7 +463,7 @@ SAD_X_FUNC 4, 16, 16
 .endm
 
 .macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon, export=1
+function x264_pixel_ssd_\w\()x\h\()_neon
     SSD_START_\w
 .rept \h-2
     SSD_\w
@@ -491,7 +491,7 @@ SSD_FUNC 16, 16
     \vpadal \qsqr_sum, \qsqr_last
 .endm
 
-function x264_pixel_var_8x8_neon, export=1
+function x264_pixel_var_8x8_neon
     vld1.64 {d16}, [r0,:64], r1
     vmull.u8 q1, d16, d16
     vmovl.u8 q0, d16
@@ -517,7 +517,7 @@ function x264_pixel_var_8x8_neon, export=1
     b x264_var_end
 .endfunc
 
-function x264_pixel_var_16x16_neon, export=1
+function x264_pixel_var_16x16_neon
     vld1.64 {d16-d17}, [r0,:128], r1
     vmull.u8 q12, d16, d16
     vmovl.u8 q0, d16
@@ -573,7 +573,7 @@ function x264_var_end
     vmlal.s16 \acc, \d1, \d1
 .endm
 
-function x264_pixel_var2_8x8_neon, export=1
+function x264_pixel_var2_8x8_neon
     DIFF_SUM q0, d0, d1
     DIFF_SUM q8, d16, d17
     SQR_ACC q1, d0, d1, vmull.s16
@@ -620,7 +620,7 @@ function x264_pixel_var2_8x8_neon, export=1
     vsubl.u8 \q3, d6, d7
 .endm
 
-function x264_pixel_satd_4x4_neon, export=1
+function x264_pixel_satd_4x4_neon
     vld1.32 {d1[]}, [r2], r3
     vld1.32 {d0[]}, [r0,:32], r1
     vld1.32 {d3[]}, [r2], r3
@@ -642,7 +642,7 @@ function x264_pixel_satd_4x4_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_4x8_neon, export=1
+function x264_pixel_satd_4x8_neon
     vld1.32 {d1[]}, [r2], r3
     vld1.32 {d0[]}, [r0,:32], r1
     vld1.32 {d3[]}, [r2], r3
@@ -669,7 +669,7 @@ function x264_pixel_satd_4x8_neon, export=1
     b x264_satd_4x8_8x4_end_neon
 .endfunc
 
-function x264_pixel_satd_8x4_neon, export=1
+function x264_pixel_satd_8x4_neon
     vld1.64 {d1}, [r2], r3
     vld1.64 {d0}, [r0,:64], r1
     vsubl.u8 q0, d0, d1
@@ -713,7 +713,7 @@ function x264_satd_4x8_8x4_end_neon
     bx lr
 .endfunc
 
-function x264_pixel_satd_8x8_neon, export=1
+function x264_pixel_satd_8x8_neon
     mov ip, lr
 
     bl x264_satd_8x8_neon
@@ -727,7 +727,7 @@ function x264_pixel_satd_8x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_8x16_neon, export=1
+function x264_pixel_satd_8x16_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -798,7 +798,7 @@ function x264_satd_8x4v_8x8h_neon
     bx lr
 .endfunc
 
-function x264_pixel_satd_16x8_neon, export=1
+function x264_pixel_satd_16x8_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -820,7 +820,7 @@ function x264_pixel_satd_16x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_satd_16x16_neon, export=1
+function x264_pixel_satd_16x16_neon
     vpush {d8-d11}
     mov ip, lr
 
@@ -879,7 +879,7 @@ function x264_satd_16x4_neon
 .endfunc
 
 
-function x264_pixel_sa8d_8x8_neon, export=1
+function x264_pixel_sa8d_8x8_neon
     mov ip, lr
     bl x264_sa8d_8x8_neon
     vadd.u16 q0, q8, q9
@@ -891,7 +891,7 @@ function x264_pixel_sa8d_8x8_neon, export=1
     bx lr
 .endfunc
 
-function x264_pixel_sa8d_16x16_neon, export=1
+function x264_pixel_sa8d_16x16_neon
     vpush {d8-d11}
     mov ip, lr
     bl x264_sa8d_8x8_neon
@@ -988,7 +988,7 @@ function x264_sa8d_8x8_neon
 
 
 .macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
     vpush {d8-d15}
     movrel ip, mask_ac4
     vmov.i8 q4, #0
@@ -1143,7 +1143,7 @@ function x264_hadamard_ac_8x8_neon
     vmull.u8 \ssb, \db, \db
 .endm
 
-function x264_pixel_ssim_4x4x2_core_neon, export=1
+function x264_pixel_ssim_4x4x2_core_neon
     ldr ip, [sp]
     vld1.64 {d0}, [r0], r1
     vld1.64 {d2}, [r2], r3
@@ -1172,7 +1172,7 @@ function x264_pixel_ssim_4x4x2_core_neon, export=1
 .endfunc
 
 // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
-function x264_pixel_ssim_end4_neon, export=1
+function x264_pixel_ssim_end4_neon
     vld1.32 {d16-d19}, [r0,:128]!
     vld1.32 {d20-d23}, [r1,:128]!
     vadd.s32 q0, q8, q10
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 8ff61a23..9a914784 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -32,7 +32,7 @@ pw_76543210: .short 7,6,5,4,3,2,1,0
 .text
 
 // because gcc doesn't believe in using the free shift in add
-function x264_predict_4x4_h_armv6, export=1
+function x264_predict_4x4_h_armv6
     ldrb r1, [r0, #0*FDEC_STRIDE-1]
     ldrb r2, [r0, #1*FDEC_STRIDE-1]
    ldrb r3, [r0, #2*FDEC_STRIDE-1]
@@ -52,7 +52,7 @@ function x264_predict_4x4_h_armv6, export=1
     bx lr
 .endfunc
 
-function x264_predict_4x4_dc_armv6, export=1
+function x264_predict_4x4_dc_armv6
     mov ip, #0
     ldr r1, [r0, #-FDEC_STRIDE]
     ldrb r2, [r0, #0*FDEC_STRIDE-1]
@@ -89,7 +89,7 @@ function x264_predict_4x4_dc_armv6, export=1
     uadd8 \a2, \a2, \c2
 .endm
 
-function x264_predict_4x4_ddr_armv6, export=1
+function x264_predict_4x4_ddr_armv6
     ldr r1, [r0, # -FDEC_STRIDE]
     ldrb r2, [r0, # -FDEC_STRIDE-1]
     ldrb r3, [r0, #0*FDEC_STRIDE-1]
@@ -118,7 +118,7 @@ function x264_predict_4x4_ddr_armv6, export=1
     pop {r4-r6,pc}
 .endfunc
 
-function x264_predict_4x4_ddl_neon, export=1
+function x264_predict_4x4_ddl_neon
     sub r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0}, [r0], ip
@@ -137,7 +137,7 @@ function x264_predict_4x4_ddl_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8_dc_neon, export=1
+function x264_predict_8x8_dc_neon
     mov ip, #0
     ldrd r2, [r1, #8]
     push {r4-r5,lr}
@@ -162,7 +162,7 @@ function x264_predict_8x8_dc_neon, export=1
 .endfunc
 
 
-function x264_predict_8x8_h_neon, export=1
+function x264_predict_8x8_h_neon
     add r1, r1, #7
     mov ip, #FDEC_STRIDE
     vld1.64 {d16}, [r1]
@@ -185,7 +185,7 @@ function x264_predict_8x8_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8c_h_neon, export=1
+function x264_predict_8x8c_h_neon
     sub r1, r0, #1
     mov ip, #FDEC_STRIDE
 .rept 4
@@ -197,7 +197,7 @@ function x264_predict_8x8c_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_8x8c_v_neon, export=1
+function x264_predict_8x8c_v_neon
     sub r0, r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0}, [r0,:64], ip
@@ -208,7 +208,7 @@ function x264_predict_8x8c_v_neon, export=1
 .endfunc
 
 
-function x264_predict_16x16_dc_neon, export=1
+function x264_predict_16x16_dc_neon
     sub r3, r0, #FDEC_STRIDE
     sub r0, r0, #1
     vld1.64 {d0-d1}, [r3,:128]
@@ -245,7 +245,7 @@ function x264_predict_16x16_dc_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_16x16_h_neon, export=1
+function x264_predict_16x16_h_neon
     sub r1, r0, #1
     mov ip, #FDEC_STRIDE
 .rept 8
@@ -259,7 +259,7 @@ function x264_predict_16x16_h_neon, export=1
     bx lr
 .endfunc
 
-function x264_predict_16x16_v_neon, export=1
+function x264_predict_16x16_v_neon
     sub r0, r0, #FDEC_STRIDE
     mov ip, #FDEC_STRIDE
     vld1.64 {d0-d1}, [r0,:128], ip
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 4dd71829..0b49eb47 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -64,7 +64,7 @@ pmovmskb_byte:
 .endm
 
 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
-function x264_quant_2x2_dc_neon, export=1
+function x264_quant_2x2_dc_neon
     vld1.64 {d0}, [r0,:64]
     vabs.s16 d3, d0
     vdup.16 d2, r2
@@ -80,7 +80,7 @@ function x264_quant_2x2_dc_neon, export=1
 .endfunc
 
 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
-function x264_quant_4x4_dc_neon, export=1
+function x264_quant_4x4_dc_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -92,7 +92,7 @@ function x264_quant_4x4_dc_neon, export=1
 .endfunc
 
 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon, export=1
+function x264_quant_4x4_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -104,7 +104,7 @@ function x264_quant_4x4_neon, export=1
 .endfunc
 
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon, export=1
+function x264_quant_8x8_neon
     vld1.64 {d28-d31}, [r0,:128]
     vabs.s16 q8, q14
     vabs.s16 q9, q15
@@ -139,7 +139,7 @@ function x264_quant_8x8_neon, export=1
 
 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 .macro DEQUANT size bits
-function x264_dequant_\size\()_neon, export=1
+function x264_dequant_\size\()_neon
     DEQUANT_START \bits+2, \bits
 .ifc \size, 8x8
     mov r2, #4
@@ -220,7 +220,7 @@ DEQUANT 4x4, 4
 DEQUANT 8x8, 6
 
 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-function x264_dequant_4x4_dc_neon, export=1
+function x264_dequant_4x4_dc_neon
     DEQUANT_START 6, 6, yes
 
     blt dequant_4x4_dc_rshift
@@ -267,7 +267,7 @@ dequant_4x4_dc_rshift:
 
 
 // int coeff_last( int16_t *l )
-function x264_coeff_last4_arm, export=1
+function x264_coeff_last4_arm
     ldrd r2, [r0]
     subs r0, r3, #0
     movne r0, #2
@@ -278,7 +278,7 @@ function x264_coeff_last4_arm, export=1
 .endfunc
 
 .macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon, export=1
+function x264_coeff_last\size\()_neon
 .if \size == 15
     sub r0, r0, #2
     vld1.64 {d0-d3}, [r0]
@@ -306,7 +306,7 @@ function x264_coeff_last\size\()_neon, export=1
 COEFF_LAST_1x 15
 COEFF_LAST_1x 16
 
-function x264_coeff_last64_neon, export=1
+function x264_coeff_last64_neon
     vld1.64 {d16-d19}, [r0,:128]!
     vqmovn.u16 d16, q8
     vqmovn.u16 d17, q9