From: Vittorio Giovara Date: Tue, 17 Jan 2017 16:04:19 +0000 (+0100) Subject: arm: Set the function symbol prefix in a single location X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=498cca0b74ab90c363b761083c7fdcf56fc60904;p=libx264 arm: Set the function symbol prefix in a single location --- diff --git a/common/arm/asm.S b/common/arm/asm.S index 0472d115..5f96cdd2 100644 --- a/common/arm/asm.S +++ b/common/arm/asm.S @@ -34,9 +34,9 @@ #endif #ifdef PREFIX -# define EXTERN_ASM _ +# define EXTERN_ASM _x264_ #else -# define EXTERN_ASM +# define EXTERN_ASM x264_ #endif #ifdef __ELF__ @@ -75,7 +75,11 @@ ELF .eabi_attribute 25, \val .macro function name, export=1 .macro endfunc +.if \export +ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name +.else ELF .size \name, . - \name +.endif FUNC .endfunc .purgem endfunc .endm diff --git a/common/arm/bitstream-a.S b/common/arm/bitstream-a.S index 62be077d..39b5068a 100644 --- a/common/arm/bitstream-a.S +++ b/common/arm/bitstream-a.S @@ -25,7 +25,7 @@ #include "asm.S" -function x264_nal_escape_neon +function nal_escape_neon push {r4-r5,lr} vmov.u8 q0, #0xff vmov.u8 q8, #4 diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S index a61a379a..e5e20f26 100644 --- a/common/arm/cpu-a.S +++ b/common/arm/cpu-a.S @@ -29,7 +29,7 @@ // done in gas because .fpu neon overrides the refusal to assemble // instructions the selected -march/-mcpu doesn't support -function x264_cpu_neon_test +function cpu_neon_test vadd.i16 q0, q0, q0 bx lr endfunc @@ -37,7 +37,7 @@ endfunc // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled -function x264_cpu_enable_armv7_counter, export=0 +function cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 @@ -50,7 +50,7 @@ function x264_cpu_enable_armv7_counter, export=0 bx lr endfunc -function x264_cpu_disable_armv7_counter, export=0 +function cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC @@ -64,14 +64,14 @@ endfunc // return: 0 if transfers neon -> arm transfers take more than 10 cycles // nonzero otherwise -function x264_cpu_fast_neon_mrc_test +function cpu_fast_neon_mrc_test // check for user access to performance counters mrc p15, 0, r0, c9, c14, 0 cmp r0, #0 bxeq lr push {r4-r6,lr} - bl x264_cpu_enable_armv7_counter + bl cpu_enable_armv7_counter ands r1, r0, #8 mov r3, #0 mov ip, #4 @@ -99,7 +99,7 @@ average_loop: // disable counters if we enabled them ands r0, r0, #1 - bleq x264_cpu_disable_armv7_counter + bleq cpu_disable_armv7_counter lsr r0, r3, #5 cmp r0, #10 diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S index 48a34985..9aa2cda7 100644 --- a/common/arm/dct-a.S +++ b/common/arm/dct-a.S @@ -62,7 +62,7 @@ endconst .endm -function x264_dct4x4dc_neon +function dct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3 SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7 @@ -81,7 +81,7 @@ function x264_dct4x4dc_neon bx lr endfunc -function x264_idct4x4dc_neon +function idct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3 SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7 @@ -105,7 +105,7 @@ endfunc vsub.s16 \d3, \d7, \d5 .endm -function x264_sub4x4_dct_neon +function sub4x4_dct_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.32 {d0[]}, [r1,:32], r3 @@ -128,7 +128,7 @@ function x264_sub4x4_dct_neon bx lr endfunc -function x264_sub8x4_dct_neon, export=0 +function 
sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 @@ -164,34 +164,34 @@ function x264_sub8x4_dct_neon, export=0 bx lr endfunc -function x264_sub8x8_dct_neon +function sub8x8_dct_neon push {lr} mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE - bl x264_sub8x4_dct_neon + bl sub8x4_dct_neon pop {lr} - b x264_sub8x4_dct_neon + b sub8x4_dct_neon endfunc -function x264_sub16x16_dct_neon +function sub16x16_dct_neon push {lr} mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE - bl x264_sub8x4_dct_neon - bl x264_sub8x4_dct_neon + bl sub8x4_dct_neon + bl sub8x4_dct_neon sub r1, r1, #8*FENC_STRIDE-8 sub r2, r2, #8*FDEC_STRIDE-8 - bl x264_sub8x4_dct_neon - bl x264_sub8x4_dct_neon + bl sub8x4_dct_neon + bl sub8x4_dct_neon sub r1, r1, #8 sub r2, r2, #8 - bl x264_sub8x4_dct_neon - bl x264_sub8x4_dct_neon + bl sub8x4_dct_neon + bl sub8x4_dct_neon sub r1, r1, #8*FENC_STRIDE-8 sub r2, r2, #8*FDEC_STRIDE-8 - bl x264_sub8x4_dct_neon + bl sub8x4_dct_neon pop {lr} - b x264_sub8x4_dct_neon + b sub8x4_dct_neon endfunc @@ -226,7 +226,7 @@ endfunc SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1 .endm -function x264_sub8x8_dct8_neon +function sub8x8_dct8_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 @@ -278,19 +278,19 @@ function x264_sub8x8_dct8_neon bx lr endfunc -function x264_sub16x16_dct8_neon +function sub16x16_dct8_neon push {lr} - bl X(x264_sub8x8_dct8_neon) + bl X(sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - bl X(x264_sub8x8_dct8_neon) + bl X(sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 - bl X(x264_sub8x8_dct8_neon) + bl X(sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - b X(x264_sub8x8_dct8_neon) + b X(sub8x8_dct8_neon) endfunc @@ -303,7 +303,7 @@ endfunc vadd.s16 \d6, \d6, \d1 .endm -function x264_add4x4_idct_neon +function add4x4_idct_neon mov r2, #FDEC_STRIDE vld1.64 {d0-d3}, [r1,:128] @@ -335,7 +335,7 @@ function x264_add4x4_idct_neon bx lr endfunc -function x264_add8x4_idct_neon, export=0 +function add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! @@ -375,29 +375,29 @@ function x264_add8x4_idct_neon, export=0 bx lr endfunc -function x264_add8x8_idct_neon +function add8x8_idct_neon mov r2, #FDEC_STRIDE mov ip, lr - bl x264_add8x4_idct_neon + bl add8x4_idct_neon mov lr, ip - b x264_add8x4_idct_neon + b add8x4_idct_neon endfunc -function x264_add16x16_idct_neon +function add16x16_idct_neon mov r2, #FDEC_STRIDE mov ip, lr - bl x264_add8x4_idct_neon - bl x264_add8x4_idct_neon + bl add8x4_idct_neon + bl add8x4_idct_neon sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x4_idct_neon - bl x264_add8x4_idct_neon + bl add8x4_idct_neon + bl add8x4_idct_neon sub r0, r0, #8 - bl x264_add8x4_idct_neon - bl x264_add8x4_idct_neon + bl add8x4_idct_neon + bl add8x4_idct_neon sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x4_idct_neon + bl add8x4_idct_neon mov lr, ip - b x264_add8x4_idct_neon + b add8x4_idct_neon endfunc @@ -435,7 +435,7 @@ endfunc SUMSUB_AB q11, q12, q2, q12 .endm -function x264_add8x8_idct8_neon +function add8x8_idct8_neon mov r2, #FDEC_STRIDE vld1.64 {d16-d19}, [r1,:128]! vld1.64 {d20-d23}, [r1,:128]! 
@@ -497,20 +497,20 @@ function x264_add8x8_idct8_neon bx lr endfunc -function x264_add16x16_idct8_neon +function add16x16_idct8_neon mov ip, lr - bl X(x264_add8x8_idct8_neon) + bl X(add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 - bl X(x264_add8x8_idct8_neon) + bl X(add8x8_idct8_neon) sub r0, r0, #8 - bl X(x264_add8x8_idct8_neon) + bl X(add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip - b X(x264_add8x8_idct8_neon) + b X(add8x8_idct8_neon) endfunc -function x264_add8x8_idct_dc_neon +function add8x8_idct_dc_neon mov r2, #FDEC_STRIDE vld1.64 {d16}, [r1,:64] vrshr.s16 d16, d16, #6 @@ -593,7 +593,7 @@ endfunc vst1.64 {d22-d23}, [r2,:128], r3 .endm -function x264_add16x16_idct_dc_neon +function add16x16_idct_dc_neon mov r2, r0 mov r3, #FDEC_STRIDE vmov.i16 q15, #0 @@ -609,7 +609,7 @@ function x264_add16x16_idct_dc_neon bx lr endfunc -function x264_sub8x8_dct_dc_neon +function sub8x8_dct_dc_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 @@ -657,7 +657,7 @@ function x264_sub8x8_dct_dc_neon bx lr endfunc -function x264_sub8x16_dct_dc_neon +function sub8x16_dct_dc_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 @@ -751,7 +751,7 @@ function x264_sub8x16_dct_dc_neon endfunc -function x264_zigzag_scan_4x4_frame_neon +function zigzag_scan_4x4_frame_neon movrel r2, scan4x4_frame vld1.64 {d0-d3}, [r1,:128] vld1.64 {d16-d19}, [r2,:128] diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S index b0241d05..c997b3f3 100644 --- a/common/arm/deblock-a.S +++ b/common/arm/deblock-a.S @@ -117,7 +117,7 @@ vqmovun.s16 d1, q12 .endm -function x264_deblock_v_luma_neon +function deblock_v_luma_neon h264_loop_filter_start vld1.64 {d0, d1}, [r0,:128], r1 @@ -143,7 +143,7 @@ function x264_deblock_v_luma_neon bx lr endfunc -function x264_deblock_h_luma_neon +function deblock_h_luma_neon h264_loop_filter_start sub r0, r0, #4 @@ -324,7 +324,7 @@ endfunc .endm -function x264_deblock_v_luma_intra_neon +function deblock_v_luma_intra_neon push {lr} vld1.64 {d0, d1}, [r0,:128], r1 vld1.64 {d2, d3}, [r0,:128], r1 @@ -352,7 +352,7 @@ function x264_deblock_v_luma_intra_neon pop {pc} endfunc -function x264_deblock_h_luma_intra_neon +function deblock_h_luma_intra_neon push {lr} sub r0, r0, #4 vld1.64 {d22}, [r0], r1 @@ -447,7 +447,7 @@ endfunc vqmovun.s16 d1, q12 .endm -function x264_deblock_v_chroma_neon +function deblock_v_chroma_neon h264_loop_filter_start sub r0, r0, r1, lsl #1 @@ -465,7 +465,7 @@ function x264_deblock_v_chroma_neon bx lr endfunc -function x264_deblock_h_chroma_neon +function deblock_h_chroma_neon h264_loop_filter_start sub r0, r0, #4 @@ -499,7 +499,7 @@ deblock_h_chroma: bx lr endfunc -function x264_deblock_h_chroma_422_neon +function deblock_h_chroma_422_neon h264_loop_filter_start push {lr} sub r0, r0, #4 @@ -547,7 +547,7 @@ endfunc vqmovun.s16 d0, q11 .endm -function x264_deblock_h_chroma_mbaff_neon +function deblock_h_chroma_mbaff_neon h264_loop_filter_start sub r0, r0, #4 @@ -610,7 +610,7 @@ endfunc vbit q0, q2, q13 .endm -function x264_deblock_v_chroma_intra_neon +function deblock_v_chroma_intra_neon sub r0, r0, r1, lsl #1 vld2.8 {d18,d19}, [r0,:128], r1 vld2.8 {d16,d17}, [r0,:128], r1 @@ -626,7 +626,7 @@ function x264_deblock_v_chroma_intra_neon bx lr endfunc -function x264_deblock_h_chroma_intra_neon +function deblock_h_chroma_intra_neon sub r0, r0, #4 vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 @@ -657,15 +657,15 @@ function x264_deblock_h_chroma_intra_neon bx lr endfunc -function x264_deblock_h_chroma_422_intra_neon +function 
deblock_h_chroma_422_intra_neon push {lr} - bl X(x264_deblock_h_chroma_intra_neon) + bl X(deblock_h_chroma_intra_neon) add r0, r0, #2 pop {lr} - b X(x264_deblock_h_chroma_intra_neon) + b X(deblock_h_chroma_intra_neon) endfunc -function x264_deblock_h_chroma_intra_mbaff_neon +function deblock_h_chroma_intra_mbaff_neon sub r0, r0, #4 vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 @@ -688,7 +688,7 @@ function x264_deblock_h_chroma_intra_mbaff_neon bx lr endfunc -function x264_deblock_strength_neon +function deblock_strength_neon ldr ip, [sp] vmov.i8 q8, #0 lsl ip, ip, #8 diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index 2d6dc2ed..eb14ed6e 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -38,7 +38,7 @@ endconst // They also use nothing above armv5te, but we don't care about pre-armv6 // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) -function x264_prefetch_ref_arm +function prefetch_ref_arm sub r2, r2, #1 add r0, r0, #64 and r2, r2, r1 @@ -58,7 +58,7 @@ endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) -function x264_prefetch_fenc_arm +function prefetch_fenc_arm ldr ip, [sp] push {lr} and lr, ip, #3 @@ -83,8 +83,8 @@ function x264_prefetch_fenc_arm endfunc -// void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) -function x264_memcpy_aligned_neon +// void *memcpy_aligned( void *dst, const void *src, size_t n ) +function memcpy_aligned_neon orr r3, r0, r1, lsr #1 movrel ip, memcpy_table and r3, r3, #0xc @@ -150,8 +150,8 @@ endconst .ltorg -// void x264_memzero_aligned( void *dst, size_t n ) -function x264_memzero_aligned_neon +// void memzero_aligned( void *dst, size_t n ) +function memzero_aligned_neon vmov.i8 q0, #0 vmov.i8 q1, #0 memzero_loop: @@ -168,18 +168,18 @@ endfunc // uint8_t *src1, intptr_t src1_stride, // uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH w h -function x264_pixel_avg_\w\()x\h\()_neon +function pixel_avg_\w\()x\h\()_neon ldr ip, [sp, #8] push {r4-r6,lr} cmp ip, #32 ldrd r4, r5, [sp, #16] mov lr, #\h - beq x264_pixel_avg_w\w\()_neon + beq pixel_avg_w\w\()_neon rsbs r6, ip, #64 - blt x264_pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 + blt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 cmp ip, #0 - bge x264_pixel_avg_weight_w\w\()_add_add_neon - b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 + bge pixel_avg_weight_w\w\()_add_add_neon + b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 endfunc .endm @@ -244,7 +244,7 @@ AVGH 16, 16 .endm .macro AVG_WEIGHT ext -function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 +function pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -260,7 +260,7 @@ function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 pop {r4-r6,pc} endfunc -function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 +function pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 @@ -284,7 +284,7 @@ function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 pop {r4-r6,pc} endfunc -function x264_pixel_avg_weight_w16_\ext\()_neon, export=0 +function pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -309,7 +309,7 @@ AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function x264_pixel_avg_w4_neon, export=0 +function pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 @@ -319,11 +319,11 @@ function x264_pixel_avg_w4_neon, export=0 
vrhadd.u8 d1, d1, d3 vst1.32 {d0[0]}, [r0,:32], r1 vst1.32 {d1[0]}, [r0,:32], r1 - bgt x264_pixel_avg_w4_neon + bgt pixel_avg_w4_neon pop {r4-r6,pc} endfunc -function x264_pixel_avg_w8_neon, export=0 +function pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 @@ -341,11 +341,11 @@ function x264_pixel_avg_w8_neon, export=0 vrhadd.u8 d3, d3, d5 vst1.64 {d2}, [r0,:64], r1 vst1.64 {d3}, [r0,:64], r1 - bgt x264_pixel_avg_w8_neon + bgt pixel_avg_w8_neon pop {r4-r6,pc} endfunc -function x264_pixel_avg_w16_neon, export=0 +function pixel_avg_w16_neon, export=0 subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [r4], r5 @@ -363,12 +363,12 @@ function x264_pixel_avg_w16_neon, export=0 vrhadd.u8 q3, q3, q0 vst1.64 {d4-d5}, [r0,:128], r1 vst1.64 {d6-d7}, [r0,:128], r1 - bgt x264_pixel_avg_w16_neon + bgt pixel_avg_w16_neon pop {r4-r6,pc} endfunc -function x264_pixel_avg2_w4_neon +function pixel_avg2_w4_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] @@ -386,7 +386,7 @@ avg2_w4_loop: pop {pc} endfunc -function x264_pixel_avg2_w8_neon +function pixel_avg2_w8_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] @@ -404,7 +404,7 @@ avg2_w8_loop: pop {pc} endfunc -function x264_pixel_avg2_w16_neon +function pixel_avg2_w16_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] @@ -422,7 +422,7 @@ avg2_w16_loop: pop {pc} endfunc -function x264_pixel_avg2_w20_neon +function pixel_avg2_w20_neon ldr ip, [sp, #4] push {lr} sub r1, r1, #16 @@ -464,7 +464,7 @@ endfunc // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride, // const x264_weight_t *weight, int height ) -function x264_mc_weight_w20_neon +function mc_weight_w20_neon weight_prologue full sub r1, #16 weight20_loop: @@ -500,7 +500,7 @@ weight20_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w16_neon +function mc_weight_w16_neon weight_prologue full weight16_loop: subs ip, #2 @@ -528,7 +528,7 @@ weight16_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w8_neon +function mc_weight_w8_neon weight_prologue full weight8_loop: subs ip, #2 @@ -548,7 +548,7 @@ weight8_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w4_neon +function mc_weight_w4_neon weight_prologue full weight4_loop: subs ip, #2 @@ -564,7 +564,7 @@ weight4_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w20_nodenom_neon +function mc_weight_w20_nodenom_neon weight_prologue nodenom sub r1, #16 weight20_nodenom_loop: @@ -595,7 +595,7 @@ weight20_nodenom_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w16_nodenom_neon +function mc_weight_w16_nodenom_neon weight_prologue nodenom weight16_nodenom_loop: subs ip, #2 @@ -619,7 +619,7 @@ weight16_nodenom_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w8_nodenom_neon +function mc_weight_w8_nodenom_neon weight_prologue nodenom weight8_nodenom_loop: subs ip, #2 @@ -637,7 +637,7 @@ weight8_nodenom_loop: pop {r4-r5,pc} endfunc -function x264_mc_weight_w4_nodenom_neon +function mc_weight_w4_nodenom_neon weight_prologue nodenom weight4_nodenom_loop: subs ip, #2 @@ -661,7 +661,7 @@ endfunc .endm .macro weight_simple name op -function x264_mc_weight_w20_\name\()_neon +function mc_weight_w20_\name\()_neon weight_simple_prologue weight20_\name\()_loop: subs ip, #2 @@ -676,7 +676,7 @@ weight20_\name\()_loop: pop {pc} endfunc -function x264_mc_weight_w16_\name\()_neon +function mc_weight_w16_\name\()_neon weight_simple_prologue weight16_\name\()_loop: subs ip, #2 @@ -690,7 +690,7 @@ weight16_\name\()_loop: pop {pc} endfunc -function 
x264_mc_weight_w8_\name\()_neon +function mc_weight_w8_\name\()_neon weight_simple_prologue weight8_\name\()_loop: subs ip, #2 @@ -703,7 +703,7 @@ weight8_\name\()_loop: pop {pc} endfunc -function x264_mc_weight_w4_\name\()_neon +function mc_weight_w4_\name\()_neon weight_simple_prologue weight4_\name\()_loop: subs ip, #2 @@ -722,7 +722,7 @@ weight_simple offsetsub, vqsub.u8 // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) -function x264_mc_copy_w4_neon +function mc_copy_w4_neon ldr ip, [sp] copy_w4_loop: subs ip, ip, #4 @@ -738,7 +738,7 @@ copy_w4_loop: bx lr endfunc -function x264_mc_copy_w8_neon +function mc_copy_w8_neon ldr ip, [sp] copy_w8_loop: subs ip, ip, #4 @@ -754,7 +754,7 @@ copy_w8_loop: bx lr endfunc -function x264_mc_copy_w16_neon +function mc_copy_w16_neon ldr ip, [sp] copy_w16_loop: subs ip, ip, #4 @@ -770,7 +770,7 @@ copy_w16_loop: bx lr endfunc -function x264_mc_copy_w16_aligned_neon +function mc_copy_w16_aligned_neon ldr ip, [sp] copy_w16_aligned_loop: subs ip, ip, #4 @@ -787,11 +787,10 @@ copy_w16_aligned_loop: endfunc -// void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, -// uint8_t *src, intptr_t i_src_stride, -// int dx, int dy, int i_width, int i_height ); - -function x264_mc_chroma_neon +// void mc_chroma( uint8_t *dst, intptr_t i_dst_stride, +// uint8_t *src, intptr_t i_src_stride, +// int dx, int dy, int i_width, int i_height ); +function mc_chroma_neon push {r4-r8, lr} vpush {d8-d11} ldrd r4, r5, [sp, #56] @@ -1138,7 +1137,7 @@ endfunc // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width ) -function x264_hpel_filter_v_neon +function hpel_filter_v_neon ldr ip, [sp] sub r1, r1, r3, lsl #1 push {lr} @@ -1178,7 +1177,7 @@ filter_v_loop: endfunc // hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); -function x264_hpel_filter_c_neon +function hpel_filter_c_neon sub r1, #16 vld1.64 {d0-d3}, [r1,:128]! @@ -1263,7 +1262,7 @@ filter_c_loop: endfunc // hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); -function x264_hpel_filter_h_neon +function hpel_filter_h_neon sub r1, #16 vmov.u8 d30, #5 vld1.64 {d0-d3}, [r1,:128]! 
@@ -1353,7 +1352,7 @@ endfunc // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, // uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width, // int height ) -function x264_frame_init_lowres_core_neon +function frame_init_lowres_core_neon push {r4-r10,lr} vpush {d8-d15} ldrd r4, r5, [sp, #96] @@ -1441,7 +1440,7 @@ lowres_xloop_end: pop {r4-r10,pc} endfunc -function x264_load_deinterleave_chroma_fdec_neon +function load_deinterleave_chroma_fdec_neon mov ip, #FDEC_STRIDE/2 1: vld2.8 {d0-d1}, [r1,:128], r2 @@ -1454,7 +1453,7 @@ function x264_load_deinterleave_chroma_fdec_neon bx lr endfunc -function x264_load_deinterleave_chroma_fenc_neon +function load_deinterleave_chroma_fenc_neon mov ip, #FENC_STRIDE/2 1: vld2.8 {d0-d1}, [r1,:128], r2 @@ -1467,7 +1466,7 @@ function x264_load_deinterleave_chroma_fenc_neon bx lr endfunc -function x264_plane_copy_core_neon +function plane_copy_core_neon push {r4,lr} ldr r4, [sp, #8] ldr lr, [sp, #12] @@ -1498,7 +1497,7 @@ function x264_plane_copy_core_neon pop {r4,pc} endfunc -function x264_plane_copy_deinterleave_neon +function plane_copy_deinterleave_neon push {r4-r7, lr} ldrd r6, r7, [sp, #28] ldrd r4, r5, [sp, #20] @@ -1524,7 +1523,7 @@ block: pop {r4-r7, pc} endfunc -function x264_plane_copy_deinterleave_rgb_neon +function plane_copy_deinterleave_rgb_neon push {r4-r8, r10, r11, lr} ldrd r4, r5, [sp, #32] ldrd r6, r7, [sp, #40] @@ -1576,7 +1575,7 @@ block4: pop {r4-r8, r10, r11, pc} endfunc -function x264_plane_copy_interleave_core_neon +function plane_copy_interleave_core_neon push {r4-r7, lr} ldrd r6, r7, [sp, #28] ldrd r4, r5, [sp, #20] @@ -1603,7 +1602,7 @@ blocki: pop {r4-r7, pc} endfunc -function x264_plane_copy_swap_core_neon +function plane_copy_swap_core_neon push {r4-r5, lr} ldrd r4, r5, [sp, #12] add lr, r4, #15 @@ -1627,7 +1626,7 @@ function x264_plane_copy_swap_core_neon pop {r4-r5, pc} endfunc -function x264_store_interleave_chroma_neon +function store_interleave_chroma_neon push {lr} ldr lr, [sp, #4] mov ip, #FDEC_STRIDE @@ -1651,7 +1650,7 @@ endfunc vadd.u16 q0, q0, q2 .endm -function x264_integral_init4h_neon +function integral_init4h_neon sub r3, r0, r2, lsl #1 vld1.8 {d6, d7}, [r1, :128]! 1: @@ -1686,7 +1685,7 @@ endfunc vadd.u16 q0, q0, \s .endm -function x264_integral_init8h_neon +function integral_init8h_neon sub r3, r0, r2, lsl #1 vld1.8 {d16, d17}, [r1, :128]! 
1: @@ -1703,7 +1702,7 @@ function x264_integral_init8h_neon bx lr endfunc -function x264_integral_init4v_neon +function integral_init4v_neon push {r4-r5} mov r3, r0 add r4, r0, r2, lsl #3 @@ -1742,7 +1741,7 @@ function x264_integral_init4v_neon bx lr endfunc -function x264_integral_init8v_neon +function integral_init8v_neon add r2, r0, r1, lsl #4 sub r1, r1, #8 ands r3, r1, #16 - 1 @@ -1766,7 +1765,7 @@ function x264_integral_init8v_neon bx lr endfunc -function x264_mbtree_propagate_cost_neon +function mbtree_propagate_cost_neon push {r4-r5,lr} ldrd r4, r5, [sp, #12] ldr lr, [sp, #20] @@ -1816,7 +1815,7 @@ function x264_mbtree_propagate_cost_neon pop {r4-r5,pc} endfunc -function x264_mbtree_propagate_list_internal_neon +function mbtree_propagate_list_internal_neon vld1.16 {d4[]}, [sp] @ bipred_weight movrel r12, pw_0to15 vmov.u16 q10, #0xc000 @@ -1882,7 +1881,7 @@ function x264_mbtree_propagate_list_internal_neon endfunc @ void mbtree_fix8_pack( int16_t *dst, float *src, int count ) -function x264_mbtree_fix8_pack_neon, export=1 +function mbtree_fix8_pack_neon, export=1 subs r3, r2, #8 blt 2f 1: @@ -1910,7 +1909,7 @@ function x264_mbtree_fix8_pack_neon, export=1 endfunc @ void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) -function x264_mbtree_fix8_unpack_neon, export=1 +function mbtree_fix8_unpack_neon, export=1 subs r3, r2, #8 blt 2f 1: diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S index 155e1cfa..9943290b 100644 --- a/common/arm/pixel-a.S +++ b/common/arm/pixel-a.S @@ -46,7 +46,7 @@ endconst .text .macro SAD4_ARMV6 h -function x264_pixel_sad_4x\h\()_armv6 +function pixel_sad_4x\h\()_armv6 push {r4-r6,lr} ldr r4, [r2], r3 ldr r5, [r0], r1 @@ -115,7 +115,7 @@ SAD4_ARMV6 8 .endm .macro SAD_FUNC w, h, name, align:vararg -function x264_pixel_sad\name\()_\w\()x\h\()_neon +function pixel_sad\name\()_\w\()x\h\()_neon SAD_START_\w \align .if \w == 16 @@ -206,7 +206,7 @@ SAD_FUNC 16, 16, _aligned, ,:128 .endm .macro SAD_FUNC_DUAL w, h -function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual +function pixel_sad_aligned_\w\()x\h\()_neon_dual SAD_DUAL_START_\w .rept \h / 2 - \w / 8 SAD_DUAL_\w @@ -328,7 +328,7 @@ SAD_FUNC_DUAL 16, 16 .endm .macro SAD_X_FUNC x, w, h -function x264_pixel_sad_x\x\()_\w\()x\h\()_neon +function pixel_sad_x\x\()_\w\()x\h\()_neon push {r6-r7,lr} .if \x == 3 ldrd r6, r7, [sp, #12] @@ -390,7 +390,7 @@ SAD_X_FUNC 4, 8, 16 SAD_X_FUNC 4, 16, 8 SAD_X_FUNC 4, 16, 16 -function x264_pixel_vsad_neon +function pixel_vsad_neon subs r2, r2, #2 vld1.8 {q0}, [r0], r1 vld1.8 {q1}, [r0], r1 @@ -414,7 +414,7 @@ function x264_pixel_vsad_neon bx lr endfunc -function x264_pixel_asd8_neon +function pixel_asd8_neon ldr r12, [sp, #0] sub r12, r12, #2 vld1.8 {d0}, [r0], r1 @@ -523,7 +523,7 @@ endfunc .endm .macro SSD_FUNC w h -function x264_pixel_ssd_\w\()x\h\()_neon +function pixel_ssd_\w\()x\h\()_neon SSD_START_\w .rept \h-2 SSD_\w @@ -544,7 +544,7 @@ SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 -function x264_pixel_ssd_nv12_core_neon +function pixel_ssd_nv12_core_neon push {r4-r5} ldrd r4, r5, [sp, #8] add r12, r4, #8 @@ -624,7 +624,7 @@ endfunc \vpadal \qsqr_sum, \qsqr_last .endm -function x264_pixel_var_8x8_neon +function pixel_var_8x8_neon vld1.64 {d16}, [r0,:64], r1 vmull.u8 q1, d16, d16 vmovl.u8 q0, d16 @@ -645,10 +645,10 @@ function x264_pixel_var_8x8_neon VAR_SQR_SUM q1, q9, q14, d24 vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 - b x264_var_end + b var_end endfunc -function x264_pixel_var_8x16_neon +function pixel_var_8x16_neon vld1.64 {d16}, [r0,:64], 
r1 vld1.64 {d18}, [r0,:64], r1 vmull.u8 q1, d16, d16 @@ -677,10 +677,10 @@ function x264_pixel_var_8x16_neon b 1b 2: VAR_SQR_SUM q2, q13, q15, d22 - b x264_var_end + b var_end endfunc -function x264_pixel_var_16x16_neon +function pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 vmull.u8 q12, d16, d16 vmovl.u8 q0, d16 @@ -704,7 +704,7 @@ var16_loop: bgt var16_loop endfunc -function x264_var_end, export=0 +function var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 @@ -744,7 +744,7 @@ endfunc vmlal.s16 \acc, \d1, \d1 .endm -function x264_pixel_var2_8x8_neon +function pixel_var2_8x8_neon mov r3, #16 DIFF_SUM q0, q10, d0, d1, d20, d21 DIFF_SUM q8, q11, d16, d17, d22, d23 @@ -783,7 +783,7 @@ function x264_pixel_var2_8x8_neon bx lr endfunc -function x264_pixel_var2_8x16_neon +function pixel_var2_8x16_neon mov r3, #16 vld1.64 {d16}, [r0,:64]! vld1.64 {d17}, [r1,:64], r3 @@ -846,7 +846,7 @@ endfunc vsubl.u8 \q3, d6, d7 .endm -function x264_pixel_satd_4x4_neon +function pixel_satd_4x4_neon vld1.32 {d1[]}, [r2], r3 vld1.32 {d0[]}, [r0,:32], r1 vld1.32 {d3[]}, [r2], r3 @@ -868,7 +868,7 @@ function x264_pixel_satd_4x4_neon bx lr endfunc -function x264_pixel_satd_4x8_neon +function pixel_satd_4x8_neon vld1.32 {d1[]}, [r2], r3 vld1.32 {d0[]}, [r0,:32], r1 vld1.32 {d3[]}, [r2], r3 @@ -892,10 +892,10 @@ function x264_pixel_satd_4x8_neon vld1.32 {d6[1]}, [r0,:32], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 - b x264_satd_4x8_8x4_end_neon + b satd_4x8_8x4_end_neon endfunc -function x264_pixel_satd_8x4_neon +function pixel_satd_8x4_neon vld1.64 {d1}, [r2], r3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q0, d0, d1 @@ -912,7 +912,7 @@ function x264_pixel_satd_8x4_neon SUMSUB_AB q10, q11, q2, q3 endfunc -function x264_satd_4x8_8x4_end_neon, export=0 +function satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 @@ -939,10 +939,10 @@ function x264_satd_4x8_8x4_end_neon, export=0 bx lr endfunc -function x264_pixel_satd_8x8_neon +function pixel_satd_8x8_neon mov ip, lr - bl x264_satd_8x8_neon + bl satd_8x8_neon vadd.u16 q0, q12, q13 vadd.u16 q1, q14, q15 @@ -953,15 +953,15 @@ function x264_pixel_satd_8x8_neon bx lr endfunc -function x264_pixel_satd_8x16_neon +function pixel_satd_8x16_neon vpush {d8-d11} mov ip, lr - bl x264_satd_8x8_neon + bl satd_8x8_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 - bl x264_satd_8x8_neon + bl satd_8x8_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 @@ -975,7 +975,7 @@ function x264_pixel_satd_8x16_neon bx lr endfunc -function x264_satd_8x8_neon, export=0 +function satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -996,7 +996,7 @@ function x264_satd_8x8_neon, export=0 endfunc // one vertical hadamard pass and two horizontal -function x264_satd_8x4v_8x8h_neon, export=0 +function satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 @@ -1024,15 +1024,15 @@ function x264_satd_8x4v_8x8h_neon, export=0 bx lr endfunc -function x264_pixel_satd_16x8_neon +function pixel_satd_16x8_neon vpush {d8-d11} mov ip, lr - bl x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 - bl x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 @@ -1046,27 +1046,27 @@ function x264_pixel_satd_16x8_neon bx lr endfunc -function x264_pixel_satd_16x16_neon +function pixel_satd_16x16_neon vpush {d8-d11} mov ip, lr - bl 
x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 - bl x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 - bl x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 - bl x264_satd_16x4_neon + bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 @@ -1080,7 +1080,7 @@ function x264_pixel_satd_16x16_neon bx lr endfunc -function x264_satd_16x4_neon, export=0 +function satd_16x4_neon, export=0 vld1.64 {d2-d3}, [r2], r3 vld1.64 {d0-d1}, [r0,:128], r1 vsubl.u8 q8, d0, d2 @@ -1101,13 +1101,13 @@ function x264_satd_16x4_neon, export=0 vsubl.u8 q15, d5, d7 SUMSUB_AB q2, q3, q10, q11 SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3 - b x264_satd_8x4v_8x8h_neon + b satd_8x4v_8x8h_neon endfunc -function x264_pixel_sa8d_8x8_neon +function pixel_sa8d_8x8_neon mov ip, lr - bl x264_sa8d_8x8_neon + bl sa8d_8x8_neon vadd.u16 q0, q8, q9 HORIZ_ADD d0, d0, d1 mov lr, ip @@ -1117,23 +1117,23 @@ function x264_pixel_sa8d_8x8_neon bx lr endfunc -function x264_pixel_sa8d_16x16_neon +function pixel_sa8d_16x16_neon vpush {d8-d11} mov ip, lr - bl x264_sa8d_8x8_neon + bl sa8d_8x8_neon vpaddl.u16 q4, q8 vpaddl.u16 q5, q9 - bl x264_sa8d_8x8_neon + bl sa8d_8x8_neon vpadal.u16 q4, q8 vpadal.u16 q5, q9 sub r0, r0, r1, lsl #4 sub r2, r2, r3, lsl #4 add r0, r0, #8 add r2, r2, #8 - bl x264_sa8d_8x8_neon + bl sa8d_8x8_neon vpadal.u16 q4, q8 vpadal.u16 q5, q9 - bl x264_sa8d_8x8_neon + bl sa8d_8x8_neon vpaddl.u16 q8, q8 vpaddl.u16 q9, q9 vadd.u32 q0, q4, q8 @@ -1182,7 +1182,7 @@ endfunc .endm .macro sa8d_satd_8x8 satd= -function x264_sa8d_\satd\()8x8_neon, export=0 +function sa8d_\satd\()8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -1254,19 +1254,19 @@ endfunc sa8d_satd_8x8 sa8d_satd_8x8 satd_ -function x264_pixel_sa8d_satd_16x16_neon +function pixel_sa8d_satd_16x16_neon push {lr} vpush {q4-q7} vmov.u32 q4, #0 vmov.u32 q5, #0 - bl x264_sa8d_satd_8x8_neon - bl x264_sa8d_satd_8x8_neon + bl sa8d_satd_8x8_neon + bl sa8d_satd_8x8_neon sub r0, r0, r1, lsl #4 sub r2, r2, r3, lsl #4 add r0, r0, #8 add r2, r2, #8 - bl x264_sa8d_satd_8x8_neon - bl x264_sa8d_satd_8x8_neon + bl sa8d_satd_8x8_neon + bl sa8d_satd_8x8_neon vadd.u32 d1, d10, d11 vadd.u32 d0, d8, d9 vpadd.u32 d1, d1, d1 @@ -1280,7 +1280,7 @@ endfunc .macro HADAMARD_AC w h -function x264_pixel_hadamard_ac_\w\()x\h\()_neon +function pixel_hadamard_ac_\w\()x\h\()_neon vpush {d8-d15} movrel ip, mask_ac4 vmov.i8 q4, #0 @@ -1289,18 +1289,18 @@ function x264_pixel_hadamard_ac_\w\()x\h\()_neon vmov.i8 q5, #0 mov ip, lr - bl x264_hadamard_ac_8x8_neon + bl hadamard_ac_8x8_neon .if \h > 8 - bl x264_hadamard_ac_8x8_neon + bl hadamard_ac_8x8_neon .endif .if \w > 8 sub r0, r0, r1, lsl #3 add r0, r0, #8 - bl x264_hadamard_ac_8x8_neon + bl hadamard_ac_8x8_neon .endif .if \w * \h == 256 sub r0, r0, r1, lsl #4 - bl x264_hadamard_ac_8x8_neon + bl hadamard_ac_8x8_neon .endif vadd.s32 d8, d8, d9 @@ -1321,7 +1321,7 @@ HADAMARD_AC 16, 8 HADAMARD_AC 16, 16 // q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8 -function x264_hadamard_ac_8x8_neon, export=0 +function hadamard_ac_8x8_neon, export=0 vld1.64 {d2}, [r0,:64], r1 vld1.64 {d3}, [r0,:64], r1 vaddl.u8 q0, d2, d3 @@ -1435,7 +1435,7 @@ endfunc vmull.u8 \ssb, \db, \db .endm -function x264_pixel_ssim_4x4x2_core_neon +function pixel_ssim_4x4x2_core_neon ldr ip, [sp] vld1.64 {d0}, [r0], r1 vld1.64 {d2}, [r2], 
r3 @@ -1464,7 +1464,7 @@ function x264_pixel_ssim_4x4x2_core_neon endfunc // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2 -function x264_pixel_ssim_end4_neon +function pixel_ssim_end4_neon vld1.32 {d16-d19}, [r0,:128]! vld1.32 {d20-d23}, [r1,:128]! vadd.s32 q0, q8, q10 diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S index 06366ccf..59e3458b 100644 --- a/common/arm/predict-a.S +++ b/common/arm/predict-a.S @@ -77,7 +77,7 @@ endconst // because gcc doesn't believe in using the free shift in add -function x264_predict_4x4_h_armv6 +function predict_4x4_h_armv6 ldrb r1, [r0, #0*FDEC_STRIDE-1] ldrb r2, [r0, #1*FDEC_STRIDE-1] ldrb r3, [r0, #2*FDEC_STRIDE-1] @@ -97,7 +97,7 @@ function x264_predict_4x4_h_armv6 bx lr endfunc -function x264_predict_4x4_v_armv6 +function predict_4x4_v_armv6 ldr r1, [r0, #0 - 1 * FDEC_STRIDE] str r1, [r0, #0 + 0 * FDEC_STRIDE] str r1, [r0, #0 + 1 * FDEC_STRIDE] @@ -106,7 +106,7 @@ function x264_predict_4x4_v_armv6 bx lr endfunc -function x264_predict_4x4_dc_armv6 +function predict_4x4_dc_armv6 mov ip, #0 ldr r1, [r0, #-FDEC_STRIDE] ldrb r2, [r0, #0*FDEC_STRIDE-1] @@ -129,7 +129,7 @@ function x264_predict_4x4_dc_armv6 bx lr endfunc -function x264_predict_4x4_dc_top_neon +function predict_4x4_dc_top_neon mov r12, #FDEC_STRIDE sub r1, r0, #FDEC_STRIDE vld1.32 d1[], [r1,:32] @@ -158,7 +158,7 @@ endfunc uadd8 \a2, \a2, \c2 .endm -function x264_predict_4x4_ddr_armv6 +function predict_4x4_ddr_armv6 ldr r1, [r0, # -FDEC_STRIDE] ldrb r2, [r0, # -FDEC_STRIDE-1] ldrb r3, [r0, #0*FDEC_STRIDE-1] @@ -187,7 +187,7 @@ function x264_predict_4x4_ddr_armv6 pop {r4-r6,pc} endfunc -function x264_predict_4x4_ddl_neon +function predict_4x4_ddl_neon sub r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0}, [r0], ip @@ -206,7 +206,7 @@ function x264_predict_4x4_ddl_neon bx lr endfunc -function x264_predict_8x8_dc_neon +function predict_8x8_dc_neon mov ip, #0 ldrd r2, r3, [r1, #8] push {r4-r5,lr} @@ -230,7 +230,7 @@ function x264_predict_8x8_dc_neon pop {r4-r5,pc} endfunc -function x264_predict_8x8_h_neon +function predict_8x8_h_neon add r1, r1, #7 mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1] @@ -253,7 +253,7 @@ function x264_predict_8x8_h_neon bx lr endfunc -function x264_predict_8x8_v_neon +function predict_8x8_v_neon add r1, r1, #16 mov r12, #FDEC_STRIDE vld1.8 {d0}, [r1,:64] @@ -263,7 +263,7 @@ function x264_predict_8x8_v_neon bx lr endfunc -function x264_predict_8x8_ddl_neon +function predict_8x8_ddl_neon add r1, #16 vld1.8 {d0, d1}, [r1,:128] vmov.i8 q3, #0 @@ -291,7 +291,7 @@ function x264_predict_8x8_ddl_neon bx lr endfunc -function x264_predict_8x8_ddr_neon +function predict_8x8_ddr_neon vld1.8 {d0-d3}, [r1,:128] vext.8 q2, q0, q1, #7 vext.8 q3, q0, q1, #9 @@ -321,7 +321,7 @@ function x264_predict_8x8_ddr_neon bx lr endfunc -function x264_predict_8x8_vl_neon +function predict_8x8_vl_neon add r1, #16 mov r12, #FDEC_STRIDE @@ -352,7 +352,7 @@ function x264_predict_8x8_vl_neon bx lr endfunc -function x264_predict_8x8_vr_neon +function predict_8x8_vr_neon add r1, #8 mov r12, #FDEC_STRIDE vld1.8 {d4,d5}, [r1,:64] @@ -384,7 +384,7 @@ function x264_predict_8x8_vr_neon bx lr endfunc -function x264_predict_8x8_hd_neon +function predict_8x8_hd_neon mov r12, #FDEC_STRIDE add r1, #7 @@ -417,7 +417,7 @@ function x264_predict_8x8_hd_neon bx lr endfunc -function x264_predict_8x8_hu_neon +function predict_8x8_hu_neon mov r12, #FDEC_STRIDE add r1, #7 vld1.8 {d7}, [r1] @@ -450,7 +450,7 @@ function x264_predict_8x8_hu_neon bx lr endfunc -function x264_predict_8x8c_dc_top_neon 
+function predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] @@ -463,7 +463,7 @@ function x264_predict_8x8c_dc_top_neon b pred8x8_dc_end endfunc -function x264_predict_8x8c_dc_left_neon +function predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE sub r2, r0, #1 ldcol.8 d0, r2, r1 @@ -475,7 +475,7 @@ function x264_predict_8x8c_dc_left_neon b pred8x8_dc_end endfunc -function x264_predict_8x8c_dc_neon +function predict_8x8c_dc_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] @@ -501,7 +501,7 @@ pred8x8_dc_end: bx lr endfunc -function x264_predict_8x8c_h_neon +function predict_8x8c_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 4 @@ -513,7 +513,7 @@ function x264_predict_8x8c_h_neon bx lr endfunc -function x264_predict_8x8c_v_neon +function predict_8x8c_v_neon sub r0, r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0}, [r0,:64], ip @@ -523,7 +523,7 @@ function x264_predict_8x8c_v_neon bx lr endfunc -function x264_predict_8x8c_p_neon +function predict_8x8c_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #4 @@ -572,7 +572,7 @@ function x264_predict_8x8c_p_neon endfunc -function x264_predict_8x16c_dc_top_neon +function predict_8x16c_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] @@ -597,7 +597,7 @@ function x264_predict_8x16c_dc_top_neon bx lr endfunc -function x264_predict_8x16c_h_neon +function predict_8x16c_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 8 @@ -609,7 +609,7 @@ function x264_predict_8x16c_h_neon bx lr endfunc -function x264_predict_8x16c_p_neon +function predict_8x16c_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #4 @@ -667,7 +667,7 @@ function x264_predict_8x16c_p_neon endfunc -function x264_predict_16x16_dc_top_neon +function predict_16x16_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {q0}, [r2,:128] @@ -677,7 +677,7 @@ function x264_predict_16x16_dc_top_neon b pred16x16_dc_end endfunc -function x264_predict_16x16_dc_left_neon +function predict_16x16_dc_left_neon mov r1, #FDEC_STRIDE sub r2, r0, #1 ldcol.8 d0, r2, r1 @@ -688,7 +688,7 @@ function x264_predict_16x16_dc_left_neon b pred16x16_dc_end endfunc -function x264_predict_16x16_dc_neon +function predict_16x16_dc_neon sub r3, r0, #FDEC_STRIDE sub r0, r0, #1 vld1.64 {d0-d1}, [r3,:128] @@ -726,7 +726,7 @@ pred16x16_dc_end: bx lr endfunc -function x264_predict_16x16_h_neon +function predict_16x16_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 8 @@ -740,7 +740,7 @@ function x264_predict_16x16_h_neon bx lr endfunc -function x264_predict_16x16_v_neon +function predict_16x16_v_neon sub r0, r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0-d1}, [r0,:128], ip @@ -750,7 +750,7 @@ function x264_predict_16x16_v_neon bx lr endfunc -function x264_predict_16x16_p_neon +function predict_16x16_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #8 diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S index a7f6cd2d..ad715466 100644 --- a/common/arm/quant-a.S +++ b/common/arm/quant-a.S @@ -75,7 +75,7 @@ endconst .endm // quant_2x2_dc( int16_t dct[4], int mf, int bias ) -function x264_quant_2x2_dc_neon +function quant_2x2_dc_neon vld1.64 {d0}, [r0,:64] vabs.s16 d3, d0 vdup.16 d2, r2 @@ -91,7 +91,7 @@ function x264_quant_2x2_dc_neon endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) -function x264_quant_4x4_dc_neon +function quant_4x4_dc_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 @@ -103,7 +103,7 @@ function x264_quant_4x4_dc_neon endfunc // 
quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) -function x264_quant_4x4_neon +function quant_4x4_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 @@ -115,7 +115,7 @@ function x264_quant_4x4_neon endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) -function x264_quant_4x4x4_neon +function quant_4x4x4_neon vpush {d8-d15} vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 @@ -156,7 +156,7 @@ function x264_quant_4x4x4_neon endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) -function x264_quant_8x8_neon +function quant_8x8_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 @@ -191,7 +191,7 @@ endfunc // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ) .macro DEQUANT size bits -function x264_dequant_\size\()_neon +function dequant_\size\()_neon DEQUANT_START \bits+2, \bits .ifc \size, 8x8 mov r2, #4 @@ -272,7 +272,7 @@ DEQUANT 4x4, 4 DEQUANT 8x8, 6 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp ) -function x264_dequant_4x4_dc_neon +function dequant_4x4_dc_neon DEQUANT_START 6, 6, yes blt dequant_4x4_dc_rshift @@ -318,7 +318,7 @@ dequant_4x4_dc_rshift: endfunc .macro decimate_score_1x size -function x264_decimate_score\size\()_neon +function decimate_score\size\()_neon vld1.16 {q0, q1}, [r0, :128] movrel r3, mask_2bit vmov.s8 q3, #0x01 @@ -347,7 +347,7 @@ function x264_decimate_score\size\()_neon lsr r1, r1, #2 .endif rbit r1, r1 - movrelx r3, X(x264_decimate_table4), r2 + movrelx r3, X(decimate_table4), r2 1: clz r2, r1 lsl r1, r1, r2 @@ -363,7 +363,7 @@ endfunc decimate_score_1x 15 decimate_score_1x 16 -function x264_decimate_score64_neon +function decimate_score64_neon push {lr} vld1.16 {q8, q9}, [r0, :128]! vld1.16 {q10, q11}, [r0, :128]! @@ -416,7 +416,7 @@ function x264_decimate_score64_neon mvn r12, r12 mov r0, #0 mov lr, #32 - movrelx r3, X(x264_decimate_table8), r2 + movrelx r3, X(decimate_table8), r2 beq 2f 1: clz r2, r1 @@ -449,7 +449,7 @@ function x264_decimate_score64_neon endfunc // int coeff_last( int16_t *l ) -function x264_coeff_last4_arm +function coeff_last4_arm ldrd r2, r3, [r0] subs r0, r3, #0 movne r0, #2 @@ -459,7 +459,7 @@ function x264_coeff_last4_arm bx lr endfunc -function x264_coeff_last8_arm +function coeff_last8_arm ldrd r2, r3, [r0, #8] orrs ip, r2, r3 movne r0, #4 @@ -474,7 +474,7 @@ function x264_coeff_last8_arm endfunc .macro COEFF_LAST_1x size -function x264_coeff_last\size\()_neon +function coeff_last\size\()_neon .if \size == 15 sub r0, r0, #2 .endif @@ -500,7 +500,7 @@ endfunc COEFF_LAST_1x 15 COEFF_LAST_1x 16 -function x264_coeff_last64_neon +function coeff_last64_neon vld1.64 {d16-d19}, [r0,:128]! vqmovn.u16 d16, q8 vqmovn.u16 d17, q9 @@ -545,7 +545,7 @@ function x264_coeff_last64_neon bx lr endfunc -function x264_denoise_dct_neon +function denoise_dct_neon 1: subs r3, r3, #16 vld1.16 {q0, q1}, [r0] vld1.32 {q12, q13}, [r1]! diff --git a/tools/checkasm-arm.S b/tools/checkasm-arm.S index 4bd0ca04..c7be4671 100644 --- a/tools/checkasm-arm.S +++ b/tools/checkasm-arm.S @@ -52,7 +52,7 @@ endconst .macro clobbercheck variant .equ pushed, 4*10 -function x264_checkasm_call_\variant +function checkasm_call_\variant push {r4-r11, lr} .ifc \variant, neon vpush {q4-q7} @@ -128,7 +128,11 @@ function x264_checkasm_call_\variant mov r12, #0 str r12, [r2] movrel r0, error_message - blx X(puts) +#ifdef PREFIX + blx _puts +#else + blx puts +#endif 0: pop {r0, r1} .ifc \variant, neon
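With this change the symbol prefix is applied in exactly one place: the EXTERN_ASM define in common/arm/asm.S, which now expands to x264_ (or _x264_ where PREFIX is defined, i.e. on targets whose C symbols carry a leading underscore). The function/endfunc macros prepend it to every exported symbol, so the per-file sources declare bare names such as "function nal_escape_neon", internal helpers marked export=0 keep their unprefixed labels, and cross-file branches to exported symbols go through the X() wrapper, which yields the prefixed name. That is also why the checkasm call into the C library's puts can no longer go through X() and now selects _puts or puts with an explicit #ifdef PREFIX. The following is a simplified sketch of the pattern, assuming a minimal function macro; the real macro in asm.S additionally emits the .global/.hidden/.type bookkeeping and the .size directives visible in the first hunk:

    // Prefix chosen once; every exported symbol picks it up automatically.
    #ifdef PREFIX
    #   define EXTERN_ASM _x264_
    #else
    #   define EXTERN_ASM x264_
    #endif

    .macro function name, export=1
    .if \export
            .global EXTERN_ASM\name     // exported: becomes .global x264_<name>
    EXTERN_ASM\name:
    .else
    \name:                              // export=0: stays a file-local label
    .endif
    .endm

            .text
    function nal_escape_neon            // defines x264_nal_escape_neon (_x264_... with PREFIX)
            bx      lr

    function cpu_enable_armv7_counter, export=0   // bare local label, never exported
            bx      lr

Changing or dropping the prefix now only requires touching the two #define lines in asm.S; none of the individual function declarations in the other files needs to be edited.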