// Patch fragment (unified diff: "-" = removed, "+" = added) of ARM NEON
// assembly, updated for modern GNU as (UAL syntax).
#include "asm.S"
.fpu neon
// Bare ".align" is ambiguous (units differ per target: bytes vs. power of
// two); make the 4-byte (2^2) code alignment explicit.
-.align
+.align 2
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
sub r2, r2, r1
cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles
addle r3, r3, r2
// UAL places the flag-setting "s" suffix BEFORE the condition code:
// "subsle", not the old divided-syntax "subles".
- subles ip, ip, #1
+ subsle ip, ip, #1
bgt average_loop
// disable counters if we enabled them
ldr ip, [sp, #8]
push {r4-r6,lr}
cmp ip, #32
// Current gas requires ldrd's second destination register to be written
// explicitly (Rt2 = Rt + 1); the implicit two-operand form is rejected.
- ldrd r4, [sp, #16]
+ ldrd r4, r5, [sp, #16]
mov lr, #\h
beq x264_pixel_avg_w\w\()_neon
rsbs r6, ip, #64
.ifc \type, full
ldr lr, [r4, #32] // denom
.endif
// Same explicit-Rt2 fix for the ldrd that loads scale/offset into r4/r5.
- ldrd r4, [r4, #32+4] // scale, offset
+ ldrd r4, r5, [r4, #32+4] // scale, offset
vdup.16 q0, r4
vdup.16 q1, r5
.ifc \type, full
// x264_mc_chroma_neon prologue (fragment): loads stacked arguments.
function x264_mc_chroma_neon
push {r4-r8, lr}
vpush {d8-d11}
// ldrd needs both destination registers spelled out under current gas;
// r5 and r7 were always the implicit second registers here.
- ldrd r4, [sp, #56]
- ldrd r6, [sp, #64]
+ ldrd r4, r5, [sp, #56]
+ ldrd r6, r7, [sp, #64]
asr lr, r6, #3
mul lr, r4, lr
// x264_frame_init_lowres_core_neon prologue (fragment).
function x264_frame_init_lowres_core_neon
push {r4-r10,lr}
vpush {d8-d15}
// Explicit second destination register for ldrd (required by current gas);
// the loaded values are unchanged (r5 and r7 were implicit before).
- ldrd r4, [sp, #96]
- ldrd r6, [sp, #104]
+ ldrd r4, r5, [sp, #96]
+ ldrd r6, r7, [sp, #104]
ldr lr, [sp, #112]
sub r10, r6, r7 // dst_stride - width
and r10, r10, #~15
// pixel_sad_x{3,4} prologue (fragment): the x3 variant takes one fewer
// stacked pointer argument, hence the differing stack offsets.
function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
push {r6-r7,lr}
.if \x == 3
// Explicit second register (r7) required by current gas for ldrd.
- ldrd r6, [sp, #12]
+ ldrd r6, r7, [sp, #12]
.else
- ldrd r6, [sp, #16]
+ ldrd r6, r7, [sp, #16]
ldr r12, [sp, #12]
.endif
mov lr, #FENC_STRIDE
// Reduction tail (fragment): horizontal add, then move the 64-bit result
// into a GPR pair.
vadd.s32 d1, d2, d3
vpadd.s32 d0, d0, d1
// The two-GPR form of vmov (whole d-register transfer) takes no data-type
// suffix; current gas rejects the spurious ".32".
- vmov.32 r0, r1, d0
+ vmov r0, r1, d0
vst1.32 {d0[1]}, [ip,:32]
mul r0, r0, r0
sub r0, r1, r0, lsr #6
// int coeff_last( int16_t *l )
// Fragment: loads four int16 coefficients as a 64-bit pair into r2/r3.
function x264_coeff_last4_arm
// Explicit Rt2 (r3) for ldrd, as required by current gas; r3 was the
// implicit second register before.
- ldrd r2, [r0]
+ ldrd r2, r3, [r0]
subs r0, r3, #0
movne r0, #2
movne r2, r3
// coeff_last tail (fragment): compute the index, clamping negatives to 0.
subs r1, ip, r1, lsr #2
addge r0, r1, #\size - 8
// UAL orders the flag-setting "s" suffix before the condition code:
// "subslt", not the old "sublts".
- sublts r0, r3, r0, lsr #2
+ subslt r0, r3, r0, lsr #2
movlt r0, #0
bx lr
.endfunc
// coeff_last tail (fragment), same pattern as above with a fixed #32 bias.
subs r1, ip, r1
addge r0, r1, #32
// Same UAL fix: "s" suffix precedes the "lt" condition.
- sublts r0, ip, r0
+ subslt r0, ip, r0
movlt r0, #0
bx lr
.endfunc