.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ * JDIMENSION downsampled_width,
+ * JSAMPARRAY input_data,
+ * JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ * this code; resolving it could potentially improve performance by
+ * tens of percent on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+ /*
+ * Consumes 16 source bytes at INPTR and emits 32 bytes at OUTPTR; both
+ * pointers are post-incremented by the amount transferred.
+ * in: q1 = previous 16 source bytes (only its last byte, d3[7], is used),
+ * d28 = x3 multiplier, q15 = +1 bias
+ * out: q1 = the 16 source bytes loaded by this invocation
+ * clobbers: q0, q2, q3, q4, q8-q11
+ */
+ vld1.8 {q0}, [\INPTR]! /* q0 = next 16 source bytes */
+ vmovl.u8 q8, d0 /* q8 = low half of q0 widened to 16 bits */
+ vext.8 q2, q1, q0, #15 /* q2 = last byte of q1 followed by q0[0..14] */
+ vmovl.u8 q9, d1 /* q9 = high half of q0 widened to 16 bits */
+ vaddw.u8 q10, q15, d4 /* q10 = q15 + low half of q2 */
+ vaddw.u8 q11, q15, d5 /* q11 = q15 + high half of q2 */
+ vmlal.u8 q8, d4, d28 /* q8 += (low half of q2) * d28 */
+ vmlal.u8 q9, d5, d28 /* q9 += (high half of q2) * d28 */
+ vmlal.u8 q10, d0, d28 /* q10 += (low half of q0) * d28 */
+ vmlal.u8 q11, d1, d28 /* q11 += (high half of q0) * d28 */
+ vmov q1, q0 /* keep these source bytes as the "previous" block for the next invocation */
+ vrshrn.u16 d6, q8, #2 /* q3 = q8:q9 shifted right by 2 with rounding, narrowed to bytes */
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2 /* q4 = q10:q11 shifted right by 2 (truncating; q15 was already added) */
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store 32 bytes, interleaving bytes of d6:d7 with bytes of d8:d9 */
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
+ * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
+ * instruction is not needed. This unrolling also allows loads and stores to
+ * be reordered to compensate for multiplication latency and reduce stalls.
+ */
+.macro upsample32 OUTPTR, INPTR
+ /*
+ * Consumes 32 source bytes at INPTR and emits 64 bytes at OUTPTR; both
+ * pointers are post-incremented by the amount transferred.
+ * in: q1 = previous 16 source bytes (only its last byte, d3[7], is used),
+ * d28 = x3 multiplier, q15 = +1 bias
+ * out: q0 = first 16 bytes loaded, q1 = second 16 bytes loaded
+ * clobbers: q2, q3, q4, q8-q11
+ * The two 16-byte groups are interleaved below: the second load and the
+ * second group's arithmetic are issued before the first group is stored.
+ */
+ /* even 16 pixels group: new bytes in q0, previous bytes in q1 */
+ vld1.8 {q0}, [\INPTR]!
+ vmovl.u8 q8, d0 /* q8:q9 = q0 widened to 16 bits */
+ vext.8 q2, q1, q0, #15 /* q2 = last byte of q1 followed by q0[0..14] */
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4 /* q10:q11 = q15 + q2 */
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28 /* q8:q9 += q2 * d28 */
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28 /* q10:q11 += q0 * d28 */
+ vmlal.u8 q11, d1, d28
+ /* odd 16 pixels group: new bytes in q1, previous bytes in q0 */
+ vld1.8 {q1}, [\INPTR]!
+ vrshrn.u16 d6, q8, #2 /* narrow the even group: rounding shift for q8:q9 ... */
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2 /* ... truncating shift for q10:q11 */
+ vshrn.u16 d9, q11, #2
+ vmovl.u8 q8, d2 /* q8-q11 are free again; restart with q1 as the new bytes */
+ vext.8 q2, q0, q1, #15 /* q2 = last byte of q0 followed by q1[0..14] */
+ vmovl.u8 q9, d3
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d2, d28
+ vmlal.u8 q11, d3, d28
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the even group: 32 bytes, d6:d7 interleaved with d8:d9 */
+ vrshrn.u16 d6, q8, #2 /* narrow the odd group into the same q3/q4 registers */
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the odd group: another 32 bytes */
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+ /*
+ * OUTPTR, INPTR and WIDTH are core registers that this macro modifies;
+ * TMP1 is a scratch core register. The condition flags are overwritten.
+ * NEON registers q0-q6 and q8-q11 are written; d28 and q15 are only read.
+ * NOTE(review): with WIDTH == 0 on entry the byte accesses below would use
+ * offset -1 - presumably callers always pass WIDTH >= 1; confirm.
+ */
+ /* special case for the first and last pixels */
+ sub \WIDTH, \WIDTH, #1 /* WIDTH = index of the last source pixel */
+ add \OUTPTR, \OUTPTR, #1 /* skip output byte 0; it is written below via offset -1 */
+ ldrb \TMP1, [\INPTR, \WIDTH] /* read the last source pixel ... */
+ strb \TMP1, [\OUTPTR, \WIDTH, asl #1] /* ... and copy it to offset 2 * WIDTH from the advanced OUTPTR */
+ ldrb \TMP1, [\INPTR], #1 /* read the first source pixel; INPTR now points at the second one */
+ strb \TMP1, [\OUTPTR, #-1] /* copy it unchanged to output byte 0 */
+ vmov.8 d3[7], \TMP1 /* seed the top lane of q1 so the first vext sees it as the previous pixel */
+
+ subs \WIDTH, \WIDTH, #32 /* at least 32 pixels left? */
+ blt 5f
+0: /* process 32 pixels per iteration */
+ upsample32 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #32
+ bge 0b
+5:
+ adds \WIDTH, \WIDTH, #16 /* WIDTH is now (pixels left) - 16 */
+ blt 1f /* fewer than 16 left: skip the 16-pixel step */
+0: /* process 16 pixels if needed (no branch targets this label, so it runs at most once) */
+ upsample16 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #16
+1:
+ adds \WIDTH, \WIDTH, #16 /* WIDTH = number of pixels still to process (0-15) */
+ beq 9f /* nothing left */
+
+ /*
+ * load the remaining 1-15 pixels
+ * INPTR is moved past the end of the remaining bytes and walked
+ * backwards; each set bit of WIDTH (1, 2, 4, 8) loads that many bytes
+ * into the low lanes of q0 after shifting the bytes loaded so far upwards.
+ */
+ add \INPTR, \INPTR, \WIDTH
+ tst \WIDTH, #1
+ beq 2f
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vext.8 d0, d0, d0, #6 /* rotate d0 so that lane 0 moves to lane 2 */
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vrev64.32 d0, d0 /* swap the 32-bit halves: lanes 0-3 move to lanes 4-7 */
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[3]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[2]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #8
+ beq 2f
+ vmov d1, d0 /* bytes loaded so far become lanes 8-15 of q0 */
+ sub \INPTR, \INPTR, #8
+ vld1.8 {d0}, [\INPTR]
+2: /* upsample the remaining pixels */
+ /*
+ * Same arithmetic as upsample16, on a partially filled q0. Lanes of q0
+ * at index WIDTH and above were not loaded here; results are computed
+ * for them too, but the stores below write only 2 * WIDTH bytes.
+ */
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15 /* q2 = last byte of q1 followed by q0[0..14] */
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ vrshrn.u16 d10, q8, #2 /* results go to d10-d13 here, not d6-d9 as in upsample16 */
+ vrshrn.u16 d12, q9, #2
+ vshrn.u16 d11, q10, #2
+ vshrn.u16 d13, q11, #2
+ vzip.8 d10, d11 /* interleave in registers: q5 = output bytes for source lanes 0-7 */
+ vzip.8 d12, d13 /* q6 = output bytes for source lanes 8-15 */
+ /* store the remaining pixels */
+ /* each set bit of WIDTH (8, 4, 2, 1) stores twice that many bytes from the low end of q5 */
+ tst \WIDTH, #8
+ beq 2f
+ vst1.8 {d10, d11}, [\OUTPTR]!
+ vmov q5, q6 /* bring the next 16 output bytes down into q5 */
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vst1.8 {d10}, [\OUTPTR]!
+ vmov d10, d11 /* bring the next 8 output bytes down into d10 */
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]! /* four output bytes, stored one lane at a time */
+ vst1.8 {d10[1]}, [\OUTPTR]!
+ vst1.8 {d10[2]}, [\OUTPTR]!
+ vst1.8 {d10[3]}, [\OUTPTR]!
+ vext.8 d10, d10, d10, #4 /* rotate d10 so that lanes 4-7 move to lanes 0-3 */
+2:
+ tst \WIDTH, #1
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]! /* final two output bytes */
+ vst1.8 {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+ /* the four arguments, in prototype order */
+ NUM_ROWS .req r0 /* max_v_samp_factor; counted down, one per row */
+ ROW_WIDTH .req r1 /* downsampled_width */
+ SRC_ROWS .req r2 /* input_data; walks the array of source row pointers */
+ DST_ROWS_PTR .req r3 /* output_data_ptr */
+ DST_ROWS .req DST_ROWS_PTR /* same register, after it has been dereferenced */
+
+ /* per-row working registers handed to upsample_row */
+ SRC_ROW .req r5
+ DST_ROW .req r4
+ COLS_LEFT .req ip
+ SCRATCH .req lr
+
+ /* r6 is saved and restored but not otherwise referenced in this function */
+ push {r4, r5, r6, lr}
+ vpush {d8-d15}
+
+ /* nothing to do unless the row count is positive (ldr leaves the flags alone) */
+ cmp NUM_ROWS, #0
+ ldr DST_ROWS, [DST_ROWS_PTR]
+ ble 90f
+
+ /* constants read by the upsample macros: +1 bias and x3 multiplier */
+ vmov.u16 q15, #1
+ vmov.u8 d28, #3
+10: /* one pass per row */
+ mov COLS_LEFT, ROW_WIDTH
+ ldr DST_ROW, [DST_ROWS], #4
+ ldr SRC_ROW, [SRC_ROWS], #4
+ upsample_row DST_ROW, SRC_ROW, COLS_LEFT, SCRATCH
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 10b
+
+90: /* common exit: restore saved registers and return through pc */
+ vpop {d8-d15}
+ pop {r4, r5, r6, pc}
+
+ .unreq SCRATCH
+ .unreq COLS_LEFT
+ .unreq DST_ROW
+ .unreq SRC_ROW
+
+ .unreq DST_ROWS
+ .unreq DST_ROWS_PTR
+ .unreq SRC_ROWS
+ .unreq ROW_WIDTH
+ .unreq NUM_ROWS
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row