From: DRC
Date: Tue, 6 Sep 2011 18:55:45 +0000 (+0000)
Subject: Make ARM ISLOW iDCT faster on typical cases, and eliminate the possibility of 16...
X-Git-Tag: 1.1.90~25
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5129e3960fbb7cb0a5348bdd542c07ad6c57ab35;p=libjpeg-turbo

Make ARM ISLOW iDCT faster on typical cases, and eliminate the possibility
of 16-bit overflows when handling arbitrary coefficients.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@692 632fc199-4ca6-4c93-a231-07263d6284db
---

diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 9ef6efc..b1c7166 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -263,46 +263,74 @@ asm_function jsimd_idct_islow_neon
     vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
     vmlal.s16 q6, d5, XFIX_1_175875602
     vmull.s16 q7, d4, XFIX_1_175875602
+    /* Check for the zero coefficients in the right 4x8 half */
+    push {r4, r5}
     vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
     vsubl.s16 q3, ROW0L, ROW4L
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
     vmull.s16 q2, ROW2L, XFIX_0_541196100
     vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+    orr r0, r4, r5
     vmov q4, q6
     vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
     vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
     vshl.s32 q3, q3, #13
+    orr r0, r0, r4
     vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+    orr r0, r0, r5
     vadd.s32 q1, q3, q2
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
     vmov q5, q7
     vadd.s32 q1, q1, q6
+    orr r0, r0, r4
     vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+    orr r0, r0, r5
     vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
     vrshrn.s32 ROW1L, q1, #11
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
     vsub.s32 q1, q1, q6
     vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+    orr r0, r0, r4
     vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+    orr r0, r0, r5
     vsub.s32 q1, q1, q6
     vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
     vmlal.s16 q6, ROW6L, XFIX_0_541196100
     vsub.s32 q3, q3, q2
+    orr r0, r0, r4
     vrshrn.s32 ROW6L, q1, #11
+    orr r0, r0, r5
     vadd.s32 q1, q3, q5
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
     vsub.s32 q3, q3, q5
     vaddl.s16 q5, ROW0L, ROW4L
+    orr r0, r0, r4
     vrshrn.s32 ROW2L, q1, #11
+    orr r0, r0, r5
     vrshrn.s32 ROW5L, q3, #11
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
     vshl.s32 q5, q5, #13
     vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+    orr r0, r0, r4
     vadd.s32 q2, q5, q6
+    orrs r0, r0, r5
     vsub.s32 q1, q5, q6
     vadd.s32 q6, q2, q7
+    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
     vsub.s32 q2, q2, q7
     vadd.s32 q5, q1, q4
+    orr r0, r4, r5
     vsub.s32 q3, q1, q4
+    pop {r4, r5}
     vrshrn.s32 ROW7L, q2, #11
     vrshrn.s32 ROW3L, q5, #11
     vrshrn.s32 ROW0L, q6, #11
     vrshrn.s32 ROW4L, q3, #11
+
+    beq 3f /* Go to do some special handling for the sparse right 4x8 half */
+
     /* 1-D IDCT, pass 1, right 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
     vadd.s16 d10, ROW7R, ROW3R
@@ -359,102 +387,101 @@ asm_function jsimd_idct_islow_neon
     vrshrn.s32 ROW3R, q5, #11
     vrshrn.s32 ROW0R, q6, #11
     vrshrn.s32 ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16 ROW6R, ROW7R
+    vtrn.16 ROW2R, ROW3R
+    vtrn.16 ROW0R, ROW1R
+    vtrn.16 ROW4R, ROW5R
+    vtrn.32 ROW1R, ROW3R
+    vtrn.32 ROW4R, ROW6R
+    vtrn.32 ROW0R, ROW2R
+    vtrn.32 ROW5R, ROW7R
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
-    /* Transpose right 4x8 half */
-    vtrn.16 ROW6R, ROW7R
-    vtrn.16 ROW2R, ROW3R
-    vtrn.16 ROW0R, ROW1R
-    vtrn.16 ROW4R, ROW5R
-    vmov.s16 q7, #(CENTERJSAMPLE << 5)
-    vtrn.32 ROW1R, ROW3R
-    vtrn.32 ROW4R, ROW6R
-    vtrn.32 ROW0R, ROW2R
-    vtrn.32 ROW5R, ROW7R
-    /* 1-D IDCT, pass 2, left 4x8 half */
-    vswp ROW7L, ROW3R
-    vadd.s16 d10, ROW7L, ROW3L
-    vswp ROW5L, ROW1R
-    vadd.s16 d8, ROW5L, ROW1L
-    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16 q6, d8, XFIX_1_175875602
-    vswp ROW4L, ROW0R
-    vadd.s16 q8, q8, q7
-    vmull.s16 q7, d10, XFIX_1_175875602
-    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16 q3, ROW0L, ROW4L
-    vswp ROW6L, ROW2R
+    vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+    vmlal.s16 q6, ROW1L, XFIX_1_175875602
+    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+    vmlal.s16 q7, ROW3L, XFIX_1_175875602
+    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
     vmull.s16 q2, ROW2L, XFIX_0_541196100
-    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
     vmov q4, q6
-    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+    vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
     vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
     vshl.s32 q3, q3, #13
     vmlsl.s16 q4, ROW1L, XFIX_0_899976223
     vadd.s32 q1, q3, q2
     vmov q5, q7
     vadd.s32 q1, q1, q6
-    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+    vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
     vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
     vshrn.s32 ROW1L, q1, #16
     vsub.s32 q1, q1, q6
-    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
     vmlsl.s16 q5, ROW3L, XFIX_2_562915447
     vsub.s32 q1, q1, q6
     vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16 q6, ROW6L, XFIX_0_541196100
+    vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
     vsub.s32 q3, q3, q2
-    vshrn.s32 ROW6L, q1, #16
+    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
     vadd.s32 q1, q3, q5
     vsub.s32 q3, q3, q5
-    vaddl.s16 q5, ROW0L, ROW4L
+    vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
     vshrn.s32 ROW2L, q1, #16
-    vshrn.s32 ROW5L, q3, #16
+    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
     vshl.s32 q5, q5, #13
-    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
     vadd.s32 q2, q5, q6
     vsub.s32 q1, q5, q6
     vadd.s32 q6, q2, q7
     vsub.s32 q2, q2, q7
     vadd.s32 q5, q1, q4
     vsub.s32 q3, q1, q4
-    vshrn.s32 ROW7L, q2, #16
+    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
     vshrn.s32 ROW3L, q5, #16
     vshrn.s32 ROW0L, q6, #16
-    vshrn.s32 ROW4L, q3, #16
+    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2, right 4x8 half */
     vld1.s16 {d2}, [ip, :64] /* reload constants */
-    vadd.s16 d10, ROW7R, ROW3R
-    vadd.s16 d8, ROW5R, ROW1R
-    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16 q6, d8, XFIX_1_175875602
-    vmull.s16 q7, d10, XFIX_1_175875602
-    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16 q3, ROW0R, ROW4R
-    vmull.s16 q2, ROW2R, XFIX_0_541196100
+    vmull.s16 q6, ROW5R, XFIX_1_175875602
+    vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+    vmull.s16 q7, ROW7R, XFIX_1_175875602
+    vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+    vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+    vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
     vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
     vmov q4, q6
     vmlsl.s16 q6, ROW5R, XFIX_2_562915447
-    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
     vshl.s32 q3, q3, #13
-    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+    vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
     vadd.s32 q1, q3, q2
     vmov q5, q7
     vadd.s32 q1, q1, q6
     vmlsl.s16 q7, ROW7R, XFIX_0_899976223
-    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32 ROW1R, q1, #16
+    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
     vsub.s32 q1, q1, q6
     vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+    vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
     vsub.s32 q1, q1, q6
-    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
     vmlal.s16 q6, ROW6R, XFIX_0_541196100
     vsub.s32 q3, q3, q2
     vshrn.s32 ROW6R, q1, #16
     vadd.s32 q1, q3, q5
     vsub.s32 q3, q3, q5
-    vaddl.s16 q5, ROW0R, ROW4R
-    vshrn.s32 ROW2R, q1, #16
+    vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
     vshrn.s32 ROW5R, q3, #16
     vshl.s32 q5, q5, #13
     vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
@@ -465,50 +492,157 @@ asm_function jsimd_idct_islow_neon
     vadd.s32 q5, q1, q4
     vsub.s32 q3, q1, q4
     vshrn.s32 ROW7R, q2, #16
-    vshrn.s32 ROW3R, q5, #16
-    vshrn.s32 ROW0R, q6, #16
+    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
     vshrn.s32 ROW4R, q3, #16
-    /* Descale to 8-bit and range limit */
-    vqrshrun.s16 d16, q8, #2
-    vqrshrun.s16 d17, q9, #2
-    vqrshrun.s16 d18, q10, #2
-    vqrshrun.s16 d19, q11, #2
+
+2: /* Descale to 8-bit and range limit */
+    vqrshrn.s16 d16, q8, #2
+    vqrshrn.s16 d17, q9, #2
+    vqrshrn.s16 d18, q10, #2
+    vqrshrn.s16 d19, q11, #2
     vpop {d8-d15} /* restore NEON registers */
-    vqrshrun.s16 d20, q12, #2
-    vqrshrun.s16 d21, q13, #2
-    vqrshrun.s16 d22, q14, #2
-    vqrshrun.s16 d23, q15, #2
-    /* Transpose the final 8-bit samples */
-    vtrn.16 q8, q9
-    vtrn.16 q10, q11
-    vtrn.32 q8, q10
-    vtrn.32 q9, q11
-    vtrn.8 d16, d17
-    vtrn.8 d18, d19
-    /* Store results to the output buffer */
-    ldmia OUTPUT_BUF!, {TMP1, TMP2}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    vst1.8 {d16}, [TMP1]
-    vst1.8 {d17}, [TMP2]
-    ldmia OUTPUT_BUF!, {TMP1, TMP2}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    vst1.8 {d18}, [TMP1]
-    vtrn.8 d20, d21
-    vst1.8 {d19}, [TMP2]
-    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add TMP1, TMP1, OUTPUT_COL
-    add TMP2, TMP2, OUTPUT_COL
-    add TMP3, TMP3, OUTPUT_COL
-    add TMP4, TMP4, OUTPUT_COL
-    vst1.8 {d20}, [TMP1]
-    vtrn.8 d22, d23
-    vst1.8 {d21}, [TMP2]
-    vst1.8 {d22}, [TMP3]
-    vst1.8 {d23}, [TMP4]
+    vqrshrn.s16 d20, q12, #2
+    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+    vtrn.16 q8, q9
+    vqrshrn.s16 d21, q13, #2
+    vqrshrn.s16 d22, q14, #2
+    vmov.u8 q0, #(CENTERJSAMPLE)
+    vqrshrn.s16 d23, q15, #2
+    vtrn.8 d16, d17
+    vtrn.8 d18, d19
+    vadd.u8 q8, q8, q0
+    vadd.u8 q9, q9, q0
+    vtrn.16 q10, q11
+    /* Store results to the output buffer */
+    ldmia OUTPUT_BUF!, {TMP1, TMP2}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    vst1.8 {d16}, [TMP1]
+    vtrn.8 d20, d21
+    vst1.8 {d17}, [TMP2]
+    ldmia OUTPUT_BUF!, {TMP1, TMP2}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    vst1.8 {d18}, [TMP1]
+    vadd.u8 q10, q10, q0
+    vst1.8 {d19}, [TMP2]
+    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    add TMP3, TMP3, OUTPUT_COL
+    add TMP4, TMP4, OUTPUT_COL
+    vtrn.8 d22, d23
+    vst1.8 {d20}, [TMP1]
+    vadd.u8 q11, q11, q0
+    vst1.8 {d21}, [TMP2]
+    vst1.8 {d22}, [TMP3]
+    vst1.8 {d23}, [TMP4]
     bx lr

+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16 ROW6L, ROW7L
+    vtrn.16 ROW2L, ROW3L
+    vtrn.16 ROW0L, ROW1L
+    vtrn.16 ROW4L, ROW5L
+    vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+    vtrn.32 ROW1L, ROW3L
+    vtrn.32 ROW4L, ROW6L
+    vtrn.32 ROW0L, ROW2L
+    vtrn.32 ROW5L, ROW7L
+
+    cmp r0, #0
+    beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half */
+    vdup.s16 ROW1R, ROW0R[1]
+    vdup.s16 ROW2R, ROW0R[2]
+    vdup.s16 ROW3R, ROW0R[3]
+    vdup.s16 ROW4R, ROW0R[0]
+    vdup.s16 ROW5R, ROW0R[1]
+    vdup.s16 ROW6R, ROW0R[2]
+    vdup.s16 ROW7R, ROW0R[3]
+    vdup.s16 ROW0R, ROW0R[0]
+    b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16 {d2}, [ip, :64] /* reload constants */
+    vmull.s16 q6, ROW1L, XFIX_1_175875602
+    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW3L, XFIX_1_175875602
+    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16 q2, ROW2L, XFIX_0_541196100
+    vshll.s16 q3, ROW0L, #13
+    vmov q4, q6
+    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+    vadd.s32 q1, q3, q2
+    vmov q5, q7
+    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32 q1, q1, q6
+    vadd.s32 q6, q6, q6
+    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+    vshrn.s32 ROW1L, q1, #16
+    vsub.s32 q1, q1, q6
+    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32 q3, q3, q2
+    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+    vadd.s32 q1, q3, q5
+    vsub.s32 q3, q3, q5
+    vshll.s16 q5, ROW0L, #13
+    vshrn.s32 ROW2L, q1, #16
+    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+    vadd.s32 q2, q5, q6
+    vsub.s32 q1, q5, q6
+    vadd.s32 q6, q2, q7
+    vsub.s32 q2, q2, q7
+    vadd.s32 q5, q1, q4
+    vsub.s32 q3, q1, q4
+    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW3L, q5, #16
+    vshrn.s32 ROW0L, q6, #16
+    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16 {d2}, [ip, :64] /* reload constants */
+    vmull.s16 q6, ROW5L, XFIX_1_175875602
+    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16 q7, ROW7L, XFIX_1_175875602
+    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16 q2, ROW6L, XFIX_0_541196100
+    vshll.s16 q3, ROW4L, #13
+    vmov q4, q6
+    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+    vadd.s32 q1, q3, q2
+    vmov q5, q7
+    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32 q1, q1, q6
+    vadd.s32 q6, q6, q6
+    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+    vsub.s32 q1, q1, q6
+    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32 q3, q3, q2
+    vshrn.s32 ROW6R, q1, #16
+    vadd.s32 q1, q3, q5
+    vsub.s32 q3, q3, q5
+    vshll.s16 q5, ROW4L, #13
+    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+    vshrn.s32 ROW5R, q3, #16
+    vadd.s32 q2, q5, q6
+    vsub.s32 q1, q5, q6
+    vadd.s32 q6, q2, q7
+    vsub.s32 q2, q2, q7
+    vadd.s32 q5, q1, q4
+    vsub.s32 q3, q1, q4
+    vshrn.s32 ROW7R, q2, #16
+    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+    vshrn.s32 ROW4R, q3, #16
+    b 2b /* Go to epilogue */
+
     .unreq DCT_TABLE
     .unreq COEF_BLOCK
     .unreq OUTPUT_BUF
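For reference, the zero-coefficient test that the new ldrd/orr sequence performs during pass 1 can be written in C roughly as follows. This is a minimal sketch, not part of the patch: the function name, the plain coef[64] layout, and the JCOEF typedef are illustrative assumptions, whereas the real code reads the same halfwords in pairs relative to an already-advanced COEF_BLOCK pointer.

/* Sketch of the sparseness test on the right 4x8 half of the block.
 * A nonzero return value means the right half needs the full second pass;
 * otherwise *row0_bits distinguishes "all zero" from "only row 0 non-zero". */
#include <stdint.h>

typedef int16_t JCOEF;  /* illustrative stand-in for libjpeg's JCOEF */

static uint32_t right_half_nonzero(const JCOEF coef[64], uint32_t *row0_bits)
{
  uint32_t bits = 0;
  int row, col;

  /* Rows 1-7, columns 4-7: drives the "beq 3f" decision after pass 1. */
  for (row = 1; row < 8; row++)
    for (col = 4; col < 8; col++)
      bits |= (uint16_t)coef[row * 8 + col];

  /* Row 0, columns 4-7: drives the "beq 4f" decision at label 3. */
  *row0_bits = 0;
  for (col = 4; col < 8; col++)
    *row0_bits |= (uint16_t)coef[col];

  return bits;
}

In the assembly this information is gathered with ARM core loads and ORs interleaved with the pass-1 NEON arithmetic, so in the common case the check largely hides in the existing instruction stream.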