From: mayeut Date: Thu, 22 Mar 2018 16:36:43 +0000 (-0500) Subject: C/SSE2 optimization of encode_mcu_AC_first() X-Git-Tag: 1.5.90~12 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5b177b3cab5cfb661256c1e74df160158ec6c34e;p=libjpeg-turbo C/SSE2 optimization of encode_mcu_AC_first() This commit adds C and SSE2 optimizations for the encode_mcu_AC_first() function used in progressive Huffman encoding. The image used for testing can be retrieved from this page: https://blog.cloudflare.com/doubling-the-speed-of-jpegtran All timings done on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz` clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)` gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0` gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0` Here are the results in comparison to libjpeg-turbo@293263c using `time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg` C clang x86_64: +19% gcc-5 x86_64: +80% gcc-7 x86_64: +57% clang i386: +5% gcc-5 i386: +59% gcc-7 i386: +51% SSE2 clang x86_64: +79% gcc-5 x86_64: +158% gcc-7 x86_64: +122% clang i386: +71% gcc-5 i386: +134% gcc-7 i386: +135% Discussion in libjpeg-turbo/libjpeg-turbo#46 --- diff --git a/ChangeLog.md b/ChangeLog.md index ccf98a8..eb2a0e6 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -122,6 +122,11 @@ approximately 2-3.5x. 14. Fixed a build error when building with older MinGW releases (regression caused by 1.5.1[7].) +15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable +x86 and x86-64 platforms. This speeds up the compression of full-color +progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x) +when using modern Intel and AMD CPUs. + 1.5.3 ===== diff --git a/jcphuff.c b/jcphuff.c index 31f1db3..bcec852 100644 --- a/jcphuff.c +++ b/jcphuff.c @@ -73,6 +73,10 @@ typedef struct { struct jpeg_entropy_encoder pub; /* public fields */ + /* Pointer to routine to prepare data for encode_mcu_AC_first() */ + void (*AC_first_prepare) (const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits); /* Pointer to routine to prepare data for encode_mcu_AC_refine() */ int (*AC_refine_prepare) (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, @@ -144,6 +148,9 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr; /* Forward declarations */ METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data); +METHODDEF(void) encode_mcu_AC_first_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data); METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo, @@ -208,6 +215,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics) entropy->pub.encode_mcu = encode_mcu_DC_first; else entropy->pub.encode_mcu = encode_mcu_AC_first; + if (jsimd_can_encode_mcu_AC_first_prepare()) + entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare; + else + entropy->AC_first_prepare = encode_mcu_AC_first_prepare; } else { if (is_DC_band) entropy->pub.encode_mcu = encode_mcu_DC_refine; @@ -541,21 +552,116 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) } +/* + * Data preparation for encode_mcu_AC_first(). + */ + +#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \ + for (k = 0; k < Sl; k++) { \ + temp = block[jpeg_natural_order_start[k]]; \ + if (temp == 0) \ + continue; \ + /* We must apply the point transform by Al. For AC coefficients this \ + * is an integer division with rounding towards 0. To do this portably \ + * in C, we shift after obtaining the absolute value; so the code is \ + * interwoven with finding the abs value (temp) and output bits (temp2). \ + */ \ + temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp ^= temp2; \ + temp -= temp2; /* temp is abs value of input */ \ + temp >>= Al; /* apply the point transform */ \ + /* Watch out for case that nonzero coef is zero after point transform */ \ + if (temp == 0) \ + continue; \ + /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \ + temp2 ^= temp; \ + values[k] = temp; \ + values[k + DCTSIZE2] = temp2; \ + zerobits |= ((size_t)1U) << k; \ + } \ +} + +METHODDEF(void) +encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *bits) +{ + register int k, temp, temp2; + size_t zerobits = 0U; + int Sl0 = Sl; + +#if SIZEOF_SIZE_T == 4 + if (Sl0 > 32) + Sl0 = 32; +#endif + + COMPUTE_ABSVALUES_AC_FIRST(Sl0); + + bits[0] = zerobits; +#if SIZEOF_SIZE_T == 4 + zerobits = 0U; + + if (Sl > 32) { + Sl -= 32; + jpeg_natural_order_start += 32; + values += 32; + + COMPUTE_ABSVALUES_AC_FIRST(Sl); + } + bits[1] = zerobits; +#endif +} + /* * MCU encoding for AC initial scan (either spectral selection, * or first pass of successive approximation). */ +#define ENCODE_COEFS_AC_FIRST(label) { \ + while (zerobits) { \ + r = count_zeroes(&zerobits); \ + cvalue += r; \ +label \ + temp = cvalue[0]; \ + temp2 = cvalue[DCTSIZE2]; \ + \ + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ + while (r > 15) { \ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \ + r -= 16; \ + } \ + \ + /* Find the number of bits needed for the magnitude of the coefficient */ \ + nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \ + /* Check for out-of-range coefficient values */ \ + if (nbits > MAX_COEF_BITS) \ + ERREXIT(cinfo, JERR_BAD_DCT_COEF); \ + \ + /* Count/emit Huffman symbol for run length / number of bits */ \ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \ + \ + /* Emit that number of bits of the value, if positive, */ \ + /* or the complement of its magnitude, if negative. */ \ + emit_bits(entropy, (unsigned int)temp2, nbits); \ + \ + cvalue++; \ + zerobits >>= 1; \ + } \ +} + METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; - register int temp, temp2, temp3; - register int nbits; - register int r, k; - int Se = cinfo->Se; + register int temp, temp2; + register int nbits, r; + int Sl = cinfo->Se - cinfo->Ss + 1; int Al = cinfo->Al; - JBLOCKROW block; + JCOEF values_unaligned[2 * DCTSIZE2 + 15]; + JCOEF *values; + const JCOEF *cvalue; + size_t zerobits; + size_t bits[8 / SIZEOF_SIZE_T]; entropy->next_output_byte = cinfo->dest->next_output_byte; entropy->free_in_buffer = cinfo->dest->free_in_buffer; @@ -565,61 +671,48 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (entropy->restarts_to_go == 0) emit_restart(entropy, entropy->next_restart_num); - /* Encode the MCU data block */ - block = MCU_data[0]; - - /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ +#ifdef WITH_SIMD + cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16); +#else + /* Not using SIMD, so alignment is not needed */ + cvalue = values = values_unaligned; +#endif - r = 0; /* r = run length of zeros */ + /* Prepare data */ + entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss, + Sl, Al, values, bits); - for (k = cinfo->Ss; k <= Se; k++) { - if ((temp = (*block)[jpeg_natural_order[k]]) == 0) { - r++; - continue; - } - /* We must apply the point transform by Al. For AC coefficients this - * is an integer division with rounding towards 0. To do this portably - * in C, we shift after obtaining the absolute value; so the code is - * interwoven with finding the abs value (temp) and output bits (temp2). - */ - temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - temp ^= temp3; - temp -= temp3; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ - /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ - temp2 = temp ^ temp3; - /* Watch out for case that nonzero coef is zero after point transform */ - if (temp == 0) { - r++; - continue; - } + zerobits = bits[0]; +#if SIZEOF_SIZE_T == 4 + zerobits |= bits[1]; +#endif - /* Emit any pending EOBRUN */ - if (entropy->EOBRUN > 0) - emit_eobrun(entropy); - /* if run length > 15, must emit special run-length-16 codes (0xF0) */ - while (r > 15) { - emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); - r -= 16; - } + /* Emit any pending EOBRUN */ + if (zerobits && (entropy->EOBRUN > 0)) + emit_eobrun(entropy); - /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ - /* Check for out-of-range coefficient values */ - if (nbits > MAX_COEF_BITS) - ERREXIT(cinfo, JERR_BAD_DCT_COEF); +#if SIZEOF_SIZE_T == 4 + zerobits = bits[0]; +#endif - /* Count/emit Huffman symbol for run length / number of bits */ - emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); + /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ - /* Emit that number of bits of the value, if positive, */ - /* or the complement of its magnitude, if negative. */ - emit_bits(entropy, (unsigned int)temp2, nbits); + ENCODE_COEFS_AC_FIRST(); - r = 0; /* reset zero run length */ +#if SIZEOF_SIZE_T == 4 + zerobits = bits[1]; + if (zerobits) { + int diff = ((values + DCTSIZE2 / 2) - cvalue); + r = count_zeroes(&zerobits); + r += diff; + cvalue += r; + goto first_iter_ac_first; } - if (r > 0) { /* If there are trailing zeroes, */ + ENCODE_COEFS_AC_FIRST(first_iter_ac_first:); +#endif + + if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */ entropy->EOBRUN++; /* count an EOB */ if (entropy->EOBRUN == 0x7FFF) emit_eobrun(entropy); /* force it out to avoid overflow */ diff --git a/jsimd.h b/jsimd.h index c134e97..51e2b8c 100644 --- a/jsimd.h +++ b/jsimd.h @@ -104,6 +104,12 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer, c_derived_tbl *dctbl, c_derived_tbl *actbl); +EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void); + +EXTERN(void) jsimd_encode_mcu_AC_first_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); + EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void); EXTERN(int) jsimd_encode_mcu_AC_refine_prepare diff --git a/jsimd_none.c b/jsimd_none.c index 5b13d19..3cb6c80 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -390,6 +390,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return NULL; } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/arm/jsimd.c b/simd/arm/jsimd.c index e0f1a4c..0fb8197 100644 --- a/simd/arm/jsimd.c +++ b/simd/arm/jsimd.c @@ -692,6 +692,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, dctbl, actbl); } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/arm64/jsimd.c b/simd/arm64/jsimd.c index bd689ab..0e6c7b9 100644 --- a/simd/arm64/jsimd.c +++ b/simd/arm64/jsimd.c @@ -770,6 +770,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, last_dc_val, dctbl, actbl); } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm index 078ae4a..25c63c7 100644 --- a/simd/i386/jcphuff-sse2.asm +++ b/simd/i386/jcphuff-sse2.asm @@ -25,7 +25,8 @@ BITS 32 ; -------------------------------------------------------------------------- -; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2() +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2() %macro LOAD16 0 pxor N0, N0 @@ -245,6 +246,179 @@ mov INT [edi+SIZEOF_INT], edx %endmacro +; +; Prepare data for jsimd_encode_mcu_AC_first(). +; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *zerobits + +%define ZERO xmm7 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define LEN ebp + +%define ZEROBITS INT [esp + 5 * 4] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 4 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov ZEROBITS, T0 + mov LEN, INT [eax + 16] + pxor ZERO, ZERO + mov K, LEN + and K, -16 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 +.ELOOP16: + mov LENEND, LEN + and LENEND, 7 + + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN + ; ; Prepare data for jsimd_encode_mcu_AC_refine(). ; diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c index f2315fa..9434bb0 100644 --- a/simd/i386/jsimd.c +++ b/simd/i386/jsimd.c @@ -1199,6 +1199,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, dctbl, actbl); } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 4) + return 0; + if (!(simd_support & JSIMD_SSE2)) + return 0; +#if defined(HAVE_BUILTIN_CTZL) + return 1; +#elif defined(HAVE_BITSCANFORWARD) + return 1; +#else + return 0; +#endif +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/jsimd.h b/simd/jsimd.h index b3e82cf..a9fc812 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -1074,6 +1074,10 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl c_derived_tbl *dctbl, c_derived_tbl *actbl); /* Progressive Huffman encoding */ +EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2 + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); + EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, JCOEF *absvalues, size_t *bits); diff --git a/simd/loongson/jsimd.c b/simd/loongson/jsimd.c index ba1d14e..e8b1832 100644 --- a/simd/loongson/jsimd.c +++ b/simd/loongson/jsimd.c @@ -582,6 +582,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return NULL; } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c index d06cb97..af886f6 100644 --- a/simd/mips/jsimd.c +++ b/simd/mips/jsimd.c @@ -1087,6 +1087,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return NULL; } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/powerpc/jsimd.c b/simd/powerpc/jsimd.c index 023e96d..f921712 100644 --- a/simd/powerpc/jsimd.c +++ b/simd/powerpc/jsimd.c @@ -844,6 +844,19 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return NULL; } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) { diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm index 4a9f592..b17488a 100644 --- a/simd/x86_64/jcphuff-sse2.asm +++ b/simd/x86_64/jcphuff-sse2.asm @@ -26,7 +26,8 @@ BITS 64 ; -------------------------------------------------------------------------- -; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2() +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2() %macro LOAD16 0 pxor N0, N0 @@ -245,6 +246,168 @@ mov MMWORD [r15], rax %endmacro +; +; Prepare data for jsimd_encode_mcu_AC_first(). +; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; r10 = const JCOEF *block +; r11 = const int *jpeg_natural_order_start +; r12 = int Sl +; r13 = int Al +; r14 = JCOEF *values +; r15 = size_t *zerobits + +%define ZERO xmm9 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LUT r11 +%define T0 rcx +%define T0d ecx +%define T1 rdx +%define T1d edx +%define BLOCK r10 +%define VALUES r14 +%define LEN r12d +%define LENEND r13d + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [rbp - 16] + collect_args 6 + + movdqa XMMWORD [rbp - 16], ZERO + + movd AL, r13d + pxor ZERO, ZERO + mov K, LEN + mov LENEND, LEN + and K, -16 + and LENEND, 7 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 +.ELOOP16: + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + movdqa ZERO, XMMWORD [rbp - 16] + uncollect_args 6 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T0d +%undef T1 +%undef T1d +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + ; ; Prepare data for jsimd_encode_mcu_AC_refine(). ; diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c index 10af64d..6ec5f64 100644 --- a/simd/x86_64/jsimd.c +++ b/simd/x86_64/jsimd.c @@ -1022,6 +1022,37 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, dctbl, actbl); } +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 8) + return 0; + if (!(simd_support & JSIMD_SSE2)) + return 0; +#if defined(HAVE_BUILTIN_CTZL) + return 1; +#elif defined(HAVE_BITSCANFORWARD64) + return 1; +#else + return 0; +#endif +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + GLOBAL(int) jsimd_can_encode_mcu_AC_refine_prepare(void) {