14. Fixed a build error when building with older MinGW releases (regression
caused by 1.5.1[7].)
+15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
+x86 and x86-64 platforms. This speeds up the compression of full-color
+progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
+when using modern Intel and AMD CPUs.
+
1.5.3
=====
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
+ /* Pointer to routine to prepare data for encode_mcu_AC_first() */
+ void (*AC_first_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits);
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
int (*AC_refine_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
/* Forward declarations */
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
+ if (jsimd_can_encode_mcu_AC_first_prepare())
+ entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+ else
+ entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
}
+/*
+ * Data preparation for encode_mcu_AC_first().
+ *
+ * COMPUTE_ABSVALUES_AC_FIRST(Sl) reads the first Sl coefficients of block in
+ * the order given by jpeg_natural_order_start.  For every coefficient that is
+ * still nonzero after the point transform (right shift of the magnitude by
+ * Al), it stores the shifted magnitude in values[k], stores the bits to emit
+ * (the magnitude, or its bitwise complement for a negative coefficient) in
+ * values[k + DCTSIZE2], and sets bit k of zerobits.  Entries belonging to
+ * zero coefficients are not written at all.
+ *
+ * The macro is not self-contained: it uses block, jpeg_natural_order_start,
+ * Al, values, zerobits, k, temp, and temp2 from the expanding scope.
+ */
+
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ if (temp == 0) \
+ continue; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value; so the code is \
+ * interwoven with finding the abs value (temp) and output bits (temp2). \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ /* Watch out for case that nonzero coef is zero after point transform */ \
+ if (temp == 0) \
+ continue; \
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+ temp2 ^= temp; \
+ values[k] = temp; \
+ values[k + DCTSIZE2] = temp2; \
+ zerobits |= ((size_t)1U) << k; \
+ } \
+}
+/*
+ * Plain C data preparation for the AC first scan.
+ *
+ * block                     coefficients of one DCT block (read only)
+ * jpeg_natural_order_start  index table giving the order in which block is
+ *                           read (read only)
+ * Sl                        number of coefficients to process
+ * Al                        point transform, applied as a right shift
+ * values                    output: values[k] receives the shifted magnitude
+ *                           and values[k + DCTSIZE2] the bits to emit, for
+ *                           nonzero coefficients only
+ * bits                      output: mask of nonzero positions.  bits[0] is
+ *                           always written; bits[1] is written as well when
+ *                           SIZEOF_SIZE_T is 4.
+ */
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *bits)
+{
+ register int k, temp, temp2;
+ size_t zerobits = 0U;
+ int Sl0 = Sl;
+ /* With a 4-byte size_t the first mask word covers at most 32 positions. */
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+ zerobits = 0U;
+ /* Second pass: positions 32 and up, numbered from bit 0 of bits[1]. */
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ values += 32;
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl);
+ }
+ bits[1] = zerobits;
+#endif
+}
+
/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
+#define ENCODE_COEFS_AC_FIRST(label) { /* Emit one run-length/size symbol \
+ * plus the value bits for each bit set in zerobits.  cvalue walks the \
+ * prepared array: cvalue[0] is the magnitude and cvalue[DCTSIZE2] holds the \
+ * bits to emit.  r is the zero run returned by count_zeroes(), which is \
+ * defined elsewhere (presumably it also shifts the counted zero bits out of \
+ * zerobits -- confirm).  label, when non-empty, is pasted right after the \
+ * run-length step so that a goto can enter the first iteration with r and \
+ * cvalue already set.  Uses cinfo, entropy, cvalue, zerobits, r, temp, \
+ * temp2, and nbits from the expanding scope. \
+ */ \
+ while (zerobits) { \
+ r = count_zeroes(&zerobits); \
+ cvalue += r; \
+label \
+ temp = cvalue[0]; \
+ temp2 = cvalue[DCTSIZE2]; \
+ \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ } \
+ \
+ /* Find the number of bits needed for the magnitude of the coefficient */ \
+ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
+ /* Check for out-of-range coefficient values */ \
+ if (nbits > MAX_COEF_BITS) \
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+ \
+ /* Emit that number of bits of the value, if positive, */ \
+ /* or the complement of its magnitude, if negative. */ \
+ emit_bits(entropy, (unsigned int)temp2, nbits); \
+ \
+ cvalue++; \
+ zerobits >>= 1; \
+ } \
+}
+
METHODDEF(boolean)
encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
- register int temp, temp2, temp3;
- register int nbits;
- register int r, k;
- int Se = cinfo->Se;
+ register int temp, temp2;
+ register int nbits, r;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
+ JCOEF values_unaligned[2 * DCTSIZE2 + 15];
+ JCOEF *values;
+ const JCOEF *cvalue;
+ size_t zerobits;
+ size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
-
- /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+#ifdef WITH_SIMD
+ cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cvalue = values = values_unaligned;
+#endif
- r = 0; /* r = run length of zeros */
+ /* Prepare data */
+ entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, values, bits);
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
- r++;
- continue;
- }
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value; so the code is
- * interwoven with finding the abs value (temp) and output bits (temp2).
- */
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- temp ^= temp3;
- temp -= temp3; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
- temp2 = temp ^ temp3;
- /* Watch out for case that nonzero coef is zero after point transform */
- if (temp == 0) {
- r++;
- continue;
- }
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+ zerobits |= bits[1];
+#endif
- /* Emit any pending EOBRUN */
- if (entropy->EOBRUN > 0)
- emit_eobrun(entropy);
- /* if run length > 15, must emit special run-length-16 codes (0xF0) */
- while (r > 15) {
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- }
+ /* Emit any pending EOBRUN */
+ if (zerobits && (entropy->EOBRUN > 0))
+ emit_eobrun(entropy);
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */
- /* Check for out-of-range coefficient values */
- if (nbits > MAX_COEF_BITS)
- ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[0];
+#endif
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- emit_bits(entropy, (unsigned int)temp2, nbits);
+ ENCODE_COEFS_AC_FIRST();
- r = 0; /* reset zero run length */
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ if (zerobits) {
+ int diff = ((values + DCTSIZE2 / 2) - cvalue);
+ r = count_zeroes(&zerobits);
+ r += diff;
+ cvalue += r;
+ goto first_iter_ac_first;
}
- if (r > 0) { /* If there are trailing zeroes, */
+ ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+ if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */
c_derived_tbl *dctbl,
c_derived_tbl *actbl);
+EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
return NULL;
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
dctbl, actbl);
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
last_dc_val, dctbl, actbl);
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
BITS 32
; --------------------------------------------------------------------------
-; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
mov INT [edi+SIZEOF_INT], edx
%endmacro
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
+
+%define ZERO xmm7
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define LEN ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ ; Prologue: build a 16-byte-aligned frame; eax keeps pointing at the
+ ; caller-side stack so the arguments stay reachable as [eax + n].
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 4 ; stack slot addressed as ZEROBITS after the 5 pushes below
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp ; LEN aliases ebp, so the aligned frame pointer is saved here
+
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20] ; AL = Al, used as the psrlw shift count
+ mov T0, INT [eax + 28]
+ mov ZEROBITS, T0 ; keep the zerobits output pointer on the stack
+ mov LEN, INT [eax + 16] ; last argument read: K/LENEND reuse eax from here on
+ pxor ZERO, ZERO
+ mov K, LEN
+ and K, -16
+ shr K, 4 ; K = number of complete 16-coefficient groups
+ jz .ELOOP16
+.BLOOP16:
+ ; LOAD16 is a macro defined elsewhere; it is expected to leave the
+ ; coefficients in X0/X1 and zeros in N0/N1.
+ LOAD16
+ pcmpgtw N0, X0 ; N0/N1 = all-ones words where the coefficient is negative
+ pcmpgtw N1, X1
+ paddw X0, N0 ; (x + mask) ^ mask = |x|
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL ; apply the point transform to the magnitudes
+ psrlw X1, AL
+ pxor N0, X0 ; N0/N1 = magnitude, complemented for negative inputs
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ ; If Sl is an exact multiple of 16, the loop above has consumed every
+ ; coefficient.  Falling through would reach .TRY7 with LENEND == 0, store
+ ; 8 more entries and advance VALUES again; since the padding count below
+ ; is derived from LEN alone, .ZEROLOOP would then run past entry DCTSIZE2
+ ; and "sub VALUES, DCTSIZE2*2" would no longer restore the base pointer.
+ test LEN, 15
+ jz .PADDING
+.ELOOP16:
+ mov LENEND, LEN
+ and LENEND, 7 ; LENEND = Sl mod 8
+
+ ; Tail of 1..15 coefficients: pick the loader from bit 3 and the low 3 bits.
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ ; K = ceil(Sl / 8) - DCTSIZE2/8: minus the number of 8-entry groups of the
+ ; magnitude array that have not been written yet.
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO ; zero-fill one group of 8 magnitudes
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2 ; rewind VALUES to the start of the array
+
+ ; REDUCE0 is a macro defined elsewhere; presumably it derives the nonzero
+ ; bitmask from the magnitudes at VALUES and stores it through ZEROBITS
+ ; (confirm against the macro definition).
+ REDUCE0
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
dctbl, actbl);
}
+/*
+ * Report whether the SSE2 data-preparation routine for the AC first scan may
+ * be used: 1 if so, 0 otherwise.  It requires DCTSIZE == 8, 2-byte JCOEF,
+ * 4-byte size_t, SSE2 in simd_support, and a bit-scan primitive known at
+ * build time (HAVE_BUILTIN_CTZL or HAVE_BITSCANFORWARD).
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+#if defined(HAVE_BUILTIN_CTZL) || defined(HAVE_BITSCANFORWARD)
+  const int have_bit_scan = 1;
+#else
+  const int have_bit_scan = 0;
+#endif
+
+  init_simd();
+
+  /* Compile-time layout requirements. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 4)
+    return 0;
+  /* Run-time CPU capability. */
+  if (!(simd_support & JSIMD_SSE2))
+    return 0;
+
+  return have_bit_scan;
+}
+/*
+ * Dispatch to the SSE2 implementation, passing every argument through
+ * unchanged.
+ * NOTE(review): no capability check is made here; presumably callers consult
+ * jsimd_can_encode_mcu_AC_first_prepare() before selecting this routine --
+ * confirm.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
c_derived_tbl *dctbl, c_derived_tbl *actbl);
/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);
return NULL;
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return NULL;
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return NULL;
}
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0: this stub never offers a SIMD AC-first prepare routine */
+}
+/*
+ * Placeholder carrying the jsimd_encode_mcu_AC_first_prepare() signature.  The
+ * body is empty: every argument is ignored and nothing is written to values
+ * or zerobits.
+ * NOTE(review): presumably never called, because the
+ * jsimd_can_encode_mcu_AC_first_prepare() defined just above returns 0 --
+ * confirm that no caller bypasses that check.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
BITS 64
; --------------------------------------------------------------------------
-; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
mov MMWORD [r15], rax
%endmacro
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
+
+%define ZERO xmm9
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ ; Prologue: build a 16-byte-aligned frame and reserve one XMMWORD slot.
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16] ; 16-byte slot used to preserve ZERO (xmm9)
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO ; save xmm9; it is restored before returning
+
+ movd AL, r13d ; AL = Al, used as the psrlw shift count
+ pxor ZERO, ZERO
+ mov K, LEN
+ mov LENEND, LEN ; r13d is reused as LENEND now that Al lives in AL
+ and K, -16
+ and LENEND, 7 ; LENEND = Sl mod 8
+ shr K, 4 ; K = number of complete 16-coefficient groups
+ jz .ELOOP16
+.BLOOP16:
+ ; LOAD16 is a macro defined elsewhere; it is expected to leave the
+ ; coefficients in X0/X1 and zeros in N0/N1.
+ LOAD16
+ pcmpgtw N0, X0 ; N0/N1 = all-ones words where the coefficient is negative
+ pcmpgtw N1, X1
+ paddw X0, N0 ; (x + mask) ^ mask = |x|
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL ; apply the point transform to the magnitudes
+ psrlw X1, AL
+ pxor N0, X0 ; N0/N1 = magnitude, complemented for negative inputs
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ ; If Sl is an exact multiple of 16, the loop above has consumed every
+ ; coefficient.  Falling through would reach .TRY7 with LENEND == 0, store
+ ; 8 more entries and advance VALUES again; since the padding count below
+ ; is derived from LEN alone, .ZEROLOOP would then run past entry DCTSIZE2
+ ; and "sub VALUES, DCTSIZE2*2" would no longer restore the base pointer.
+ test LEN, 15
+ jz .PADDING
+.ELOOP16:
+ ; Tail of 1..15 coefficients: pick the loader from bit 3 and the low 3 bits.
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ ; K = ceil(Sl / 8) - DCTSIZE2/8: minus the number of 8-entry groups of the
+ ; magnitude array that have not been written yet.
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO ; zero-fill one group of 8 magnitudes
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2 ; rewind VALUES to the start of the array
+
+ ; REDUCE0 is a macro defined elsewhere; presumably it derives the nonzero
+ ; bitmask from the magnitudes at VALUES and stores it through the zerobits
+ ; pointer in r15 (confirm against the macro definition).
+ REDUCE0
+
+ movdqa ZERO, XMMWORD [rbp - 16] ; restore the caller's xmm9
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
dctbl, actbl);
}
+/*
+ * Report whether the SSE2 data-preparation routine for the AC first scan may
+ * be used: 1 if so, 0 otherwise.  It requires DCTSIZE == 8, 2-byte JCOEF,
+ * 8-byte size_t, SSE2 in simd_support, and a bit-scan primitive known at
+ * build time (HAVE_BUILTIN_CTZL or HAVE_BITSCANFORWARD64).
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+#if defined(HAVE_BUILTIN_CTZL) || defined(HAVE_BITSCANFORWARD64)
+  const int have_bit_scan = 1;
+#else
+  const int have_bit_scan = 0;
+#endif
+
+  init_simd();
+
+  /* Compile-time layout requirements. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
+    return 0;
+  /* Run-time CPU capability. */
+  if (!(simd_support & JSIMD_SSE2))
+    return 0;
+
+  return have_bit_scan;
+}
+/*
+ * Dispatch to the SSE2 implementation, passing every argument through
+ * unchanged.
+ * NOTE(review): no capability check is made here; presumably callers consult
+ * jsimd_can_encode_mcu_AC_first_prepare() before selecting this routine --
+ * confirm.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{