message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
include(CheckCSourceCompiles)
+include(CheckIncludeFiles)
+include(CheckTypeSize)
+
+check_type_size("size_t" SIZE_T)
+check_type_size("unsigned long" UNSIGNED_LONG)
+
+if(SIZE_T EQUAL UNSIGNED_LONG)
+ check_c_source_compiles("int main(int argc, char **argv) { unsigned long a = argc; return __builtin_ctzl(a); }"
+ HAVE_BUILTIN_CTZL)
+endif()
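+# If the two types are the same size, __builtin_ctzl() can count the trailing
+# zeroes of a size_t value without truncation (e.g. __builtin_ctzl((size_t)8)
+# returns 3), which is what count_zeroes() in jcphuff.c needs.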
if(UNIX)
# Check for headers
- include(CheckIncludeFiles)
check_include_files(locale.h HAVE_LOCALE_H)
check_include_files(stddef.h HAVE_STDDEF_H)
check_include_files(stdlib.h HAVE_STDLIB_H)
endif()
# Check for types
- include(CheckTypeSize)
check_type_size("unsigned char" UNSIGNED_CHAR)
check_type_size("unsigned short" UNSIGNED_SHORT)
- check_type_size("size_t" SIZE_T)
# Check for compiler features
check_c_source_compiles("int main(void) { typedef struct undefined_structure *undef_struct_ptr; }"
endif()
if(MSVC)
+ check_include_files("intrin.h" HAVE_INTRIN_H)
set(INLINE_OPTIONS "__inline;inline")
else()
set(INLINE_OPTIONS "__inline__;inline")
/* Version number of package */
#define VERSION "@VERSION@"
-#ifndef _WIN32
-
/* The size of `size_t', as computed by sizeof. */
#define SIZEOF_SIZE_T @SIZE_T@
+/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
+#cmakedefine HAVE_BUILTIN_CTZL
+
+/* Define to 1 if you have the <intrin.h> header file. */
+#cmakedefine HAVE_INTRIN_H
+
+#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+#if (SIZEOF_SIZE_T == 8)
+#define HAVE_BITSCANFORWARD64
+#elif (SIZEOF_SIZE_T == 4)
+#define HAVE_BITSCANFORWARD
+#endif
#endif
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2011, 2015, 2018, D. R. Commander.
+ * Copyright (C) 2016, 2018, Matthieu Darbois.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jchuff.h" /* Declarations shared with jchuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
#include <limits.h>
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
+
#ifdef C_PROGRESSIVE_SUPPORTED
/*
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
+ /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
+ int (*AC_refine_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits);
+
/* Mode flag: TRUE for optimization, FALSE for actual data output */
boolean gather_statistics;
#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
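+/* e.g. PAD(3, 16) == 16 and PAD(32, 16) == 32; used below to align the
+ * absvalues array on a 16-byte boundary for the SIMD implementation. */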
+
/* Forward declarations */
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
+METHODDEF(int) encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+/* Count the number of trailing zero bits in *x, and shift them out of *x */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+ int result = __builtin_ctzl(*x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD64)
+ unsigned long result; /* _BitScanForward64() requires an unsigned long */
+ _BitScanForward64(&result, *x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD)
+ unsigned long result; /* _BitScanForward() requires an unsigned long */
+ _BitScanForward(&result, *x);
+ *x >>= result;
+#else
+ int result = 0;
+ while ((*x & 1) == 0) {
+ ++result;
+ *x >>= 1;
+ }
+#endif
+ return (int)result;
+}
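+/* e.g. if *x == 0x28 (binary 101000), count_zeroes() returns 3 and leaves
+ * *x == 0x5 (binary 101). */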
+
+
/*
* Initialize for a Huffman-compressed scan using progressive JPEG.
*/
entropy->pub.encode_mcu = encode_mcu_DC_refine;
else {
entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ if (jsimd_can_encode_mcu_AC_refine_prepare())
+ entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+ else
+ entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
/* AC refinement needs a correction bit buffer */
if (entropy->bit_buffer == NULL)
entropy->bit_buffer = (char *)
}
+/*
+ * Data preparation for encode_mcu_AC_refine().
+ */
+
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+ /* It is convenient to make a pre-pass to determine the transformed \
+ * coefficients' absolute values and the EOB position. \
+ */ \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value. \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ if (temp != 0) { \
+ zerobits |= ((size_t)1U) << k; \
+ signbits |= ((size_t)(temp2 + 1)) << k; \
+ } \
+ absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+ if (temp == 1) \
+ EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
+ } \
+}
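+/* The branchless absolute value above relies on two's complement arithmetic:
+ * temp2 is 0 for temp >= 0 and all 1s (-1) for temp < 0, so
+ * (temp ^ temp2) - temp2 yields |temp|. temp2 + 1 is then 1 for non-negative
+ * coefficients and 0 for negative ones, which is the bit saved in signbits.
+ */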
+
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ register int k, temp, temp2;
+ int EOB = 0;
+ size_t zerobits = 0U, signbits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+ bits[1] = signbits;
+#else
+ bits[2] = signbits;
+
+ zerobits = 0U;
+ signbits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ absvalues += 32;
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+ }
+
+ bits[1] = zerobits;
+ bits[3] = signbits;
+#endif
+
+ return EOB;
+}
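+/* bits[] layout: with a 64-bit size_t, bits[0] holds the zerobits flags and
+ * bits[1] the signbits flags for all 64 coefficient positions. With a 32-bit
+ * size_t, positions 0-31 map to bits[0] (zerobits) and bits[2] (signbits),
+ * and positions 32-63 map to bits[1] and bits[3]. */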
+
+
/*
* MCU encoding for AC successive approximation refinement scan.
*/
+#define ENCODE_COEFS_AC_REFINE(label) { \
+ while (zerobits) { \
+ int idx = count_zeroes(&zerobits); \
+ r += idx; \
+ cabsvalue += idx; \
+ signbits >>= idx; \
+label \
+ /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+ while (r > 15 && (cabsvalue <= EOBPTR)) { \
+ /* emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ /* Emit ZRL */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ /* Emit buffered correction bits that must be associated with ZRL */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ } \
+ \
+ temp = *cabsvalue++; \
+ \
+ /* If the coef was previously nonzero, it only needs a correction bit. \
+ * NOTE: a straight translation of the spec's figure G.7 would suggest \
+ * that we also need to test r > 15. But if r > 15, we can only get here \
+ * if k > EOB, which implies that this coefficient is not 1. \
+ */ \
+ if (temp > 1) { \
+ /* The correction bit is the next bit of the absolute value. */ \
+ BR_buffer[BR++] = (char)(temp & 1); \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ continue; \
+ } \
+ \
+ /* Emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+ \
+ /* Emit output bit for newly-nonzero coef */ \
+ temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+ emit_bits(entropy, (unsigned int)temp, 1); \
+ \
+ /* Emit buffered correction bits that must be associated with this code */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ r = 0; /* reset zero run length */ \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ } \
+}
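+/* The label parameter lets the 32-bit code path below jump into the middle
+ * of a second expansion of this loop (see first_iter_ac_refine) after r,
+ * cabsvalue, and signbits have been primed for coefficients 32-63. */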
+
METHODDEF(boolean)
encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
- register int temp, temp3;
- register int r, k;
- int EOB;
+ register int temp, r;
char *BR_buffer;
unsigned int BR;
- int Se = cinfo->Se;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
- int absvalues[DCTSIZE2];
+ JCOEF absvalues_unaligned[DCTSIZE2 + 15];
+ JCOEF *absvalues;
+ const JCOEF *cabsvalue, *EOBPTR;
+ size_t zerobits, signbits;
+ size_t bits[16 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
+#ifdef WITH_SIMD
+ cabsvalue = absvalues = (JCOEF *)PAD((size_t)absvalues_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cabsvalue = absvalues = absvalues_unaligned;
+#endif
- /* It is convenient to make a pre-pass to determine the transformed
- * coefficients' absolute values and the EOB position.
- */
- EOB = 0;
- for (k = cinfo->Ss; k <= Se; k++) {
- temp = (*block)[jpeg_natural_order[k]];
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value.
- */
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- temp ^= temp3;
- temp -= temp3; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- absvalues[k] = temp; /* save abs value for main pass */
- if (temp == 1)
- EOB = k; /* EOB = index of last newly-nonzero coef */
- }
+ /* Prepare data */
+ EOBPTR = absvalues +
+ entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, absvalues, bits);
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
BR = 0; /* BR = count of buffered bits added now */
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = absvalues[k]) == 0) {
- r++;
- continue;
- }
-
- /* Emit any required ZRLs, but not if they can be folded into EOB */
- while (r > 15 && k <= EOB) {
- /* emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
- /* Emit ZRL */
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- /* Emit buffered correction bits that must be associated with ZRL */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- }
-
- /* If the coef was previously nonzero, it only needs a correction bit.
- * NOTE: a straight translation of the spec's figure G.7 would suggest
- * that we also need to test r > 15. But if r > 15, we can only get here
- * if k > EOB, which implies that this coefficient is not 1.
- */
- if (temp > 1) {
- /* The correction bit is the next bit of the absolute value. */
- BR_buffer[BR++] = (char)(temp & 1);
- continue;
- }
-
- /* Emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
-
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+ signbits = bits[1];
+#else
+ signbits = bits[2];
+#endif
+ ENCODE_COEFS_AC_REFINE();
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ signbits = bits[3];
+
+ if (zerobits) {
+ int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
+ int idx = count_zeroes(&zerobits);
+ signbits >>= idx;
+ idx += diff;
+ r += idx;
+ cabsvalue += idx;
+ goto first_iter_ac_refine;
+ }
- /* Emit output bit for newly-nonzero coef */
- temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
- emit_bits(entropy, (unsigned int)temp, 1);
+ ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
- /* Emit buffered correction bits that must be associated with this code */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- r = 0; /* reset zero run length */
- }
+ r |= (int)((absvalues + Sl) - cabsvalue);
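+ /* r now also counts the zero coefficients that follow the last nonzero
+ * coefficient, without walking them individually. */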
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
if(CPU_TYPE STREQUAL "x86_64")
set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
- x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm x86_64/jdmerge-sse2.asm
- x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm x86_64/jfdctint-sse2.asm
- x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm x86_64/jidctint-sse2.asm
- x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm x86_64/jquanti-sse2.asm
+ x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm
+ x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm
+ x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm
+ x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm
+ x86_64/jquanti-sse2.asm
x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
- i386/jcsample-sse2.asm i386/jdcolor-sse2.asm i386/jdmerge-sse2.asm
- i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm i386/jfdctint-sse2.asm
- i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm i386/jidctint-sse2.asm
- i386/jidctred-sse2.asm i386/jquantf-sse2.asm i386/jquanti-sse2.asm
+ i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm
+ i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm
+ i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm
+ i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm
+ i386/jquanti-sse2.asm
i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
dctbl, actbl);
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
last_dc_val, dctbl, actbl);
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
--- /dev/null
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
+
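+; LOAD16/LOAD15/LOAD8/LOAD7 gather the next 16/15/8/7 coefficients from
+; BLOCK into X0 (and X1), using the jpeg_natural_order indices in LUT.  The
+; partial variants (LOAD15/LOAD7) check LENEND to bound the tail loads and
+; leave unread lanes zeroed.
+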
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ mov T1, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
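+; REDUCE0 rebuilds the zerobits bitmap from the 64 absolute values just
+; stored: each word is compared against zero, the comparison results are
+; packed and converted to bitmasks with pmovmskb, and the inverted masks
+; (1 = nonzero coefficient) are stored to bits[0] and bits[1] as two 32-bit
+; words.
+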
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl ecx, 16
+ shl esi, 16
+
+ or eax, ecx
+ or edx, esi
+
+ not eax
+ not edx
+
+ mov edi, ZEROBITS
+
+ mov INT [edi], eax
+ mov INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
+
+%define ZERO xmm7
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T0w cx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define KK ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+%define EOB INT [esp + 5 * 4 + 4]
+%define LEN INT [esp + 5 * 4 + 8]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 16
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov K, INT [eax + 16]
+ mov INT [T0 + 2 * SIZEOF_INT], -1
+ mov INT [T0 + 3 * SIZEOF_INT], -1
+ mov ZEROBITS, T0
+ mov LEN, K
+ pxor ZERO, ZERO
+ and K, -16
+ mov EOB, 0
+ xor KK, KK
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 2
+ dec K
+ jnz .BLOOPR16
+.ELOOPR16:
+ mov LENEND, LEN
+
+ test LENEND, 8
+ jz .TRYR7
+ test LENEND, 7
+ jz .TRYR8
+
+ and LENEND, 7
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ and LENEND, 7
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ mov eax, EOB
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef ZEROBITS
+%undef T0w
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
+#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
dctbl, actbl);
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 4)
+ return 0;
+ if (!(simd_support & JSIMD_SSE2))
+ return 0;
+#if defined(HAVE_BUILTIN_CTZL)
+ return 1;
+#elif defined(HAVE_BITSCANFORWARD)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
* Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
*
* Based on the x86 SIMD extension for IJG JPEG library,
EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+/* Progressive Huffman encoding */
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
*
* Based on the x86 SIMD extension for IJG JPEG library,
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
--- /dev/null
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
+
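+; LOAD16/LOAD15/LOAD8/LOAD7 gather the next 16/15/8/7 coefficients from
+; BLOCK into X0 (and X1), using the jpeg_natural_order indices in LUT.  The
+; partial variants (LOAD15/LOAD7) check LENEND to bound the tail loads and
+; leave unread lanes zeroed.
+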
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ mov T1d, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
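+; REDUCE0 rebuilds the zerobits bitmap from the 64 absolute values just
+; stored: each word is compared against zero, the comparison results are
+; packed and converted to bitmasks with pmovmskb, and the inverted 64-bit
+; mask (1 = nonzero coefficient) is stored to bits[0] through r15.
+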
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+ movdqa xmm7, XMMWORD [VALUES + (56*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, ZERO
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl rcx, 16
+ shl rdx, 32
+ shl rsi, 48
+
+ or rax, rcx
+ or rdx, rsi
+ or rax, rdx
+
+ not rax
+
+ mov MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
+
+%define ZERO xmm9
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define KK r9d
+%define EOB r8d
+%define SIGN rdi
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16]
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO
+
+ xor SIGN, SIGN
+ xor EOB, EOB
+ xor KK, KK
+ movd AL, r13d
+ pxor ZERO, ZERO
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for signbits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 16
+ dec K
+ jnz .BLOOPR16
+.ELOOPR16:
+ test LEN, 8
+ jz .TRYR7
+ test LEN, 7
+ jz .TRYR8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for signbits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for signbits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for signbits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ shr SIGN, 8
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ not SIGN
+ sub VALUES, DCTSIZE2*2
+ mov MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+ REDUCE0
+
+ mov eax, EOB
+ movdqa ZERO, XMMWORD [rbp - 16]
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
+#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
dctbl, actbl);
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+ if (!(simd_support & JSIMD_SSE2))
+ return 0;
+#if defined(HAVE_BUILTIN_CTZL)
+ return 1;
+#elif defined(HAVE_BITSCANFORWARD64)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}