From: mayeut Date: Fri, 2 Mar 2018 21:33:19 +0000 (+0100) Subject: C/SSE2 optimization of encode_mcu_AC_refine() X-Git-Tag: 1.5.90~13 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=16bd984557fa2c490be0b9665e2ea0d4274528a8;p=libjpeg-turbo C/SSE2 optimization of encode_mcu_AC_refine() This commit adds C and SSE2 optimizations for the encode_mcu_AC_refine() function used in progressive Huffman encoding. The image used for testing can be retrieved from this page: https://blog.cloudflare.com/doubling-the-speed-of-jpegtran All timings done on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz` clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)` gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0` gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0` Here are the results in comparison to libjpeg-turbo@3c54642 using `time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg` C clang x86_64: +7% gcc-5 x86_64: +30% gcc-7 x86_64: +33% clang i386: +0% gcc-5 i386: +24% gcc-7 i386: +23% SSE2 clang x86_64: +42% gcc-5 x86_64: +53% gcc-7 x86_64: +64% clang i386: +35% gcc-5 i386: +46% gcc-7 i386: +49% Discussion in libjpeg-turbo/libjpeg-turbo#46 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 312eab4..7042e3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -341,10 +341,19 @@ set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMA message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}") include(CheckCSourceCompiles) +include(CheckIncludeFiles) +include(CheckTypeSize) + +check_type_size("size_t" SIZE_T) +check_type_size("unsigned long" UNSIGNED_LONG) + +if(SIZE_T EQUAL UNSIGNED_LONG) + check_c_source_compiles("int main(int argc, char **argv) { unsigned long a = argc; return __builtin_ctzl(a); }" + HAVE_BUILTIN_CTZL) +endif() if(UNIX) # Check for headers - include(CheckIncludeFiles) check_include_files(locale.h HAVE_LOCALE_H) check_include_files(stddef.h HAVE_STDDEF_H) check_include_files(stdlib.h 
HAVE_STDLIB_H) @@ -359,10 +368,8 @@ if(UNIX) endif() # Check for types - include(CheckTypeSize) check_type_size("unsigned char" UNSIGNED_CHAR) check_type_size("unsigned short" UNSIGNED_SHORT) - check_type_size("size_t" SIZE_T) # Check for compiler features check_c_source_compiles("int main(void) { typedef struct undefined_structure *undef_struct_ptr; }" @@ -408,6 +415,7 @@ if(UNIX) endif() if(MSVC) + check_include_files("intrin.h" HAVE_INTRIN_H) set(INLINE_OPTIONS "__inline;inline") else() set(INLINE_OPTIONS "__inline__;inline") diff --git a/jconfigint.h.in b/jconfigint.h.in index d9b46ea..55df053 100644 --- a/jconfigint.h.in +++ b/jconfigint.h.in @@ -13,9 +13,19 @@ /* Version number of package */ #define VERSION "@VERSION@" -#ifndef _WIN32 - /* The size of `size_t', as computed by sizeof. */ #define SIZEOF_SIZE_T @SIZE_T@ +/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */ +#cmakedefine HAVE_BUILTIN_CTZL + +/* Define to 1 if you have the <intrin.h> header file. */ +#cmakedefine HAVE_INTRIN_H + +#if defined(_MSC_VER) && defined(HAVE_INTRIN_H) +#if (SIZEOF_SIZE_T == 8) +#define HAVE_BITSCANFORWARD64 +#elif (SIZEOF_SIZE_T == 4) +#define HAVE_BITSCANFORWARD +#endif #endif diff --git a/jcphuff.c b/jcphuff.c index 37df6de..31f1db3 100644 --- a/jcphuff.c +++ b/jcphuff.c @@ -5,6 +5,7 @@ * Copyright (C) 1995-1997, Thomas G. Lane. * libjpeg-turbo Modifications: * Copyright (C) 2011, 2015, 2018, D. R. Commander. + * Copyright (C) 2016, 2018, Matthieu Darbois. * For conditions of distribution and use, see the accompanying README.ijg * file.
* @@ -18,9 +19,22 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jchuff.h" /* Declarations shared with jchuff.c */ +#include "jsimd.h" +#include "jconfigint.h" #include <limits.h> +#ifdef HAVE_INTRIN_H +#include <intrin.h> +#ifdef _MSC_VER +#ifdef HAVE_BITSCANFORWARD64 +#pragma intrinsic(_BitScanForward64) +#endif +#ifdef HAVE_BITSCANFORWARD +#pragma intrinsic(_BitScanForward) +#endif +#endif +#endif + #ifdef C_PROGRESSIVE_SUPPORTED /* @@ -59,6 +73,11 @@ typedef struct { struct jpeg_entropy_encoder pub; /* public fields */ + /* Pointer to routine to prepare data for encode_mcu_AC_refine() */ + int (*AC_refine_prepare) (const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits); + /* Mode flag: TRUE for optimization, FALSE for actual data output */ boolean gather_statistics; @@ -120,6 +139,8 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr; #define IRIGHT_SHIFT(x, shft) ((x) >> (shft)) #endif +#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1))) + /* Forward declarations */ METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data); @@ -127,12 +148,41 @@ METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data); METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data); +METHODDEF(int) encode_mcu_AC_refine_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data); METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo); METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo); +/* Count bit loop zeroes */ +INLINE +METHODDEF(int) +count_zeroes(size_t *x) +{ + int result; +#if defined(HAVE_BUILTIN_CTZL) + result = __builtin_ctzl(*x); + *x >>= result; +#elif defined(HAVE_BITSCANFORWARD64) + _BitScanForward64(&result, *x); + *x >>= result; +#elif defined(HAVE_BITSCANFORWARD) +
_BitScanForward(&result, *x); + *x >>= result; +#else + result = 0; + while ((*x & 1) == 0) { + ++result; + *x >>= 1; + } +#endif + return result; +} + + /* * Initialize for a Huffman-compressed scan using progressive JPEG. */ @@ -163,6 +213,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics) entropy->pub.encode_mcu = encode_mcu_DC_refine; else { entropy->pub.encode_mcu = encode_mcu_AC_refine; + if (jsimd_can_encode_mcu_AC_refine_prepare()) + entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare; + else + entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare; /* AC refinement needs a correction bit buffer */ if (entropy->bit_buffer == NULL) entropy->bit_buffer = (char *) @@ -637,23 +691,149 @@ encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) } +/* + * Data preparation for encode_mcu_AC_refine(). + */ + +#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \ + /* It is convenient to make a pre-pass to determine the transformed \ + * coefficients' absolute values and the EOB position. \ + */ \ + for (k = 0; k < Sl; k++) { \ + temp = block[jpeg_natural_order_start[k]]; \ + /* We must apply the point transform by Al. For AC coefficients this \ + * is an integer division with rounding towards 0. To do this portably \ + * in C, we shift after obtaining the absolute value. 
\ + */ \ + temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp ^= temp2; \ + temp -= temp2; /* temp is abs value of input */ \ + temp >>= Al; /* apply the point transform */ \ + if (temp != 0) { \ + zerobits |= ((size_t)1U) << k; \ + signbits |= ((size_t)(temp2 + 1)) << k; \ + } \ + absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \ + if (temp == 1) \ + EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \ + } \ +} + +METHODDEF(int) +encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + register int k, temp, temp2; + int EOB = 0; + size_t zerobits = 0U, signbits = 0U; + int Sl0 = Sl; + +#if SIZEOF_SIZE_T == 4 + if (Sl0 > 32) + Sl0 = 32; +#endif + + COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0); + + bits[0] = zerobits; +#if SIZEOF_SIZE_T == 8 + bits[1] = signbits; +#else + bits[2] = signbits; + + zerobits = 0U; + signbits = 0U; + + if (Sl > 32) { + Sl -= 32; + jpeg_natural_order_start += 32; + absvalues += 32; + + COMPUTE_ABSVALUES_AC_REFINE(Sl, 32); + } + + bits[1] = zerobits; + bits[3] = signbits; +#endif + + return EOB; +} + + /* * MCU encoding for AC successive approximation refinement scan. */ +#define ENCODE_COEFS_AC_REFINE(label) { \ + while (zerobits) { \ + int idx = count_zeroes(&zerobits); \ + r += idx; \ + cabsvalue += idx; \ + signbits >>= idx; \ +label \ + /* Emit any required ZRLs, but not if they can be folded into EOB */ \ + while (r > 15 && (cabsvalue <= EOBPTR)) { \ + /* emit any pending EOBRUN and the BE correction bits */ \ + emit_eobrun(entropy); \ + /* Emit ZRL */ \ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \ + r -= 16; \ + /* Emit buffered correction bits that must be associated with ZRL */ \ + emit_buffered_bits(entropy, BR_buffer, BR); \ + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \ + BR = 0; \ + } \ + \ + temp = *cabsvalue++; \ + \ + /* If the coef was previously nonzero, it only needs a correction bit. 
\ + * NOTE: a straight translation of the spec's figure G.7 would suggest \ + * that we also need to test r > 15. But if r > 15, we can only get here \ + * if k > EOB, which implies that this coefficient is not 1. \ + */ \ + if (temp > 1) { \ + /* The correction bit is the next bit of the absolute value. */ \ + BR_buffer[BR++] = (char)(temp & 1); \ + signbits >>= 1; \ + zerobits >>= 1; \ + continue; \ + } \ + \ + /* Emit any pending EOBRUN and the BE correction bits */ \ + emit_eobrun(entropy); \ + \ + /* Count/emit Huffman symbol for run length / number of bits */ \ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \ + \ + /* Emit output bit for newly-nonzero coef */ \ + temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \ + emit_bits(entropy, (unsigned int)temp, 1); \ + \ + /* Emit buffered correction bits that must be associated with this code */ \ + emit_buffered_bits(entropy, BR_buffer, BR); \ + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \ + BR = 0; \ + r = 0; /* reset zero run length */ \ + signbits >>= 1; \ + zerobits >>= 1; \ + } \ +} + METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) { phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy; - register int temp, temp3; - register int r, k; - int EOB; + register int temp, r; char *BR_buffer; unsigned int BR; - int Se = cinfo->Se; + int Sl = cinfo->Se - cinfo->Ss + 1; int Al = cinfo->Al; - JBLOCKROW block; - int absvalues[DCTSIZE2]; + JCOEF absvalues_unaligned[DCTSIZE2 + 15]; + JCOEF *absvalues; + const JCOEF *cabsvalue, *EOBPTR; + size_t zerobits, signbits; + size_t bits[16 / SIZEOF_SIZE_T]; entropy->next_output_byte = cinfo->dest->next_output_byte; entropy->free_in_buffer = cinfo->dest->free_in_buffer; @@ -663,27 +843,17 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (entropy->restarts_to_go == 0) emit_restart(entropy, entropy->next_restart_num); - /* Encode the MCU data block */ - block = 
MCU_data[0]; +#ifdef WITH_SIMD + cabsvalue = absvalues = (JCOEF *)PAD((size_t)absvalues_unaligned, 16); +#else + /* Not using SIMD, so alignment is not needed */ + cabsvalue = absvalues = absvalues_unaligned; +#endif - /* It is convenient to make a pre-pass to determine the transformed - * coefficients' absolute values and the EOB position. - */ - EOB = 0; - for (k = cinfo->Ss; k <= Se; k++) { - temp = (*block)[jpeg_natural_order[k]]; - /* We must apply the point transform by Al. For AC coefficients this - * is an integer division with rounding towards 0. To do this portably - * in C, we shift after obtaining the absolute value. - */ - temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - temp ^= temp3; - temp -= temp3; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ - absvalues[k] = temp; /* save abs value for main pass */ - if (temp == 1) - EOB = k; /* EOB = index of last newly-nonzero coef */ - } + /* Prepare data */ + EOBPTR = absvalues + + entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss, + Sl, Al, absvalues, bits); /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */ @@ -691,52 +861,32 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data) BR = 0; /* BR = count of buffered bits added now */ BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */ - for (k = cinfo->Ss; k <= Se; k++) { - if ((temp = absvalues[k]) == 0) { - r++; - continue; - } - - /* Emit any required ZRLs, but not if they can be folded into EOB */ - while (r > 15 && k <= EOB) { - /* emit any pending EOBRUN and the BE correction bits */ - emit_eobrun(entropy); - /* Emit ZRL */ - emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); - r -= 16; - /* Emit buffered correction bits that must be associated with ZRL */ - emit_buffered_bits(entropy, BR_buffer, BR); - BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ - BR = 0; - } - - /* If the coef was previously nonzero, it only needs a correction bit. 
- * NOTE: a straight translation of the spec's figure G.7 would suggest - * that we also need to test r > 15. But if r > 15, we can only get here - * if k > EOB, which implies that this coefficient is not 1. - */ - if (temp > 1) { - /* The correction bit is the next bit of the absolute value. */ - BR_buffer[BR++] = (char)(temp & 1); - continue; - } - - /* Emit any pending EOBRUN and the BE correction bits */ - emit_eobrun(entropy); - - /* Count/emit Huffman symbol for run length / number of bits */ - emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); + zerobits = bits[0]; +#if SIZEOF_SIZE_T == 8 + signbits = bits[1]; +#else + signbits = bits[2]; +#endif + ENCODE_COEFS_AC_REFINE(); + +#if SIZEOF_SIZE_T == 4 + zerobits = bits[1]; + signbits = bits[3]; + + if (zerobits) { + int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue); + int idx = count_zeroes(&zerobits); + signbits >>= idx; + idx += diff; + r += idx; + cabsvalue += idx; + goto first_iter_ac_refine; + } - /* Emit output bit for newly-nonzero coef */ - temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1; - emit_bits(entropy, (unsigned int)temp, 1); + ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:); +#endif - /* Emit buffered correction bits that must be associated with this code */ - emit_buffered_bits(entropy, BR_buffer, BR); - BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ - BR = 0; - r = 0; /* reset zero run length */ - } + r |= (int)((absvalues + Sl) - cabsvalue); if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ entropy->EOBRUN++; /* count an EOB */ diff --git a/jsimd.h b/jsimd.h index 1b9af30..c134e97 100644 --- a/jsimd.h +++ b/jsimd.h @@ -3,7 +3,7 @@ * * Copyright 2009 Pierre Ossman for Cendio AB * Copyright (C) 2011, 2014, D. R. Commander. - * Copyright (C) 2015, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -103,3 +103,9 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void); + +EXTERN(int) jsimd_encode_mcu_AC_refine_prepare + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); diff --git a/jsimd_none.c b/jsimd_none.c index 58acbe6..5b13d19 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -3,7 +3,7 @@ * * Copyright 2009 Pierre Ossman for Cendio AB * Copyright (C) 2009-2011, 2014, D. R. Commander. - * Copyright (C) 2015, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -389,3 +389,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, { return NULL; } + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 624350c..2b37525 100755 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -108,10 +108,11 @@ endif() if(CPU_TYPE STREQUAL "x86_64") set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm - x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm x86_64/jdmerge-sse2.asm - x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm x86_64/jfdctint-sse2.asm - x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm x86_64/jidctint-sse2.asm - x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm x86_64/jquanti-sse2.asm + x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm + x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm + x86_64/jfdctint-sse2.asm 
x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm + x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm + x86_64/jquanti-sse2.asm x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm) @@ -124,10 +125,11 @@ else() i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm - i386/jcsample-sse2.asm i386/jdcolor-sse2.asm i386/jdmerge-sse2.asm - i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm i386/jfdctint-sse2.asm - i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm i386/jidctint-sse2.asm - i386/jidctred-sse2.asm i386/jquantf-sse2.asm i386/jquanti-sse2.asm + i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm + i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm + i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm + i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm + i386/jquanti-sse2.asm i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm) diff --git a/simd/arm/jsimd.c b/simd/arm/jsimd.c index e706326..e0f1a4c 100644 --- a/simd/arm/jsimd.c +++ b/simd/arm/jsimd.c @@ -4,7 +4,7 @@ * Copyright 2009 Pierre Ossman for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -691,3 +691,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, dctbl, actbl); } + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/simd/arm64/jsimd.c b/simd/arm64/jsimd.c index 942ed3a..bd689ab 100644 --- a/simd/arm64/jsimd.c +++ b/simd/arm64/jsimd.c @@ -4,7 +4,7 @@ * Copyright 2009 Pierre Ossman for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -769,3 +769,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, last_dc_val, dctbl, actbl); } + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm new file mode 100644 index 0000000..078ae4a --- /dev/null +++ b/simd/i386/jcphuff-sse2.asm @@ -0,0 +1,486 @@ +; +; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2) +; +; Copyright (C) 2016, 2018, Matthieu Darbois +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation of data preparation for progressive +; Huffman encoding. See jcphuff.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; -------------------------------------------------------------------------- +; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2() + +%macro LOAD16 0 + pxor N0, N0 + pxor N1, N1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + pinsrw X1, word [BLOCK + T1 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + pinsrw X1, word [BLOCK + T1 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + pinsrw X1, word [BLOCK + T1 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + pinsrw X1, word [BLOCK + T1 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + pinsrw X1, word [BLOCK + T1 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + pinsrw X1, word [BLOCK + T1 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + mov T1, INT [LUT + 15*SIZEOF_INT] + pinsrw X0, word 
[BLOCK + T0 * 2], 7 + pinsrw X1, word [BLOCK + T1 * 2], 7 +%endmacro + +%macro LOAD15 0 + pxor N0, N0 + pxor N1, N1 + pxor X1, X1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + + cmp LENEND, 2 + jl %%.ELOAD15 + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD15 + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD15 + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD15 + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD15 + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD15 + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 6 +%%.ELOAD15: +%endmacro + +%macro LOAD8 0 + pxor N0, N0 + + mov T0, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + 
pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 +%endmacro + +%macro LOAD7 0 + pxor N0, N0 + pxor X0, X0 + + mov T1, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 0 + + cmp LENEND, 2 + jl %%.ELOAD7 + mov T1, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD7 + mov T1, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD7 + mov T1, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD7 + mov T1, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD7 + mov T1, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD7 + mov T1, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 6 +%%.ELOAD7: +%endmacro + +%macro REDUCE0 0 + movdqa xmm0, XMMWORD [VALUES + ( 0*2)] + movdqa xmm1, XMMWORD [VALUES + ( 8*2)] + movdqa xmm2, XMMWORD [VALUES + (16*2)] + movdqa xmm3, XMMWORD [VALUES + (24*2)] + movdqa xmm4, XMMWORD [VALUES + (32*2)] + movdqa xmm5, XMMWORD [VALUES + (40*2)] + movdqa xmm6, XMMWORD [VALUES + (48*2)] + + pcmpeqw xmm0, ZERO + pcmpeqw xmm1, ZERO + pcmpeqw xmm2, ZERO + pcmpeqw xmm3, ZERO + pcmpeqw xmm4, ZERO + pcmpeqw xmm5, ZERO + pcmpeqw xmm6, ZERO + pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] + + packsswb xmm0, xmm1 + packsswb xmm2, xmm3 + packsswb xmm4, xmm5 + packsswb xmm6, xmm7 + + pmovmskb eax, xmm0 + pmovmskb ecx, xmm2 + pmovmskb edx, xmm4 + pmovmskb esi, xmm6 + + shl ecx, 16 + shl esi, 16 + + or eax, ecx + or edx, esi + + not eax + not edx + + mov edi, ZEROBITS + + mov INT [edi], eax + mov INT [edi+SIZEOF_INT], edx +%endmacro + +; +; Prepare data for jsimd_encode_mcu_AC_refine(). 
+; +; GLOBAL(int) +; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *absvalues, +; size_t *bits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *bits + +%define ZERO xmm7 +%define ONE xmm5 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T0w cx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define KK ebp + +%define ZEROBITS INT [esp + 5 * 4] +%define EOB INT [esp + 5 * 4 + 4] +%define LEN INT [esp + 5 * 4 + 8] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 16 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + pcmpeqw ONE, ONE + psrlw ONE, 15 + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov K, INT [eax + 16] + mov INT [T0 + 2 * SIZEOF_INT], -1 + mov INT [T0 + 3 * SIZEOF_INT], -1 + mov ZEROBITS, T0 + mov LEN, K + pxor ZERO, ZERO + and K, -16 + mov EOB, 0 + xor KK, KK + shr K, 4 + jz .ELOOPR16 +.BLOOPR16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = 
_mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER16 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER16: + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + add KK, 2 + dec K + jnz .BLOOPR16 +.ELOOPR16: + mov LENEND, LEN + + test LENEND, 8 + jz .TRYR7 + test LENEND, 7 + jz .TRYR8 + + and LENEND, 7 + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER15 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER15: + add VALUES, 16*2 + jmp .PADDINGR +.TRYR8: + LOAD8 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER8 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER8: + add VALUES, 8*2 + jmp .PADDINGR +.TRYR7: + and LENEND, 7 + LOAD7 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz 
.CONTINUER7 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER7: + add VALUES, 8*2 +.PADDINGR: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDINGR + align 16 +.ZEROLOOPR: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOPR +.EPADDINGR: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + mov eax, EOB + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef ONE +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef KK +%undef EOB +%undef SIGN +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c index f12d1e9..f2315fa 100644 --- a/simd/i386/jsimd.c +++ b/simd/i386/jsimd.c @@ -3,7 +3,7 @@ * * Copyright 2009 Pierre Ossman for Cendio AB * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -21,6 +21,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "jconfigint.h"  /* SIZEOF_SIZE_T, HAVE_BUILTIN_CTZL, HAVE_BITSCANFORWARD */
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -1197,3 +1198,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
   return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
                                           dctbl, actbl);
 }
+/* Report whether jsimd_encode_mcu_AC_refine_prepare() may be used: 1 = yes, 0 = no. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();  /* defined outside this hunk; presumably fills simd_support -- confirm */
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)  /* coefficients must be 2 bytes wide */
+    return 0;
+  if (SIZEOF_SIZE_T != 4)  /* this i386 variant requires a 4-byte size_t */
+    return 0;
+  if (!(simd_support & JSIMD_SSE2))  /* SSE2 flag not set in simd_support */
+    return 0;
+#if defined(HAVE_BUILTIN_CTZL)  /* __builtin_ctzl() usable with size_t (see jconfigint.h) */
+  return 1;
+#elif defined(HAVE_BITSCANFORWARD)  /* MSVC with <intrin.h> and 4-byte size_t */
+  return 1;
+#else  /* neither bit-scan macro is defined: report the routine as unavailable */
+  return 0;
+#endif
+}
+/* Thin wrapper: forwards every argument to the SSE2 routine and returns its result. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 4d1a0ff..b3e82cf 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -5,7 +5,7 @@
  * Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -1072,3 +1072,8 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
 EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
   (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+/* Progressive Huffman encoding: SSE2 prepare pass for encode_mcu_AC_refine() */
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/simd/loongson/jsimd.c b/simd/loongson/jsimd.c
index 132798a..ba1d14e 100644
--- a/simd/loongson/jsimd.c
+++ b/simd/loongson/jsimd.c
@@ -4,7 +4,7 @@
  * Copyright 2009 Pierre Ossman for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
  * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -581,3 +581,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
 {
   return NULL;
 }
+/* This port has no SIMD AC-refine prepare routine: always report it as unavailable. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+/* Stub: ignores all arguments and returns 0. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;  /* NOTE(review): presumably unreachable since the can_ check returns 0 -- confirm */
+}
diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c
index 1ff1053..d06cb97 100644
--- a/simd/mips/jsimd.c
+++ b/simd/mips/jsimd.c
@@ -4,7 +4,7 @@
  * Copyright 2009 Pierre Ossman for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -1086,3 +1086,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
 {
   return NULL;
 }
+/* This port has no SIMD AC-refine prepare routine: always report it as unavailable. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+/* Stub: ignores all arguments and returns 0. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;  /* NOTE(review): presumably unreachable since the can_ check returns 0 -- confirm */
+}
diff --git a/simd/powerpc/jsimd.c b/simd/powerpc/jsimd.c
index 05b221e..023e96d 100644
--- a/simd/powerpc/jsimd.c
+++ b/simd/powerpc/jsimd.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman for Cendio AB
  * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -843,3 +843,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
 {
   return NULL;
 }
+/* This port has no SIMD AC-refine prepare routine: always report it as unavailable. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+/* Stub: ignores all arguments and returns 0. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;  /* NOTE(review): presumably unreachable since the can_ check returns 0 -- confirm */
+}
diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000..4a9f592
--- /dev/null
+++ b/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,474 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+;                    (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
+; LOAD16: clear N0/N1, gather BLOCK[LUT[0..7]] into X0 and BLOCK[LUT[8..15]] into X1.
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0d, INT [LUT + 0*SIZEOF_INT]   ; T0 = LUT[i], T1 = LUT[i + 8]
+    mov         T1d, INT [LUT + 8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0    ; lane i of X0 / X1 <- 16-bit coefficient
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT + 1*SIZEOF_INT]
+    mov         T1d, INT [LUT + 9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0d, INT [LUT + 2*SIZEOF_INT]
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0d, INT [LUT + 3*SIZEOF_INT]
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0d, INT [LUT + 4*SIZEOF_INT]
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0d, INT [LUT + 5*SIZEOF_INT]
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0d, INT [LUT + 6*SIZEOF_INT]
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0d, INT [LUT + 7*SIZEOF_INT]
+    mov         T1d, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+; LOAD15: gather BLOCK[LUT[0..8]] always, then BLOCK[LUT[9..14]] only while LENEND >= 2..7.
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1                          ; X1 lanes that are not loaded stay 0
+
+    mov         T0d, INT [LUT + 0*SIZEOF_INT]
+    mov         T1d, INT [LUT + 8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT + 1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT + 2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT + 3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT + 4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT + 5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT + 6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT + 7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2                       ; stop as soon as LENEND < lane index + 1
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+; LOAD8: clear N0 and gather BLOCK[LUT[0..7]] into X0.
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0d, INT [LUT + 0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0d, INT [LUT + 1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT + 2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT + 3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT + 4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT + 5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT + 6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT + 7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+; LOAD7: clear N0/X0, load BLOCK[LUT[0]] always, then BLOCK[LUT[1..6]] while LENEND >= 2..7.
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0                          ; X0 lanes that are not loaded stay 0
+
+    mov         T1d, INT [LUT + 0*SIZEOF_INT]   ; lane 0 is loaded even when LENEND == 0
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT + 6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+; REDUCE0: store at [r15] a 64-bit mask whose bit k is set iff word k of VALUES is nonzero.
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
+; Each word becomes 0xFFFF where the value is zero, else 0.
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, ZERO
+; Narrow the word masks to byte masks, 16 values per register.
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+; One bit per value: 16-bit "is zero" masks for values 0-15, 16-31, 32-47, 48-63.
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         rcx, 16
+    shl         rdx, 32
+    shl         rsi, 48
+
+    or          rax, rcx
+    or          rdx, rsi
+    or          rax, rdx
+; Invert: "is zero" mask -> "is nonzero" mask.
+    not         rax
+
+    mov         MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+; Stores (|coef| >> Al) for the Sl coefficients in absvalues[], zero-padded to DCTSIZE2.
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+; Returns EOB: index of the last coefficient whose stored value equals 1 (0 if none).
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values (the absvalues argument; must be 16-byte aligned for movdqa)
+; r15 = size_t *bits
+; bits: nonzero mask at [r15]; inverted negative-sign mask at [r15 + SIZEOF_MMWORD].
+%define ZERO    xmm9    ; all-zero constant (caller's xmm9 is saved and restored)
+%define ONE     xmm5    ; eight words of 0x0001
+%define X0      xmm0    ; coefficients / absolute values, lanes 0-7
+%define X1      xmm1    ; coefficients / absolute values, lanes 8-15
+%define N0      xmm2    ; "negative" masks for X0
+%define N1      xmm3    ; "negative" masks for X1
+%define AL      xmm4    ; shift count Al
+%define K       eax     ; loop counter
+%define KK      r9d     ; index of the first coefficient of the current group
+%define EOB     r8d     ; running result
+%define SIGN    rdi     ; accumulated "negative" bits, newest group in the top bits
+%define LUT     r11     ; walks through jpeg_natural_order_start
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d    ; Sl
+%define LENEND  r13d    ; Sl % 8 (reuses the register that held Al)
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rsp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; keep original rsp for the epilogue
+    mov         rbp, rsp                     ; rbp = aligned rsp
+    lea         rsp, [rbp - 16]              ; 16-byte slot used to save ZERO
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO     ; save the incoming xmm9
+
+    xor         SIGN, SIGN
+    xor         EOB, EOB
+    xor         KK, KK
+    movd        AL, r13d                     ; capture Al before r13d becomes LENEND
+    pxor        ZERO, ZERO
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15                      ; 0xFFFF >> 15 = 1 in every word
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7                    ; LENEND = Sl % 8
+    shr         K, 4                         ; K = Sl / 16 = number of full groups
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0                       ; N = 0xFFFF where the coefficient is < 0
+    pcmpgtw     N1, X1
+    paddw       X0, N0                       ; (x + N) ^ N = |x|
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL                       ; |x| >> Al
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE                      ; 0xFFFF where the stored value == 1
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                      ; 16 "negative" bits of this group
+    pmovmskb    T1d, X0                      ; 16 "value == 1" bits of this group
+    shr         SIGN, 16                     ; make room for this group's sign bits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                     ; T1d = highest set bit; ZF if none set
+    jz          .CONTINUER16                 ; no value == 1 in this group
+    mov         EOB, KK
+    add         EOB, T1d                     ; EOB = KK + index within the group
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 16
+    dec         K
+    jnz         .BLOOPR16
+.ELOOPR16:
+    test        LEN, 15
+    jz          .PADDINGR                    ; Sl % 16 == 0: nothing left to load
+    test        LEN, 8
+    jz          .TRYR7                       ; 1..7 coefficients left
+    test        LEN, 7
+    jz          .TRYR8                       ; exactly 8 coefficients left
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                      ; "negative" bits of the tail
+    pmovmskb    T1d, X0                      ; "value == 1" bits of the tail
+    shr         SIGN, 16                     ; make room for 16 sign bits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                     ; highest set bit; ZF if none set
+    jz          .CONTINUER15
+    mov         EOB, KK
+    add         EOB, T1d                     ; EOB = KK + index within the tail
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                      ; 8 "negative" bits
+    pmovmskb    T1d, X0                      ; 8 "value == 1" bits
+    shr         SIGN, 8                      ; make room for 8 sign bits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                     ; highest set bit; ZF if none set
+    jz          .CONTINUER8
+    mov         EOB, KK
+    add         EOB, T1d                     ; EOB = KK + index within the tail
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                      ; 8 "negative" bits
+    pmovmskb    T1d, X0                      ; 8 "value == 1" bits
+    shr         SIGN, 8                      ; make room for 8 sign bits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                     ; highest set bit; ZF if none set
+    jz          .CONTINUER7
+    mov         EOB, KK
+    add         EOB, T1d                     ; EOB = KK + index within the tail
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3                         ; K = number of 8-word rows already written
+    sub         K, DCTSIZE2/8                ; K = -(rows still to be zero-filled)
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    shr         SIGN, 8                      ; keep SIGN aligned with the padded rows
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    not         SIGN                         ; stored bit k is 0 iff coefficient k < 0
+    sub         VALUES, DCTSIZE2*2           ; back to the start of the values buffer
+    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+    REDUCE0
+
+    mov         eax, EOB
+    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore the caller's xmm9
+    uncollect_args 6
+    mov         rsp, rbp                     ; rsp <- aligned rsp
+    pop         rsp                          ; rsp <- original rsp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index a504bd8..10af64d 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,6 +21,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "jconfigint.h"  /* SIZEOF_SIZE_T, HAVE_BUILTIN_CTZL, HAVE_BITSCANFORWARD64 */
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -1020,3 +1021,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
   return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
                                           dctbl, actbl);
 }
+/* Report whether jsimd_encode_mcu_AC_refine_prepare() may be used: 1 = yes, 0 = no. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();  /* defined outside this hunk; presumably fills simd_support -- confirm */
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)  /* the SSE2 routine loads coefficients as 16-bit words */
+    return 0;
+  if (SIZEOF_SIZE_T != 8)  /* the SSE2 routine stores each bit mask as one 64-bit word */
+    return 0;
+  if (!(simd_support & JSIMD_SSE2))  /* SSE2 flag not set in simd_support */
+    return 0;
+#if defined(HAVE_BUILTIN_CTZL)  /* __builtin_ctzl() usable with size_t (see jconfigint.h) */
+  return 1;
+#elif defined(HAVE_BITSCANFORWARD64)  /* MSVC with <intrin.h> and 8-byte size_t */
+  return 1;
+#else  /* neither bit-scan macro is defined: report the routine as unavailable */
+  return 0;
+#endif
+}
+/* Thin wrapper: forwards every argument to the SSE2 routine and returns its result. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}