From 577ecd93f19fdd55be5529306ca8c275e56575fc Mon Sep 17 00:00:00 2001 From: DRC Date: Tue, 23 Dec 2014 04:14:54 +0000 Subject: [PATCH] AltiVec SIMD implementation of sample conversion and integer quantization git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1474 632fc199-4ca6-4c93-a231-07263d6284db --- simd/Makefile.am | 3 +- simd/jquanti-altivec.c | 236 +++++++++++++++++++++++++++++++++++++++++ simd/jsimd.h | 6 ++ simd/jsimd_powerpc.c | 30 ++++++ 4 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 simd/jquanti-altivec.c diff --git a/simd/Makefile.am b/simd/Makefile.am index 7aa36ae..dd0148d 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -75,7 +75,8 @@ if SIMD_POWERPC libsimd_la_SOURCES = jsimd_powerpc.c \ jccolor-altivec.c jcgray-altivec.c \ jfdctfst-altivec.c jfdctint-altivec.c \ - jidctfst-altivec.c jidctint-altivec.c + jidctfst-altivec.c jidctint-altivec.c \ + jquanti-altivec.c libsimd_la_CFLAGS = -maltivec jccolor-altivec.lo: jccolext-altivec.c diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c new file mode 100644 index 0000000..12c97ee --- /dev/null +++ b/simd/jquanti-altivec.c @@ -0,0 +1,236 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. + * All rights reserved. + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_altivec.h" + + +/* NOTE: The address will either be aligned or offset by 8 bytes, so we can + * always get the data we want by using a single vector load (although we may + * have to permute the result.) + */ +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_ld(0, elemptr); \ + if ((size_t)elemptr & 15) \ + in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ +} + +/* Load an 8x8 block of samples (rows 0-7 of sample_data, starting at column start_col), widen each sample to a 16-bit element, subtract CENTERJSAMPLE, and store the eight resulting vectors to workspace. */ +void +jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace) +{ + JSAMPROW elemptr; + __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; + __vector short out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; + __vector unsigned char zero = { __16X(0) }; + /* Fetch one vector per row; LOAD_ROW rotates a misaligned row so that the sample at start_col becomes element 0 */ + LOAD_ROW(0); + LOAD_ROW(1); + LOAD_ROW(2); + LOAD_ROW(3); + LOAD_ROW(4); + LOAD_ROW(5); + LOAD_ROW(6); + LOAD_ROW(7); + /* Merge zero bytes with the first eight samples of each row to form eight 16-bit elements (zero byte first, i.e. the high-order byte on big-endian) */ + out0 = (__vector short)vec_mergeh(zero, in0); + out1 = (__vector short)vec_mergeh(zero, in1); + out2 = (__vector short)vec_mergeh(zero, in2); + out3 = (__vector short)vec_mergeh(zero, in3); + out4 = (__vector short)vec_mergeh(zero, in4); + out5 = (__vector short)vec_mergeh(zero, in5); + out6 = (__vector short)vec_mergeh(zero, in6); + out7 = (__vector short)vec_mergeh(zero, in7); + /* Subtract CENTERJSAMPLE from every element */ + out0 = vec_sub(out0, pw_centerjsamp); + out1 = vec_sub(out1, pw_centerjsamp); + out2 = vec_sub(out2, pw_centerjsamp); + out3 = vec_sub(out3, pw_centerjsamp); + out4 = vec_sub(out4, pw_centerjsamp); + out5 = vec_sub(out5, pw_centerjsamp); + out6 = vec_sub(out6, pw_centerjsamp); + out7 = vec_sub(out7, pw_centerjsamp); + /* Store the rows to workspace at 16-byte intervals */ + vec_st(out0, 0, workspace); + vec_st(out1, 16, workspace); + vec_st(out2, 32, workspace); + 
vec_st(out3, 48, workspace); + vec_st(out4, 64, workspace); + vec_st(out5, 80, workspace); + vec_st(out6, 96, workspace); + vec_st(out7, 112, workspace); +} + + +#define WORD_BIT 16 + +/* There is no AltiVec unsigned multiply instruction, hence this. MULTIPLY() forms the 32-bit products of the even (vec_mule) and odd (vec_mulo) unsigned 16-bit elements, then uses shift_pack_index to gather bytes 0-1 of each product back into one vector of eight 16-bit elements. */ + +#define MULTIPLY(vs0, vs1, out) { \ + tmpe = vec_mule((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + tmpo = vec_mulo((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + out = (__vector short)vec_perm((__vector unsigned short)tmpe, \ + (__vector unsigned short)tmpo, \ + shift_pack_index); \ +} +/* Quantize the 64 coefficients in workspace and store them to coef_block: take absolute values, add the correction terms, multiply by the reciprocals and then by the scale factors (keeping bytes 0-1 of each 32-bit product), and finally restore the signs. The divisors buffer is read at byte offset 0 (reciprocals), DCTSIZE2 * 2 (corrections) and DCTSIZE2 * 4 (scales). */ +void +jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors, + DCTELEM * workspace) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7; + __vector short row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s; + __vector short corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7; + __vector short recip0, recip1, recip2, recip3, recip4, recip5, recip6, + recip7; + __vector short scale0, scale1, scale2, scale3, scale4, scale5, scale6, + scale7; + __vector unsigned int tmpe, tmpo; + + /* Constants */ + __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) }; + __vector unsigned char shift_pack_index = + { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; + /* Load the eight rows of coefficients from workspace */ + row0 = vec_ld(0, workspace); + row1 = vec_ld(16, workspace); + row2 = vec_ld(32, workspace); + row3 = vec_ld(48, workspace); + row4 = vec_ld(64, workspace); + row5 = vec_ld(80, workspace); + row6 = vec_ld(96, workspace); + row7 = vec_ld(112, workspace); + + /* Branch-less absolute value: rowNs = rowN >> (WORD_BIT - 1) is 0 or -1 per element, and (rowN ^ rowNs) - rowNs is then abs(rowN) */ + row0s = vec_sra(row0, pw_word_bit_m1); + row1s = vec_sra(row1, pw_word_bit_m1); + row2s = vec_sra(row2, pw_word_bit_m1); + row3s = vec_sra(row3, pw_word_bit_m1); + row4s = vec_sra(row4, pw_word_bit_m1); + row5s = vec_sra(row5, pw_word_bit_m1); + row6s = vec_sra(row6, pw_word_bit_m1); + row7s = vec_sra(row7, pw_word_bit_m1); + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = 
vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + /* Correction terms are read from byte offset DCTSIZE2 * 2 in divisors */ + corr0 = vec_ld(DCTSIZE2 * 2, divisors); + corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors); + corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors); + corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors); + corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors); + corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors); + corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors); + corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors); + + row0 = vec_add(row0, corr0); + row1 = vec_add(row1, corr1); + row2 = vec_add(row2, corr2); + row3 = vec_add(row3, corr3); + row4 = vec_add(row4, corr4); + row5 = vec_add(row5, corr5); + row6 = vec_add(row6, corr6); + row7 = vec_add(row7, corr7); + /* Reciprocals are read from the first eight vectors of divisors */ + recip0 = vec_ld(0, divisors); + recip1 = vec_ld(16, divisors); + recip2 = vec_ld(32, divisors); + recip3 = vec_ld(48, divisors); + recip4 = vec_ld(64, divisors); + recip5 = vec_ld(80, divisors); + recip6 = vec_ld(96, divisors); + recip7 = vec_ld(112, divisors); + /* Each row becomes bytes 0-1 of every 32-bit unsigned product row * recip */ + MULTIPLY(row0, recip0, row0); + MULTIPLY(row1, recip1, row1); + MULTIPLY(row2, recip2, row2); + MULTIPLY(row3, recip3, row3); + MULTIPLY(row4, recip4, row4); + MULTIPLY(row5, recip5, row5); + MULTIPLY(row6, recip6, row6); + MULTIPLY(row7, recip7, row7); + /* Scale factors are read from byte offset DCTSIZE2 * 4 in divisors */ + scale0 = vec_ld(DCTSIZE2 * 4, divisors); + scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors); + scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors); + scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors); + scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors); + scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors); + scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors); + scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors); + + MULTIPLY(row0, scale0, row0); + 
MULTIPLY(row1, scale1, row1); + MULTIPLY(row2, scale2, row2); + MULTIPLY(row3, scale3, row3); + MULTIPLY(row4, scale4, row4); + MULTIPLY(row5, scale5, row5); + MULTIPLY(row6, scale6, row6); + MULTIPLY(row7, scale7, row7); + /* Restore the saved signs: (row ^ rowNs) - rowNs negates the elements whose rowNs is -1 and leaves the others unchanged */ + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + /* Store the eight result rows to coef_block at 16-byte intervals */ + vec_st(row0, 0, coef_block); + vec_st(row1, 16, coef_block); + vec_st(row2, 32, coef_block); + vec_st(row3, 48, coef_block); + vec_st(row4, 64, coef_block); + vec_st(row5, 80, coef_block); + vec_st(row6, 96, coef_block); + vec_st(row7, 112, coef_block); +} diff --git a/simd/jsimd.h b/simd/jsimd.h index a8009b5..3cb63ec 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -568,6 +568,9 @@ EXTERN(void) jsimd_convsamp_neon EXTERN(void) jsimd_convsamp_mips_dspr2 (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace); +EXTERN(void) jsimd_convsamp_altivec + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace); + /* Floating Point Sample Conversion */ EXTERN(void) jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace); @@ -622,6 +625,9 @@ EXTERN(void) jsimd_quantize_neon EXTERN(void) jsimd_quantize_mips_dspr2 (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace); +EXTERN(void) jsimd_quantize_altivec + (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace); + /* Floating Point Quantization */ EXTERN(void) jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace); diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c index a33bf62..60dad60 
100644 --- a/simd/jsimd_powerpc.c +++ b/simd/jsimd_powerpc.c @@ -290,6 +290,21 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, GLOBAL(int) jsimd_can_convsamp (void) { + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + return 0; } @@ -303,6 +318,7 @@ GLOBAL(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) { + jsimd_convsamp_altivec(sample_data, start_col, workspace); } GLOBAL(void) @@ -371,6 +387,19 @@ jsimd_fdct_float (FAST_FLOAT * data) GLOBAL(int) jsimd_can_quantize (void) { + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + return 0; } @@ -384,6 +413,7 @@ GLOBAL(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { + jsimd_quantize_altivec(coef_block, divisors, workspace); } GLOBAL(void) -- 2.40.0