From: DRC Date: Fri, 5 Sep 2014 06:33:42 +0000 (+0000) Subject: AltiVec SIMD implementation of fast forward DCT X-Git-Tag: 1.4.90~150 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cd2d8e1cc77130216e2a2da7fb6e7507bafc8299;p=libjpeg-turbo AltiVec SIMD implementation of fast forward DCT git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1405 632fc199-4ca6-4c93-a231-07263d6284db --- diff --git a/configure.ac b/configure.ac index 42c6150..6361cd4 100644 --- a/configure.ac +++ b/configure.ac @@ -481,6 +481,10 @@ if test "x${with_simd}" != "xno"; then fi fi ;; + powerpc64) + AC_MSG_RESULT([yes (powerpc64)]) + simd_arch=powerpc64 + ;; *) AC_MSG_RESULT([no ("$host_cpu")]) with_simd=no; @@ -506,6 +510,7 @@ AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"]) AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"]) AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"]) AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"]) +AM_CONDITIONAL([SIMD_POWERPC64], [test "x$simd_arch" = "xpowerpc64"]) AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"]) AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"]) diff --git a/simd/Makefile.am b/simd/Makefile.am index 9605e68..06dce03 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -70,6 +70,13 @@ libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S endif +if SIMD_POWERPC64 + +libsimd_la_SOURCES = jsimd_powerpc64.c jsimd_powerpc64_altivec.c +libsimd_la_CFLAGS = -maltivec + +endif + AM_CPPFLAGS = -I$(top_srcdir) .asm.lo: diff --git a/simd/jsimd.h b/simd/jsimd.h index c5abd45..b032972 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -2,7 +2,7 @@ * simd/jsimd.h * * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2011 D. R. Commander + * Copyright (C) 2011, 2014 D. R. 
Commander * Copyright (C) 2013-2014, MIPS Technologies, Inc., California * Copyright (C) 2014 Linaro Limited * @@ -21,6 +21,7 @@ #define JSIMD_SSE2 0x08 #define JSIMD_ARM_NEON 0x10 #define JSIMD_MIPS_DSPR2 0x20 +#define JSIMD_ALTIVEC 0x40 /* SIMD Ext: retrieve SIMD/CPU information */ EXTERN(unsigned int) jpeg_simd_cpu_support (void); @@ -554,6 +555,8 @@ EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM * data); EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM * data); +EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM * data); + /* Floating Point Forward DCT */ EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT * data); diff --git a/simd/jsimd_powerpc64.c b/simd/jsimd_powerpc64.c new file mode 100644 index 0000000..a9a5965 --- /dev/null +++ b/simd/jsimd_powerpc64.c @@ -0,0 +1,358 @@ +/* + * jsimd_powerpc64.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright 2009-2011, 2014 D. R. Commander + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit PowerPC architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +static unsigned int simd_support = ~0; + +LOCAL(void) +init_simd (void) +{ + char *env = NULL; + + if (simd_support != ~0U) + return; + + simd_support = JSIMD_ALTIVEC; + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, 
+ jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace) +{ +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM * data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast 
(DCTELEM * data) +{ + jsimd_fdct_ifast_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT * data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, + DCTELEM * workspace) +{ +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} diff --git a/simd/jsimd_powerpc64_altivec.c b/simd/jsimd_powerpc64_altivec.c new file mode 100644 index 0000000..be42e67 --- /dev/null +++ b/simd/jsimd_powerpc64_altivec.c @@ -0,0 +1,191 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. + * All rights reserved. + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" +#include <altivec.h> + +#define TRANSPOSE(row, col) \ +{ \ + __vector short row04l, row04h, row15l, row15h, \ + row26l, row26h, row37l, row37h; \ + __vector short col01e, col01o, col23e, col23o, \ + col45e, col45o, col67e, col67o; \ + \ + /* transpose coefficients (phase 1) */ \ + row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ + row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ + row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ + row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ + row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ + row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ + row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ + row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ + \ + /* transpose coefficients (phase 2) */ \ + col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ + col23e = 
vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ + col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ + col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ + col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ + col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ + col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ + col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ + \ + /* transpose coefficients (phase 3) */ \ + col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ + col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ + col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ + col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ + col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ + col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ + col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ + col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ +} + +#define PRE_MULTIPLY_SCALE_BITS 2 + +static const __vector short constants __attribute__((aligned(16))) = +{ + 98 << 5, /* FIX(0.382683433) */ + 139 << 5, /* FIX(0.541196100) */ + 181 << 5, /* FIX(0.707106781) */ + 334 << 5 /* FIX(1.306562965) */ +}; + +#define DO_DCT() \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out4 = vec_sub(tmp10, tmp11); \ + \ + z1 = vec_add(tmp12, tmp13); \ + z1 = z1 << PRE_MULTIPLY_SCALE_BITS; \ + z1 = vec_madds(z1, PW_0707, zero); \ + \ + out2 = vec_add(tmp13, z1); \ + out6 = vec_sub(tmp13, z1); \ + \ + /* Odd part */ \ + \ + tmp10 = vec_add(tmp4, tmp5); \ + tmp11 = vec_add(tmp5, 
tmp6); \ + tmp12 = vec_add(tmp6, tmp7); \ + \ + tmp10 = tmp10 << PRE_MULTIPLY_SCALE_BITS; \ + tmp12 = tmp12 << PRE_MULTIPLY_SCALE_BITS; \ + z5 = vec_sub(tmp10, tmp12); \ + z5 = vec_madds(z5, PW_0382, zero); \ + \ + z2 = vec_madds(tmp10, PW_0541, zero); \ + z2 = vec_add(z2, z5); \ + \ + z4 = vec_madds(tmp12, PW_1306, zero); \ + z4 = vec_add(z4, z5); \ + \ + tmp11 = tmp11 << PRE_MULTIPLY_SCALE_BITS; \ + z3 = vec_madds(tmp11, PW_0707, zero); \ + \ + z11 = vec_add(tmp7, z3); \ + z13 = vec_sub(tmp7, z3); \ + \ + out5 = vec_add(z13, z2); \ + out3 = vec_sub(z13, z2); \ + out1 = vec_add(z11, z4); \ + out7 = vec_sub(z11, z4); \ +} + +void +jsimd_fdct_ifast_altivec (DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z1, z2, z3, z4, z5, z11, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short zero = vec_splat_s16(0), + PW_0382 = vec_splat(constants, 0), + PW_0541 = vec_splat(constants, 1), + PW_0707 = vec_splat(constants, 2), + PW_1306 = vec_splat(constants, 3); + + /* Pass 1: process rows. */ + + row0 = *(__vector short *)&data[0]; + row1 = *(__vector short *)&data[8]; + row2 = *(__vector short *)&data[16]; + row3 = *(__vector short *)&data[24]; + row4 = *(__vector short *)&data[32]; + row5 = *(__vector short *)&data[40]; + row6 = *(__vector short *)&data[48]; + row7 = *(__vector short *)&data[56]; + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_DCT(); + + /* Pass 2: process columns. 
*/ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_DCT(); + + *(__vector short *)&data[0] = out0; + *(__vector short *)&data[8] = out1; + *(__vector short *)&data[16] = out2; + *(__vector short *)&data[24] = out3; + *(__vector short *)&data[32] = out4; + *(__vector short *)&data[40] = out5; + *(__vector short *)&data[48] = out6; + *(__vector short *)&data[56] = out7; +}