From c641cddb80871395e81164d586faec7e056336c1 Mon Sep 17 00:00:00 2001 From: DRC Date: Wed, 14 Jan 2015 15:41:11 +0000 Subject: [PATCH] AltiVec SIMD implementation of H2V1 and H2V2 plain upsampling (used only when decompressing YCCK images with fast upsampling enabled.) git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1504 632fc199-4ca6-4c93-a231-07263d6284db --- ChangeLog.txt | 11 ++++++ simd/jdsample-altivec.c | 88 +++++++++++++++++++++++++++++++++++++++++ simd/jsimd.h | 6 +++ simd/jsimd_powerpc.c | 26 ++++++++++++ 4 files changed, 131 insertions(+) diff --git a/ChangeLog.txt b/ChangeLog.txt index 8866455..b61ab2b 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,3 +1,14 @@ +1.5 pre-beta +============ + +[1] Added full SIMD acceleration for PowerPC platforms using AltiVec VMX +(128-bit SIMD) instructions. Although the performance of libjpeg-turbo on +PowerPC was already good, due to the increased number of registers available +to the compiler vs. x86, it was still possible to speed up compression by about +3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the +use of AltiVec instructions. + + 1.4.0 ===== diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c index f49e403..f73b185 100644 --- a/simd/jdsample-altivec.c +++ b/simd/jdsample-altivec.c @@ -294,3 +294,91 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor, } } } + + +/* These are rarely used (mainly just for decompressing YCCK images) */ + +void +jsimd_h2v1_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr); + vec_st(inh, 16, outptr); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr); + vec_st(inh, 48, outptr); + } + } + } +} + + +void +jsimd_h2v2_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr0); + vec_st(inl, 0, outptr1); + + vec_st(inh, 16, outptr0); + vec_st(inh, 16, outptr1); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr0); + vec_st(inl, 32, outptr1); + + vec_st(inh, 48, outptr0); + vec_st(inh, 48, outptr1); + } + } + } +} diff --git a/simd/jsimd.h b/simd/jsimd.h index 4e6e8d7..c0de7e7 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -425,6 +425,12 @@ EXTERN(void) jsimd_int_upsample_mips_dspr2 JSAMPARRAY * output_data_ptr, JDIMENSION output_width, int max_v_samp_factor); +EXTERN(void) jsimd_h2v1_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr); /* Fancy Upsampling */ EXTERN(void) jsimd_h2v1_fancy_upsample_mmx diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c index 1da406e..2fc6814 100644 --- a/simd/jsimd_powerpc.c +++ b/simd/jsimd_powerpc.c @@ -280,12 +280,34 @@ jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(int) jsimd_can_h2v2_upsample (void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_upsample (void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + return 0; } @@ -295,6 +317,8 @@ jsimd_h2v2_upsample (j_decompress_ptr cinfo, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { + jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) @@ -303,6 +327,8 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { + jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) -- 2.40.0