From: DRC
Date: Fri, 20 Feb 2015 19:57:21 +0000 (+0000)
Subject: Extend the AltiVec VMX SIMD routines to support little endian PowerPC platforms.
X-Git-Tag: 1.4.90~99
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=771ab19437ec56548ddae46ff1bebc533a533a83;p=libjpeg-turbo

Extend the AltiVec VMX SIMD routines to support little endian PowerPC platforms.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1529 632fc199-4ca6-4c93-a231-07263d6284db
---

diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
index 39177bb..403aa96 100644
--- a/simd/jccolext-altivec.c
+++ b/simd/jccolext-altivec.c
@@ -1,7 +1,7 @@
 /*
  * AltiVec optimizations for libjpeg-turbo
  *
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
  * Copyright (C) 2014, Jay Foad.
  * All rights reserved.
  * This software is provided 'as-is', without any express or implied
@@ -29,12 +29,18 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
                                     JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr0, outptr1, outptr2;
-  int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
     rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
   __vector unsigned char rgb4 = {0};
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
@@ -50,7 +56,11 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
   __vector int pd_onehalf = { __4X(ONE_HALF) },
     pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
   __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
     shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
@@ -63,6 +73,7 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
          num_cols -= RGB_PIXELSIZE * 16,
          inptr += RGB_PIXELSIZE * 16,
          outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+#if __BIG_ENDIAN__
       /* Load 16 pixels == 48 or 64 bytes */
       offset = (size_t)inptr & 15;
       if (offset) {
@@ -106,28 +117,31 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
 #endif
         }
       } else {
+#endif /* __BIG_ENDIAN__ */
         if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
           /* Slow path */
           memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
-          rgb0 = vec_ld(0, tmpbuf);
-          rgb1 = vec_ld(16, tmpbuf);
-          rgb2 = vec_ld(32, tmpbuf);
+          rgb0 = VEC_LD(0, tmpbuf);
+          rgb1 = VEC_LD(16, tmpbuf);
+          rgb2 = VEC_LD(32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-          rgb3 = vec_ld(48, tmpbuf);
+          rgb3 = VEC_LD(48, tmpbuf);
 #endif
         } else {
           /* Fast path */
-          rgb0 = vec_ld(0, inptr);
+          rgb0 = VEC_LD(0, inptr);
           if (num_cols > 16)
-            rgb1 = vec_ld(16, inptr);
+            rgb1 = VEC_LD(16, inptr);
           if (num_cols > 32)
-            rgb2 = vec_ld(32, inptr);
+            rgb2 = VEC_LD(32, inptr);
 #if RGB_PIXELSIZE == 4
           if (num_cols > 48)
-            rgb3 = vec_ld(48, inptr);
+            rgb3 = VEC_LD(48, inptr);
 #endif
         }
+#if __BIG_ENDIAN__
       }
+#endif
 
 #if RGB_PIXELSIZE == 3
       /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
@@ -167,14 +181,14 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
        * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
        * support unsigned vectors.
        */
-      rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
-      bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
-      rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
-      bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
-      rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
-      bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
-      rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
-      bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
 
       /* (Original)
        * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
index c2bedd6..c171615 100644
--- a/simd/jcgryext-altivec.c
+++ b/simd/jcgryext-altivec.c
@@ -1,7 +1,7 @@
 /*
  * AltiVec optimizations for libjpeg-turbo
  *
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
  * Copyright (C) 2014, Jay Foad.
  * All rights reserved.
  * This software is provided 'as-is', without any express or implied
@@ -30,12 +30,18 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
                                      JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr;
-  int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
     rgbg0, rgbg1, rgbg2, rgbg3, y;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
   __vector unsigned char rgb4 = {0};
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
@@ -47,7 +53,11 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
     pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
     shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
@@ -58,6 +68,7 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
          num_cols -= RGB_PIXELSIZE * 16,
          inptr += RGB_PIXELSIZE * 16,
          outptr += 16) {
+#if __BIG_ENDIAN__
       /* Load 16 pixels == 48 or 64 bytes */
       offset = (size_t)inptr & 15;
       if (offset) {
@@ -123,6 +134,18 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
 #endif
         }
       }
+#else
+      /* Little endian */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
 
 #if RGB_PIXELSIZE == 3
       /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
@@ -162,14 +185,14 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
-      rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
-      bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
-      rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
-      bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
-      rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
-      bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
-      rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
-      bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
 
       /* (Original)
        * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
diff --git a/simd/jcsample-altivec.c b/simd/jcsample-altivec.c
index 517e2be..603492d 100644
--- a/simd/jcsample-altivec.c
+++ b/simd/jcsample-altivec.c
@@ -58,8 +58,8 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
 
       this0 = vec_ld(0, inptr);
       this0 = vec_perm(this0, this0, even_odd_index);
-      this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
-      this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
       outl = vec_add(this0e, this0o);
       outl = vec_add(outl, pw_bias);
       outl = vec_sr(outl, pw_one);
@@ -67,8 +67,8 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
       if (outcol > 8) {
         next0 = vec_ld(16, inptr);
         next0 = vec_perm(next0, next0, even_odd_index);
-        next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
-        next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
         outh = vec_add(next0e, next0o);
         outh = vec_add(outh, pw_bias);
         outh = vec_sr(outh, pw_one);
@@ -118,14 +118,14 @@ jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
 
       this0 = vec_ld(0, inptr0);
       this0 = vec_perm(this0, this0, even_odd_index);
-      this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
-      this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
       out0l = vec_add(this0e, this0o);
 
       this1 = vec_ld(0, inptr1);
       this1 = vec_perm(this1, this1, even_odd_index);
-      this1e = (__vector unsigned short)vec_mergeh(pb_zero, this1);
-      this1o = (__vector unsigned short)vec_mergel(pb_zero, this1);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
       out1l = vec_add(this1e, this1o);
 
       outl = vec_add(out0l, out1l);
@@ -135,14 +135,14 @@ jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
       if (outcol > 8) {
         next0 = vec_ld(16, inptr0);
         next0 = vec_perm(next0, next0, even_odd_index);
-        next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
-        next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
         out0h = vec_add(next0e, next0o);
 
         next1 = vec_ld(16, inptr1);
         next1 = vec_perm(next1, next1, even_odd_index);
-        next1e = (__vector unsigned short)vec_mergeh(pb_zero, next1);
-        next1o = (__vector unsigned short)vec_mergel(pb_zero, next1);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
         out1h = vec_add(next1e, next1o);
 
         outh = vec_add(out0h, out1h);
diff --git a/simd/jdcolext-altivec.c b/simd/jdcolext-altivec.c
index 1b8311d..1ae91b9 100644
--- a/simd/jdcolext-altivec.c
+++ b/simd/jdcolext-altivec.c
@@ -28,13 +28,22 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
                                     JSAMPARRAY output_buf, int num_rows)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
-  int pitch = out_width * RGB_PIXELSIZE, offset, num_cols;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 
   __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
-    y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
 #if RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3, out4;
+  __vector unsigned char rgb3;
 #endif
   __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
     crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
@@ -51,7 +60,11 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
     pw_cj = { __8X(CENTERJSAMPLE) };
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
     shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
 
   while (--num_rows >= 0) {
     inptr0 = input_buf[0][input_row];
@@ -68,18 +81,18 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
       /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
        * support unsigned vectors.
        */
-      yl = (__vector signed short)vec_mergeh(pb_zero, y);
-      yh = (__vector signed short)vec_mergel(pb_zero, y);
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
 
       cb = vec_ld(0, inptr1);
-      cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
-      cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
       cbl = vec_sub(cbl, pw_cj);
       cbh = vec_sub(cbh, pw_cj);
 
       cr = vec_ld(0, inptr2);
-      crl = (__vector signed short)vec_mergeh(pb_zero, cr);
-      crh = (__vector signed short)vec_mergel(pb_zero, cr);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
       crl = vec_sub(crl, pw_cj);
       crh = vec_sub(crh, pw_cj);
 
@@ -181,6 +194,7 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
       rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
 #endif
 
+#if __BIG_ENDIAN__
       offset = (size_t)outptr & 15;
       if (offset) {
         __vector unsigned char unaligned_shift_index;
@@ -230,28 +244,31 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
 #endif
         }
       } else {
+#endif /* __BIG_ENDIAN__ */
         if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
           /* Slow path */
-          vec_st(rgb0, 0, tmpbuf);
-          vec_st(rgb1, 16, tmpbuf);
-          vec_st(rgb2, 32, tmpbuf);
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-          vec_st(rgb3, 48, tmpbuf);
+          VEC_ST(rgb3, 48, tmpbuf);
 #endif
           memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
         } else {
           /* Fast path */
-          vec_st(rgb0, 0, outptr);
+          VEC_ST(rgb0, 0, outptr);
           if (num_cols > 16)
-            vec_st(rgb1, 16, outptr);
+            VEC_ST(rgb1, 16, outptr);
           if (num_cols > 32)
-            vec_st(rgb2, 32, outptr);
+            VEC_ST(rgb2, 32, outptr);
 #if RGB_PIXELSIZE == 4
           if (num_cols > 48)
-            vec_st(rgb3, 48, outptr);
+            VEC_ST(rgb3, 48, outptr);
 #endif
         }
+#if __BIG_ENDIAN__
       }
+#endif
     }
   }
 }
diff --git a/simd/jdmrgext-altivec.c b/simd/jdmrgext-altivec.c
index 0b92e7e..3b6950d 100644
--- a/simd/jdmrgext-altivec.c
+++ b/simd/jdmrgext-altivec.c
@@ -29,13 +29,22 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
                                          JSAMPARRAY output_buf)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
-  int pitch = output_width * RGB_PIXELSIZE, offset, num_cols, yloop;
+  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 
   __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
-    y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
 #if RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3, out4;
+  __vector unsigned char rgb3;
 #endif
   __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
     crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
@@ -53,9 +62,15 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
     pw_cj = { __8X(CENTERJSAMPLE) };
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
     shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
     even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
     odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
 
   inptr0 = input_buf[0][in_row_group_ctr];
   inptr1 = input_buf[1][in_row_group_ctr];
@@ -68,14 +83,14 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
   /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
    * support unsigned vectors.
    */
-  cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
-  cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+  cbl = (__vector signed short)VEC_UNPACKHU(cb);
+  cbh = (__vector signed short)VEC_UNPACKLU(cb);
   cbl = vec_sub(cbl, pw_cj);
   cbh = vec_sub(cbh, pw_cj);
 
   cr = vec_ld(0, inptr2);
-  crl = (__vector signed short)vec_mergeh(pb_zero, cr);
-  crh = (__vector signed short)vec_mergel(pb_zero, cr);
+  crl = (__vector signed short)VEC_UNPACKHU(cr);
+  crh = (__vector signed short)VEC_UNPACKLU(cr);
   crl = vec_sub(crl, pw_cj);
   crh = vec_sub(crh, pw_cj);
 
@@ -204,6 +219,7 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
     rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
 #endif
 
+#if __BIG_ENDIAN__
     offset = (size_t)outptr & 15;
     if (offset) {
       __vector unsigned char unaligned_shift_index;
@@ -253,28 +269,31 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
 #endif
       }
     } else {
+#endif /* __BIG_ENDIAN__ */
       if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
         /* Slow path */
-        vec_st(rgb0, 0, tmpbuf);
-        vec_st(rgb1, 16, tmpbuf);
-        vec_st(rgb2, 32, tmpbuf);
+        VEC_ST(rgb0, 0, tmpbuf);
+        VEC_ST(rgb1, 16, tmpbuf);
+        VEC_ST(rgb2, 32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-        vec_st(rgb3, 48, tmpbuf);
+        VEC_ST(rgb3, 48, tmpbuf);
 #endif
         memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
       } else {
        /* Fast path */
-        vec_st(rgb0, 0, outptr);
+        VEC_ST(rgb0, 0, outptr);
        if (num_cols > 16)
-          vec_st(rgb1, 16, outptr);
+          VEC_ST(rgb1, 16, outptr);
        if (num_cols > 32)
-          vec_st(rgb2, 32, outptr);
+          VEC_ST(rgb2, 32, outptr);
 #if RGB_PIXELSIZE == 4
        if (num_cols > 48)
-          vec_st(rgb3, 48, outptr);
+          VEC_ST(rgb3, 48, outptr);
 #endif
       }
+#if __BIG_ENDIAN__
     }
+#endif
   }
 }
 }
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
index f73b185..6b77d04 100644
--- a/simd/jdsample-altivec.c
+++ b/simd/jdsample-altivec.c
@@ -46,7 +46,11 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
     last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
     next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
     next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
   __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
 
   for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
@@ -80,12 +84,12 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
     this0l = vec_mergeh(this0e, this0o);
     this0h = vec_mergel(this0e, this0o);
 
-    last0l = (__vector short)vec_mergeh(pb_zero, p_last0);
-    last0h = (__vector short)vec_mergel(pb_zero, p_last0);
+    last0l = (__vector short)VEC_UNPACKHU(p_last0);
+    last0h = (__vector short)VEC_UNPACKLU(p_last0);
     last0l = vec_add(last0l, pw_one);
 
-    next0l = (__vector short)vec_mergeh(pb_zero, p_next0);
-    next0h = (__vector short)vec_mergel(pb_zero, p_next0);
+    next0l = (__vector short)VEC_UNPACKHU(p_next0);
+    next0h = (__vector short)VEC_UNPACKLU(p_next0);
     next0l = vec_add(next0l, pw_two);
 
     outle = vec_add(this0l, last0l);
@@ -143,7 +147,11 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
     last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
     next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
     next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
   __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
     pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
   __vector unsigned short pw_four = { __8X(4) };
@@ -163,14 +171,14 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
     }
 
     this0 = vec_ld(0, inptr0);
-    this0l = (__vector short)vec_mergeh(pb_zero, this0);
-    this0h = (__vector short)vec_mergel(pb_zero, this0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
     this0l = vec_mladd(this0l, pw_three, pw_zero);
     this0h = vec_mladd(this0h, pw_three, pw_zero);
 
     this_1 = vec_ld(0, inptr_1);
-    this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
-    this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
     thiscolsum_1l = vec_add(this0l, this_1l);
     thiscolsum_1h = vec_add(this0h, this_1h);
     lastcolsum_1h = thiscolsum_1h;
@@ -178,8 +186,8 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
     p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
 
     this1 = vec_ld(0, inptr1);
-    this1l = (__vector short)vec_mergeh(pb_zero, this1);
-    this1h = (__vector short)vec_mergel(pb_zero, this1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
     thiscolsum1l = vec_add(this0l, this1l);
     thiscolsum1h = vec_add(this0h, this1h);
     lastcolsum1h = thiscolsum1h;
@@ -207,22 +215,22 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                  next_index_lastcol);
     } else {
       this0 = vec_ld(16, inptr0);
-      this0l = (__vector short)vec_mergeh(pb_zero, this0);
-      this0h = (__vector short)vec_mergel(pb_zero, this0);
+      this0l = (__vector short)VEC_UNPACKHU(this0);
+      this0h = (__vector short)VEC_UNPACKLU(this0);
       this0l = vec_mladd(this0l, pw_three, pw_zero);
       this0h = vec_mladd(this0h, pw_three, pw_zero);
 
       this_1 = vec_ld(16, inptr_1);
-      this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
-      this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+      this_1l = (__vector short)VEC_UNPACKHU(this_1);
+      this_1h = (__vector short)VEC_UNPACKLU(this_1);
       nextcolsum_1l = vec_add(this0l, this_1l);
       nextcolsum_1h = vec_add(this0h, this_1h);
       p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
       p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
 
       this1 = vec_ld(16, inptr1);
-      this1l = (__vector short)vec_mergeh(pb_zero, this1);
-      this1h = (__vector short)vec_mergel(pb_zero, this1);
+      this1l = (__vector short)VEC_UNPACKHU(this1);
+      this1h = (__vector short)VEC_UNPACKLU(this1);
       nextcolsum1l = vec_add(this0l, this1l);
       nextcolsum1h = vec_add(this0h, this1h);
       p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
index d57d691..2fbec2d 100644
--- a/simd/jquanti-altivec.c
+++ b/simd/jquanti-altivec.c
@@ -1,7 +1,7 @@
 /*
  * AltiVec optimizations for libjpeg-turbo
  *
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
  * All rights reserved.
  * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
@@ -29,6 +29,8 @@
  * always get the data we want by using a single vector load (although we may
  * have to permute the result.)
  */
+#if __BIG_ENDIAN__
+
 #define LOAD_ROW(row) { \
   elemptr = sample_data[row] + start_col; \
   in##row = vec_ld(0, elemptr); \
@@ -36,6 +38,15 @@
   in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
 }
 
+#else
+
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
 
 void
 jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
@@ -59,14 +70,14 @@ jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
   LOAD_ROW(6);
   LOAD_ROW(7);
 
-  out0 = (__vector short)vec_mergeh(pb_zero, in0);
-  out1 = (__vector short)vec_mergeh(pb_zero, in1);
-  out2 = (__vector short)vec_mergeh(pb_zero, in2);
-  out3 = (__vector short)vec_mergeh(pb_zero, in3);
-  out4 = (__vector short)vec_mergeh(pb_zero, in4);
-  out5 = (__vector short)vec_mergeh(pb_zero, in5);
-  out6 = (__vector short)vec_mergeh(pb_zero, in6);
-  out7 = (__vector short)vec_mergeh(pb_zero, in7);
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
 
   out0 = vec_sub(out0, pw_centerjsamp);
   out1 = vec_sub(out1, pw_centerjsamp);
@@ -116,8 +127,13 @@ jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
 
   /* Constants */
   __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
   __vector unsigned char shift_pack_index =
     {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+  __vector unsigned char shift_pack_index =
+    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
 
   row0 = vec_ld(0, workspace);
   row1 = vec_ld(16, workspace);
diff --git a/simd/jsimd_altivec.h b/simd/jsimd_altivec.h
index e800206..2660219 100644
--- a/simd/jsimd_altivec.h
+++ b/simd/jsimd_altivec.h
@@ -1,7 +1,7 @@
 /*
  * AltiVec optimizations for libjpeg-turbo
  *
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
  * All rights reserved.
  * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
@@ -78,3 +78,22 @@
 #ifndef min
 #define min(a,b) ((a) < (b) ? (a) : (b))
 #endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif
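
As a quick illustration of the endian abstraction this patch adds to jsimd_altivec.h, the following minimal, self-contained sketch shows how VEC_LD and VEC_UNPACKHU behave on either byte order. The macro bodies are copied from the patch; the test buffer, the main() harness, and the build flags (e.g. gcc -O2 -maltivec -mvsx) are illustrative assumptions and are not part of libjpeg-turbo.

  /* Minimal sketch (not part of the patch): zero-extend the first eight bytes
   * of a vector to unsigned 16-bit lanes using the VEC_LD/VEC_UNPACKHU
   * abstractions.  Assumes GCC or Clang on PowerPC with -maltivec (plus -mvsx
   * for the little endian path).  Expected output on either endianness:
   * 0 1 2 3 4 5 6 7
   */
  #include <altivec.h>
  #include <stdio.h>
  #include <string.h>

  #if __BIG_ENDIAN__
  #define VEC_LD(a, b) vec_ld(a, b)              /* lvx: needs 16-byte alignment */
  #define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) /* zero goes in the high byte */
  #else
  #define VEC_LD(a, b) vec_vsx_ld(a, b)          /* VSX load: no alignment requirement */
  #define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) /* operands swapped so zero still
                                                    lands in the high byte of each lane */
  #endif

  int main (void)
  {
    unsigned char __attribute__((aligned(16))) buf[16] =
      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    unsigned short out[8];
    int i;
    __vector unsigned char pb_zero = vec_splat_u8(0);
    __vector unsigned char in = VEC_LD(0, buf);
    /* Widen bytes 0..7 to unsigned 16-bit lanes, the same way the color
     * conversion code widens R/G/B samples before the multiply-add steps. */
    __vector unsigned short hi = (__vector unsigned short)VEC_UNPACKHU(in);

    memcpy(out, &hi, sizeof(out));
    for (i = 0; i < 8; i++)
      printf("%d ", out[i]);
    printf("\n");
    return 0;
  }

The operand swap in the little endian variant is what keeps the zero byte in the most significant half of each 16-bit lane, so the arithmetic that follows (vec_sub, vec_madds, and so on) sees the same lane values on both byte orders; VEC_ST and VEC_UNPACKLU in the patch follow the same pattern for stores and for the upper eight bytes.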