/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* Copyright (C) 2014, Jay Foad.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
- int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+ __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = {0};
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector int pd_onehalf = { __4X(ONE_HALF) },
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
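+ /* shift_pack_index selects the high-order 16 bits of each 32-bit sum when
+ * packing back down to shorts; those bytes sit at different offsets within
+ * each element on big and little endian, hence the two tables above. */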
while (--num_rows >= 0) {
inptr = *input_buf++;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+#if __BIG_ENDIAN__
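+ /* vec_ld() ignores the low-order bits of the address, so the big-endian
+ * path must realign unaligned input rows by hand.  The little-endian path
+ * uses unaligned VSX loads via VEC_LD() instead. */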
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
+ rgb0 = VEC_LD(0, tmpbuf);
+ rgb1 = VEC_LD(16, tmpbuf);
+ rgb2 = VEC_LD(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
+ rgb3 = VEC_LD(48, tmpbuf);
#endif
} else {
/* Fast path */
- rgb0 = vec_ld(0, inptr);
+ rgb0 = VEC_LD(0, inptr);
if (num_cols > 16)
- rgb1 = vec_ld(16, inptr);
+ rgb1 = VEC_LD(16, inptr);
if (num_cols > 32)
- rgb2 = vec_ld(32, inptr);
+ rgb2 = VEC_LD(32, inptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- rgb3 = vec_ld(48, inptr);
+ rgb3 = VEC_LD(48, inptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
- bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
- rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
- bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
- rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
- bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
- rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
- bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* Copyright (C) 2014, Jay Foad.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr;
- int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+ __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
rgbg0, rgbg1, rgbg2, rgbg3, y;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = {0};
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
while (--num_rows >= 0) {
inptr = *input_buf++;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr += 16) {
+#if __BIG_ENDIAN__
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
#endif
}
}
+#else
+ /* Little endian: use unaligned VSX loads; no alignment fix-up or
+ * temporary buffer is needed. */
+ rgb0 = vec_vsx_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_vsx_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
- bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
- rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
- bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
- rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
- bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
- rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
- bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
this0 = vec_ld(0, inptr);
this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
- this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
outl = vec_add(this0e, this0o);
outl = vec_add(outl, pw_bias);
outl = vec_sr(outl, pw_one);
if (outcol > 8) {
next0 = vec_ld(16, inptr);
next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
- next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
outh = vec_add(next0e, next0o);
outh = vec_add(outh, pw_bias);
outh = vec_sr(outh, pw_one);
this0 = vec_ld(0, inptr0);
this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
- this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
out0l = vec_add(this0e, this0o);
this1 = vec_ld(0, inptr1);
this1 = vec_perm(this1, this1, even_odd_index);
- this1e = (__vector unsigned short)vec_mergeh(pb_zero, this1);
- this1o = (__vector unsigned short)vec_mergel(pb_zero, this1);
+ this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+ this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
out1l = vec_add(this1e, this1o);
outl = vec_add(out0l, out1l);
if (outcol > 8) {
next0 = vec_ld(16, inptr0);
next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
- next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
out0h = vec_add(next0e, next0o);
next1 = vec_ld(16, inptr1);
next1 = vec_perm(next1, next1, even_odd_index);
- next1e = (__vector unsigned short)vec_mergeh(pb_zero, next1);
- next1o = (__vector unsigned short)vec_mergel(pb_zero, next1);
+ next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+ next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
out1h = vec_add(next1e, next1o);
outh = vec_add(out0h, out1h);
JSAMPARRAY output_buf, int num_rows)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = out_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
__vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3, out4;
+ __vector unsigned char rgb3;
#endif
__vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
pw_cj = { __8X(CENTERJSAMPLE) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
/* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- yl = (__vector signed short)vec_mergeh(pb_zero, y);
- yh = (__vector signed short)vec_mergel(pb_zero, y);
+ yl = (__vector signed short)VEC_UNPACKHU(y);
+ yh = (__vector signed short)VEC_UNPACKLU(y);
cb = vec_ld(0, inptr1);
- cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
- cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
cbl = vec_sub(cbl, pw_cj);
cbh = vec_sub(cbh, pw_cj);
cr = vec_ld(0, inptr2);
- crl = (__vector signed short)vec_mergeh(pb_zero, cr);
- crh = (__vector signed short)vec_mergel(pb_zero, cr);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
crl = vec_sub(crl, pw_cj);
crh = vec_sub(crh, pw_cj);
rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif
+#if __BIG_ENDIAN__
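+ /* vec_st() requires 16-byte-aligned addresses, so the big-endian path
+ * assembles unaligned output rows by hand.  The little-endian path uses
+ * unaligned VSX stores via VEC_ST() instead. */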
offset = (size_t)outptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
+ VEC_ST(rgb3, 48, tmpbuf);
#endif
memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
} else {
/* Fast path */
- vec_st(rgb0, 0, outptr);
+ VEC_ST(rgb0, 0, outptr);
if (num_cols > 16)
- vec_st(rgb1, 16, outptr);
+ VEC_ST(rgb1, 16, outptr);
if (num_cols > 32)
- vec_st(rgb2, 32, outptr);
+ VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- vec_st(rgb3, 48, outptr);
+ VEC_ST(rgb3, 48, outptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
}
}
}
JSAMPARRAY output_buf)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = output_width * RGB_PIXELSIZE, offset, num_cols, yloop;
+ int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
__vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3, out4;
+ __vector unsigned char rgb3;
#endif
__vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
pw_cj = { __8X(CENTERJSAMPLE) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+ even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+ odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
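+ /* even_index/odd_index zero-extend the even- and odd-numbered samples to
+ * 16 bits; the zero pad byte precedes the sample byte on big endian but
+ * follows it on little endian. */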
inptr0 = input_buf[0][in_row_group_ctr];
inptr1 = input_buf[1][in_row_group_ctr];
/* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
- cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
cbl = vec_sub(cbl, pw_cj);
cbh = vec_sub(cbh, pw_cj);
cr = vec_ld(0, inptr2);
- crl = (__vector signed short)vec_mergeh(pb_zero, cr);
- crh = (__vector signed short)vec_mergel(pb_zero, cr);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
crl = vec_sub(crl, pw_cj);
crh = vec_sub(crh, pw_cj);
rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif
+#if __BIG_ENDIAN__
offset = (size_t)outptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
+ VEC_ST(rgb3, 48, tmpbuf);
#endif
memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
} else {
/* Fast path */
- vec_st(rgb0, 0, outptr);
+ VEC_ST(rgb0, 0, outptr);
if (num_cols > 16)
- vec_st(rgb1, 16, outptr);
+ VEC_ST(rgb1, 16, outptr);
if (num_cols > 32)
- vec_st(rgb2, 32, outptr);
+ VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- vec_st(rgb3, 48, outptr);
+ VEC_ST(rgb3, 48, outptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
}
}
}
last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
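+ /* merge_pack_index interleaves the low-order byte of each 16-bit value
+ * from two vectors; that byte sits at odd offsets on big endian and even
+ * offsets on little endian. */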
__vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
this0l = vec_mergeh(this0e, this0o);
this0h = vec_mergel(this0e, this0o);
- last0l = (__vector short)vec_mergeh(pb_zero, p_last0);
- last0h = (__vector short)vec_mergel(pb_zero, p_last0);
+ last0l = (__vector short)VEC_UNPACKHU(p_last0);
+ last0h = (__vector short)VEC_UNPACKLU(p_last0);
last0l = vec_add(last0l, pw_one);
- next0l = (__vector short)vec_mergeh(pb_zero, p_next0);
- next0h = (__vector short)vec_mergel(pb_zero, p_next0);
+ next0l = (__vector short)VEC_UNPACKHU(p_next0);
+ next0h = (__vector short)VEC_UNPACKLU(p_next0);
next0l = vec_add(next0l, pw_two);
outle = vec_add(this0l, last0l);
last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
__vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
__vector unsigned short pw_four = { __8X(4) };
}
this0 = vec_ld(0, inptr0);
- this0l = (__vector short)vec_mergeh(pb_zero, this0);
- this0h = (__vector short)vec_mergel(pb_zero, this0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
this0l = vec_mladd(this0l, pw_three, pw_zero);
this0h = vec_mladd(this0h, pw_three, pw_zero);
this_1 = vec_ld(0, inptr_1);
- this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
- this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
thiscolsum_1l = vec_add(this0l, this_1l);
thiscolsum_1h = vec_add(this0h, this_1h);
lastcolsum_1h = thiscolsum_1h;
p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
this1 = vec_ld(0, inptr1);
- this1l = (__vector short)vec_mergeh(pb_zero, this1);
- this1h = (__vector short)vec_mergel(pb_zero, this1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
thiscolsum1l = vec_add(this0l, this1l);
thiscolsum1h = vec_add(this0h, this1h);
lastcolsum1h = thiscolsum1h;
next_index_lastcol);
} else {
this0 = vec_ld(16, inptr0);
- this0l = (__vector short)vec_mergeh(pb_zero, this0);
- this0h = (__vector short)vec_mergel(pb_zero, this0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
this0l = vec_mladd(this0l, pw_three, pw_zero);
this0h = vec_mladd(this0h, pw_three, pw_zero);
this_1 = vec_ld(16, inptr_1);
- this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
- this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
nextcolsum_1l = vec_add(this0l, this_1l);
nextcolsum_1h = vec_add(this0h, this_1h);
p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
this1 = vec_ld(16, inptr1);
- this1l = (__vector short)vec_mergeh(pb_zero, this1);
- this1h = (__vector short)vec_mergel(pb_zero, this1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
nextcolsum1l = vec_add(this0l, this1l);
nextcolsum1h = vec_add(this0h, this1h);
p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* always get the data we want by using a single vector load (although we may
* have to permute the result).
*/
+#if __BIG_ENDIAN__
+
#define LOAD_ROW(row) { \
elemptr = sample_data[row] + start_col; \
in##row = vec_ld(0, elemptr); \
in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}
+#else
+
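+/* On little endian, vec_vsx_ld() handles unaligned addresses directly, so
+ * the vec_lvsl()/vec_perm() realignment used above is unnecessary. */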
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
LOAD_ROW(6);
LOAD_ROW(7);
- out0 = (__vector short)vec_mergeh(pb_zero, in0);
- out1 = (__vector short)vec_mergeh(pb_zero, in1);
- out2 = (__vector short)vec_mergeh(pb_zero, in2);
- out3 = (__vector short)vec_mergeh(pb_zero, in3);
- out4 = (__vector short)vec_mergeh(pb_zero, in4);
- out5 = (__vector short)vec_mergeh(pb_zero, in5);
- out6 = (__vector short)vec_mergeh(pb_zero, in6);
- out7 = (__vector short)vec_mergeh(pb_zero, in7);
+ out0 = (__vector short)VEC_UNPACKHU(in0);
+ out1 = (__vector short)VEC_UNPACKHU(in1);
+ out2 = (__vector short)VEC_UNPACKHU(in2);
+ out3 = (__vector short)VEC_UNPACKHU(in3);
+ out4 = (__vector short)VEC_UNPACKHU(in4);
+ out5 = (__vector short)VEC_UNPACKHU(in5);
+ out6 = (__vector short)VEC_UNPACKHU(in6);
+ out7 = (__vector short)VEC_UNPACKHU(in7);
out0 = vec_sub(out0, pw_centerjsamp);
out1 = vec_sub(out1, pw_centerjsamp);
/* Constants */
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
__vector unsigned char shift_pack_index =
{0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+ __vector unsigned char shift_pack_index =
+ {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
row0 = vec_ld(0, workspace);
row1 = vec_ld(16, workspace);
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
#ifndef min
#define min(a,b) ((a) < (b) ? (a) : (b))
#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
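+/*
+ * VEC_LD() / VEC_ST():  vec_ld() and vec_st() require 16-byte-aligned
+ * addresses (the low-order address bits are ignored), whereas the VSX
+ * loads/stores used on little endian also accept unaligned addresses.
+ *
+ * VEC_UNPACKHU() / VEC_UNPACKLU():  zero-extend the first/last eight
+ * unsigned chars of a vector to unsigned shorts by merging with a zero
+ * vector.  The operand order differs between endiannesses so that the zero
+ * byte always lands in the most significant byte of each 16-bit element.
+ * Both macros assume that a zero vector named pb_zero is in scope at the
+ * point of use.
+ */
+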
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif