/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* Copyright (C) 2014, Jay Foad.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
- int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+ __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = {0};
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector int pd_onehalf = { __4X(ONE_HALF) },
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
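+ /* shift_pack_index selects the high-order 16 bits of each 32-bit sum when
+ * packing back down to shorts; those bytes sit at different offsets within
+ * each element on big and little endian, hence the two tables above. */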
while (--num_rows >= 0) {
inptr = *input_buf++;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+#if __BIG_ENDIAN__
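+ /* vec_ld() ignores the low-order bits of the address, so the big-endian
+ * path must realign unaligned input rows by hand.  The little-endian path
+ * uses unaligned VSX loads via VEC_LD() instead. */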
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
+ rgb0 = VEC_LD(0, tmpbuf);
+ rgb1 = VEC_LD(16, tmpbuf);
+ rgb2 = VEC_LD(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
+ rgb3 = VEC_LD(48, tmpbuf);
#endif
} else {
/* Fast path */
- rgb0 = vec_ld(0, inptr);
+ rgb0 = VEC_LD(0, inptr);
if (num_cols > 16)
- rgb1 = vec_ld(16, inptr);
+ rgb1 = VEC_LD(16, inptr);
if (num_cols > 32)
- rgb2 = vec_ld(32, inptr);
+ rgb2 = VEC_LD(32, inptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- rgb3 = vec_ld(48, inptr);
+ rgb3 = VEC_LD(48, inptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
- bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
- rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
- bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
- rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
- bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
- rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
- bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* Copyright (C) 2014, Jay Foad.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr;
- int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+ __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
rgbg0, rgbg1, rgbg2, rgbg3, y;
-#if RGB_PIXELSIZE == 4
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = {0};
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
while (--num_rows >= 0) {
inptr = *input_buf++;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr += 16) {
+#if __BIG_ENDIAN__
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
#endif
}
}
+#else
+ /* Little endian: use unaligned VSX loads; no alignment fix-up or
+ * temporary buffer is needed. */
+ rgb0 = vec_vsx_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_vsx_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- rg0 = (__vector signed short)vec_mergeh(pb_zero, rgbg0);
- bg0 = (__vector signed short)vec_mergel(pb_zero, rgbg0);
- rg1 = (__vector signed short)vec_mergeh(pb_zero, rgbg1);
- bg1 = (__vector signed short)vec_mergel(pb_zero, rgbg1);
- rg2 = (__vector signed short)vec_mergeh(pb_zero, rgbg2);
- bg2 = (__vector signed short)vec_mergel(pb_zero, rgbg2);
- rg3 = (__vector signed short)vec_mergeh(pb_zero, rgbg3);
- bg3 = (__vector signed short)vec_mergel(pb_zero, rgbg3);
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
this0 = vec_ld(0, inptr);
this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
- this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
outl = vec_add(this0e, this0o);
outl = vec_add(outl, pw_bias);
outl = vec_sr(outl, pw_one);
if (outcol > 8) {
next0 = vec_ld(16, inptr);
next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
- next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
outh = vec_add(next0e, next0o);
outh = vec_add(outh, pw_bias);
outh = vec_sr(outh, pw_one);
this0 = vec_ld(0, inptr0);
this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)vec_mergeh(pb_zero, this0);
- this0o = (__vector unsigned short)vec_mergel(pb_zero, this0);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
out0l = vec_add(this0e, this0o);
this1 = vec_ld(0, inptr1);
this1 = vec_perm(this1, this1, even_odd_index);
- this1e = (__vector unsigned short)vec_mergeh(pb_zero, this1);
- this1o = (__vector unsigned short)vec_mergel(pb_zero, this1);
+ this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+ this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
out1l = vec_add(this1e, this1o);
outl = vec_add(out0l, out1l);
if (outcol > 8) {
next0 = vec_ld(16, inptr0);
next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)vec_mergeh(pb_zero, next0);
- next0o = (__vector unsigned short)vec_mergel(pb_zero, next0);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
out0h = vec_add(next0e, next0o);
next1 = vec_ld(16, inptr1);
next1 = vec_perm(next1, next1, even_odd_index);
- next1e = (__vector unsigned short)vec_mergeh(pb_zero, next1);
- next1o = (__vector unsigned short)vec_mergel(pb_zero, next1);
+ next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+ next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
out1h = vec_add(next1e, next1o);
outh = vec_add(out0h, out1h);
JSAMPARRAY output_buf, int num_rows)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = out_width * RGB_PIXELSIZE, offset, num_cols;
+ int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
__vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3, out4;
+ __vector unsigned char rgb3;
#endif
__vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
pw_cj = { __8X(CENTERJSAMPLE) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
/* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- yl = (__vector signed short)vec_mergeh(pb_zero, y);
- yh = (__vector signed short)vec_mergel(pb_zero, y);
+ yl = (__vector signed short)VEC_UNPACKHU(y);
+ yh = (__vector signed short)VEC_UNPACKLU(y);
cb = vec_ld(0, inptr1);
- cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
- cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
cbl = vec_sub(cbl, pw_cj);
cbh = vec_sub(cbh, pw_cj);
cr = vec_ld(0, inptr2);
- crl = (__vector signed short)vec_mergeh(pb_zero, cr);
- crh = (__vector signed short)vec_mergel(pb_zero, cr);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
crl = vec_sub(crl, pw_cj);
crh = vec_sub(crh, pw_cj);
rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif
+#if __BIG_ENDIAN__
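+ /* vec_st() requires 16-byte-aligned addresses, so the big-endian path
+ * assembles unaligned output rows by hand.  The little-endian path uses
+ * unaligned VSX stores via VEC_ST() instead. */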
offset = (size_t)outptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
+ VEC_ST(rgb3, 48, tmpbuf);
#endif
memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
} else {
/* Fast path */
- vec_st(rgb0, 0, outptr);
+ VEC_ST(rgb0, 0, outptr);
if (num_cols > 16)
- vec_st(rgb1, 16, outptr);
+ VEC_ST(rgb1, 16, outptr);
if (num_cols > 32)
- vec_st(rgb2, 32, outptr);
+ VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- vec_st(rgb3, 48, outptr);
+ VEC_ST(rgb3, 48, outptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
}
}
}
JSAMPARRAY output_buf)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = output_width * RGB_PIXELSIZE, offset, num_cols, yloop;
+ int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
__vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr, edgel, edgeh, edges, out0, out1, out2, out3;
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3, out4;
+ __vector unsigned char rgb3;
#endif
__vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
pw_cj = { __8X(CENTERJSAMPLE) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+ shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+ even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+ odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
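+ /* even_index/odd_index zero-extend the even- and odd-numbered samples to
+ * 16 bits; the zero pad byte precedes the sample byte on big endian but
+ * follows it on little endian. */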
inptr0 = input_buf[0][in_row_group_ctr];
inptr1 = input_buf[1][in_row_group_ctr];
/* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
* support unsigned vectors.
*/
- cbl = (__vector signed short)vec_mergeh(pb_zero, cb);
- cbh = (__vector signed short)vec_mergel(pb_zero, cb);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
cbl = vec_sub(cbl, pw_cj);
cbh = vec_sub(cbh, pw_cj);
cr = vec_ld(0, inptr2);
- crl = (__vector signed short)vec_mergeh(pb_zero, cr);
- crh = (__vector signed short)vec_mergel(pb_zero, cr);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
crl = vec_sub(crl, pw_cj);
crh = vec_sub(crh, pw_cj);
rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif
+#if __BIG_ENDIAN__
offset = (size_t)outptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
#endif
}
} else {
+#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
+ VEC_ST(rgb3, 48, tmpbuf);
#endif
memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
} else {
/* Fast path */
- vec_st(rgb0, 0, outptr);
+ VEC_ST(rgb0, 0, outptr);
if (num_cols > 16)
- vec_st(rgb1, 16, outptr);
+ VEC_ST(rgb1, 16, outptr);
if (num_cols > 32)
- vec_st(rgb2, 32, outptr);
+ VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
- vec_st(rgb3, 48, outptr);
+ VEC_ST(rgb3, 48, outptr);
#endif
}
+#if __BIG_ENDIAN__
}
+#endif
}
}
}
last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
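+ /* merge_pack_index interleaves the low-order byte of each 16-bit value
+ * from two vectors; that byte sits at odd offsets on big endian and even
+ * offsets on little endian. */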
__vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
this0l = vec_mergeh(this0e, this0o);
this0h = vec_mergel(this0e, this0o);
- last0l = (__vector short)vec_mergeh(pb_zero, p_last0);
- last0h = (__vector short)vec_mergel(pb_zero, p_last0);
+ last0l = (__vector short)VEC_UNPACKHU(p_last0);
+ last0h = (__vector short)VEC_UNPACKLU(p_last0);
last0l = vec_add(last0l, pw_one);
- next0l = (__vector short)vec_mergeh(pb_zero, p_next0);
- next0h = (__vector short)vec_mergel(pb_zero, p_next0);
+ next0l = (__vector short)VEC_UNPACKHU(p_next0);
+ next0h = (__vector short)VEC_UNPACKLU(p_next0);
next0l = vec_add(next0l, pw_two);
outle = vec_add(this0l, last0l);
last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+ merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
__vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
__vector unsigned short pw_four = { __8X(4) };
}
this0 = vec_ld(0, inptr0);
- this0l = (__vector short)vec_mergeh(pb_zero, this0);
- this0h = (__vector short)vec_mergel(pb_zero, this0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
this0l = vec_mladd(this0l, pw_three, pw_zero);
this0h = vec_mladd(this0h, pw_three, pw_zero);
this_1 = vec_ld(0, inptr_1);
- this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
- this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
thiscolsum_1l = vec_add(this0l, this_1l);
thiscolsum_1h = vec_add(this0h, this_1h);
lastcolsum_1h = thiscolsum_1h;
p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
this1 = vec_ld(0, inptr1);
- this1l = (__vector short)vec_mergeh(pb_zero, this1);
- this1h = (__vector short)vec_mergel(pb_zero, this1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
thiscolsum1l = vec_add(this0l, this1l);
thiscolsum1h = vec_add(this0h, this1h);
lastcolsum1h = thiscolsum1h;
next_index_lastcol);
} else {
this0 = vec_ld(16, inptr0);
- this0l = (__vector short)vec_mergeh(pb_zero, this0);
- this0h = (__vector short)vec_mergel(pb_zero, this0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
this0l = vec_mladd(this0l, pw_three, pw_zero);
this0h = vec_mladd(this0h, pw_three, pw_zero);
this_1 = vec_ld(16, inptr_1);
- this_1l = (__vector short)vec_mergeh(pb_zero, this_1);
- this_1h = (__vector short)vec_mergel(pb_zero, this_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
nextcolsum_1l = vec_add(this0l, this_1l);
nextcolsum_1h = vec_add(this0h, this_1h);
p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
this1 = vec_ld(16, inptr1);
- this1l = (__vector short)vec_mergeh(pb_zero, this1);
- this1h = (__vector short)vec_mergel(pb_zero, this1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
nextcolsum1l = vec_add(this0l, this1l);
nextcolsum1h = vec_add(this0h, this1h);
p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* always get the data we want by using a single vector load (although we may
* have to permute the result).
*/
+#if __BIG_ENDIAN__
+
#define LOAD_ROW(row) { \
elemptr = sample_data[row] + start_col; \
in##row = vec_ld(0, elemptr); \
in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}
+#else
+
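+/* On little endian, vec_vsx_ld() handles unaligned addresses directly, so
+ * the vec_lvsl()/vec_perm() realignment used above is unnecessary. */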
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
LOAD_ROW(6);
LOAD_ROW(7);
- out0 = (__vector short)vec_mergeh(pb_zero, in0);
- out1 = (__vector short)vec_mergeh(pb_zero, in1);
- out2 = (__vector short)vec_mergeh(pb_zero, in2);
- out3 = (__vector short)vec_mergeh(pb_zero, in3);
- out4 = (__vector short)vec_mergeh(pb_zero, in4);
- out5 = (__vector short)vec_mergeh(pb_zero, in5);
- out6 = (__vector short)vec_mergeh(pb_zero, in6);
- out7 = (__vector short)vec_mergeh(pb_zero, in7);
+ out0 = (__vector short)VEC_UNPACKHU(in0);
+ out1 = (__vector short)VEC_UNPACKHU(in1);
+ out2 = (__vector short)VEC_UNPACKHU(in2);
+ out3 = (__vector short)VEC_UNPACKHU(in3);
+ out4 = (__vector short)VEC_UNPACKHU(in4);
+ out5 = (__vector short)VEC_UNPACKHU(in5);
+ out6 = (__vector short)VEC_UNPACKHU(in6);
+ out7 = (__vector short)VEC_UNPACKHU(in7);
out0 = vec_sub(out0, pw_centerjsamp);
out1 = vec_sub(out1, pw_centerjsamp);
/* Constants */
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
__vector unsigned char shift_pack_index =
{0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+ __vector unsigned char shift_pack_index =
+ {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
row0 = vec_ld(0, workspace);
row1 = vec_ld(16, workspace);
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014-2015, D. R. Commander.
* All rights reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
#ifndef min
#define min(a,b) ((a) < (b) ? (a) : (b))
#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
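+/*
+ * VEC_LD() / VEC_ST():  vec_ld() and vec_st() require 16-byte-aligned
+ * addresses (the low-order address bits are ignored), whereas the VSX
+ * loads/stores used on little endian also accept unaligned addresses.
+ *
+ * VEC_UNPACKHU() / VEC_UNPACKLU():  zero-extend the first/last eight
+ * unsigned chars of a vector to unsigned shorts by merging with a zero
+ * vector.  The operand order differs between endiannesses so that the zero
+ * byte always lands in the most significant byte of each 16-bit element.
+ * Both macros assume that a zero vector named pb_zero is in scope at the
+ * point of use.
+ */
+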
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif