Moved OpenCL kernels to a separate header file.

author dirk <dirk@git.imagemagick.org>

Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)

committer dirk <dirk@git.imagemagick.org>

Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)
author dirk <dirk@git.imagemagick.org>
Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)
committer dirk <dirk@git.imagemagick.org>
Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)
diff --git a/MagickCore/accelerate-kernels-private.h b/MagickCore/accelerate-kernels-private.h

new file mode 100644 (file)

index 0000000..f94ef40
--- /dev/null
+++ b/MagickCore/accelerate-kernels-private.h
@@ -0,0 +1,3263 @@
+/*
+  Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
+  dedicated to making software imaging solutions freely available.
+  
+  You may not use this file except in compliance with the License.
+  obtain a copy of the License at
+  
+    http://www.imagemagick.org/script/license.php
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+  MagickCore private kernels for accelerated functions.
+*/
+
+#ifndef _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
+#define _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(MAGICKCORE_OPENCL_SUPPORT)
+
+/*
+  Define declarations.
+*/
+#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
+#define OPENCL_ELIF(...)       "\n #""elif " #__VA_ARGS__ " \n"
+#define OPENCL_ELSE()          "\n #""else " " \n"
+#define OPENCL_ENDIF()         "\n #""endif " " \n"
+#define OPENCL_IF(...)         "\n #""if " #__VA_ARGS__ " \n"
+#define STRINGIFY(...) #__VA_ARGS__ "\n"
+
+/*
+  Typedef declarations.
+*/
+
+typedef struct _FloatPixelPacket
+{
+  MagickRealType
+    red,
+    green,
+    blue,
+    alpha,
+    black;
+} FloatPixelPacket;
+
+const char *accelerateKernels =
+
+/*
+  Define declarations.
+*/
+  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
+  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
+  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
+  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
+  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
+  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
+  OPENCL_DEFINE(SigmaRandom, (attenuate))
+  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
+  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
+  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
+
+/*
+  Typedef declarations.
+*/
+  STRINGIFY(
+    typedef enum
+  {
+    UndefinedColorspace,
+    RGBColorspace,            /* Linear RGB colorspace */
+    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
+    TransparentColorspace,
+    OHTAColorspace,
+    LabColorspace,
+    XYZColorspace,
+    YCbCrColorspace,
+    YCCColorspace,
+    YIQColorspace,
+    YPbPrColorspace,
+    YUVColorspace,
+    CMYKColorspace,           /* negared linear RGB with black separated */
+    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
+    HSBColorspace,
+    HSLColorspace,
+    HWBColorspace,
+    Rec601LumaColorspace,
+    Rec601YCbCrColorspace,
+    Rec709LumaColorspace,
+    Rec709YCbCrColorspace,
+    LogColorspace,
+    CMYColorspace,            /* negated linear RGB colorspace */
+    LuvColorspace,
+    HCLColorspace,
+    LCHColorspace,            /* alias for LCHuv */
+    LMSColorspace,
+    LCHabColorspace,          /* Cylindrical (Polar) Lab */
+    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
+    scRGBColorspace,
+    HSIColorspace,
+    HSVColorspace,            /* alias for HSB */
+    HCLpColorspace,
+    YDbDrColorspace
+  } ColorspaceType;
+  )
+
+  STRINGIFY(
+    typedef enum
+    {
+      UndefinedCompositeOp,
+      NoCompositeOp,
+      ModulusAddCompositeOp,
+      AtopCompositeOp,
+      BlendCompositeOp,
+      BumpmapCompositeOp,
+      ChangeMaskCompositeOp,
+      ClearCompositeOp,
+      ColorBurnCompositeOp,
+      ColorDodgeCompositeOp,
+      ColorizeCompositeOp,
+      CopyBlackCompositeOp,
+      CopyBlueCompositeOp,
+      CopyCompositeOp,
+      CopyCyanCompositeOp,
+      CopyGreenCompositeOp,
+      CopyMagentaCompositeOp,
+      CopyOpacityCompositeOp,
+      CopyRedCompositeOp,
+      CopyYellowCompositeOp,
+      DarkenCompositeOp,
+      DstAtopCompositeOp,
+      DstCompositeOp,
+      DstInCompositeOp,
+      DstOutCompositeOp,
+      DstOverCompositeOp,
+      DifferenceCompositeOp,
+      DisplaceCompositeOp,
+      DissolveCompositeOp,
+      ExclusionCompositeOp,
+      HardLightCompositeOp,
+      HueCompositeOp,
+      InCompositeOp,
+      LightenCompositeOp,
+      LinearLightCompositeOp,
+      LuminizeCompositeOp,
+      MinusDstCompositeOp,
+      ModulateCompositeOp,
+      MultiplyCompositeOp,
+      OutCompositeOp,
+      OverCompositeOp,
+      OverlayCompositeOp,
+      PlusCompositeOp,
+      ReplaceCompositeOp,
+      SaturateCompositeOp,
+      ScreenCompositeOp,
+      SoftLightCompositeOp,
+      SrcAtopCompositeOp,
+      SrcCompositeOp,
+      SrcInCompositeOp,
+      SrcOutCompositeOp,
+      SrcOverCompositeOp,
+      ModulusSubtractCompositeOp,
+      ThresholdCompositeOp,
+      XorCompositeOp,
+      /* These are new operators, added after the above was last sorted.
+       * The list should be re-sorted only when a new library version is
+       * created.
+       */
+      DivideDstCompositeOp,
+      DistortCompositeOp,
+      BlurCompositeOp,
+      PegtopLightCompositeOp,
+      VividLightCompositeOp,
+      PinLightCompositeOp,
+      LinearDodgeCompositeOp,
+      LinearBurnCompositeOp,
+      MathematicsCompositeOp,
+      DivideSrcCompositeOp,
+      MinusSrcCompositeOp,
+      DarkenIntensityCompositeOp,
+      LightenIntensityCompositeOp
+    } CompositeOperator;
+  )
+
+  STRINGIFY(
+     typedef enum
+     {
+       UndefinedFunction,
+       ArcsinFunction,
+       ArctanFunction,
+       PolynomialFunction,
+       SinusoidFunction
+     } MagickFunction;
+  )
+
+  STRINGIFY(
+    typedef enum
+    {
+      UndefinedNoise,
+      UniformNoise,
+      GaussianNoise,
+      MultiplicativeGaussianNoise,
+      ImpulseNoise,
+      LaplacianNoise,
+      PoissonNoise,
+      RandomNoise
+    } NoiseType;
+  )
+
+  STRINGIFY(
+    typedef enum
+  {
+    UndefinedPixelIntensityMethod = 0,
+    AveragePixelIntensityMethod,
+    BrightnessPixelIntensityMethod,
+    LightnessPixelIntensityMethod,
+    Rec601LumaPixelIntensityMethod,
+    Rec601LuminancePixelIntensityMethod,
+    Rec709LumaPixelIntensityMethod,
+    Rec709LuminancePixelIntensityMethod,
+    RMSPixelIntensityMethod,
+    MSPixelIntensityMethod
+  } PixelIntensityMethod;
+  )
+
+  STRINGIFY(
+  typedef enum {
+    BoxWeightingFunction = 0,
+    TriangleWeightingFunction,
+    CubicBCWeightingFunction,
+    HannWeightingFunction,
+    HammingWeightingFunction,
+    BlackmanWeightingFunction,
+    GaussianWeightingFunction,
+    QuadraticWeightingFunction,
+    JincWeightingFunction,
+    SincWeightingFunction,
+    SincFastWeightingFunction,
+    KaiserWeightingFunction,
+    WelshWeightingFunction,
+    BohmanWeightingFunction,
+    LagrangeWeightingFunction,
+    CosineWeightingFunction,
+  } ResizeWeightingFunctionType;
+  )
+
+  STRINGIFY(
+     typedef enum
+     {
+       UndefinedChannel = 0x0000,
+       RedChannel = 0x0001,
+       GrayChannel = 0x0001,
+       CyanChannel = 0x0001,
+       GreenChannel = 0x0002,
+       MagentaChannel = 0x0002,
+       BlueChannel = 0x0004,
+       YellowChannel = 0x0004,
+       BlackChannel = 0x0008,
+       AlphaChannel = 0x0010,
+       OpacityChannel = 0x0010,
+       IndexChannel = 0x0020,             /* Color Index Table? */
+       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
+       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
+       MetaChannel = 0x0100,              /* ???? */
+       CompositeChannels = 0x002F,
+       AllChannels = 0x7ffffff,
+       /*
+         Special purpose channel types.
+         FUTURE: are these needed any more - they are more like hacks
+         SyncChannels for example is NOT a real channel but a 'flag'
+         It really says -- "User has not defined channels"
+         Though it does have extra meaning in the "-auto-level" operator
+       */
+       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
+       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
+       GrayChannels = 0x0400,
+       SyncChannels = 0x20000,    /* channels modified as a single unit */
+       DefaultChannels = AllChannels
+     } ChannelType;  /* must correspond to PixelChannel */
+  )
+
+/*
+  Helper functions.
+*/
+
+OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      return((CLQuantum) value);
+    }
+  )
+
+OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      return((CLQuantum) (257.0f*value));
+    }
+  )
+
+OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      return((CLQuantum) (16843009.0*value));
+    }
+  )
+
+OPENCL_ENDIF()
+
+  STRINGIFY(
+    inline int ClampToCanvas(const int offset,const int range)
+      {
+        return clamp(offset, (int)0, range-1);
+      }
+  )
+
+  STRINGIFY(
+    inline CLQuantum ClampToQuantum(const float value)
+      {
+        return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
+      }
+  )
+
+  STRINGIFY(
+    inline uint ScaleQuantumToMap(CLQuantum value)
+      {
+        if (value >= (CLQuantum) MaxMap)
+          return ((uint)MaxMap);
+        else 
+          return ((uint)value);
+      }
+  )
+
+  STRINGIFY(
+    inline float PerceptibleReciprocal(const float x)
+    {
+      float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
+      return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
+    }
+  )
+
+  STRINGIFY(
+  inline float RoundToUnity(const float value)
+   {
+     return clamp(value,0.0f,1.0f);
+   }
+  )
+
+  STRINGIFY(
+
+  inline unsigned int getPixelIndex(const unsigned int number_channels,
+    const unsigned int columns, const unsigned int x, const unsigned int y)
+  {
+    return (x * number_channels) + (y * columns * number_channels);
+  }
+
+  inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
+  inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
+  inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
+  inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
+
+  inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
+  inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
+  inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
+  inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
+
+  inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
+  inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
+  inline float getBlueF4(float4 p)                      { return p.x; }
+  inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
+
+  inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
+  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
+  inline float getGreenF4(float4 p)                     { return p.y; }
+  inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
+
+  inline CLQuantum getRed(CLPixelType p)                { return p.z; }
+  inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
+  inline float getRedF4(float4 p)                       { return p.z; }
+  inline void setRedF4(float4* p, float value)          { (*p).z = value; }
+
+  inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
+  inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
+  inline float getAlphaF4(float4 p)                     { return p.w; }
+  inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
+
+  inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
+    const ChannelType channel, float *red, float *green, float *blue, float *alpha)
+  {
+    if ((channel & RedChannel) != 0)
+      *red=getPixelRed(p);
+
+    if (number_channels > 2)
+      {
+        if ((channel & GreenChannel) != 0)
+          *green=getPixelGreen(p);
+
+        if ((channel & BlueChannel) != 0)
+          *blue=getPixelBlue(p);
+      }
+
+    if (((number_channels == 4) || (number_channels == 2)) &&
+        ((channel & AlphaChannel) != 0))
+      *alpha=getPixelAlpha(p,number_channels);
+  }
+
+  inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
+    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
+  {
+    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
+
+    float red = 0.0f;
+    float green = 0.0f;
+    float blue = 0.0f;
+    float alpha = 0.0f;
+
+    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
+    return (float4)(red, green, blue, alpha);
+  }
+
+  inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
+    const ChannelType channel, float red, float green, float blue, float alpha)
+  {
+    if ((channel & RedChannel) != 0)
+      setPixelRed(p,ClampToQuantum(red));
+
+    if (number_channels > 2)
+      {
+        if ((channel & GreenChannel) != 0)
+          setPixelGreen(p,ClampToQuantum(green));
+
+        if ((channel & BlueChannel) != 0)
+          setPixelBlue(p,ClampToQuantum(blue));
+      }
+
+    if (((number_channels == 4) || (number_channels == 2)) &&
+        ((channel & AlphaChannel) != 0))
+      setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
+  }
+
+  inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
+    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
+    float4 pixel)
+  {
+    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
+    WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
+  }
+
+  inline float GetPixelIntensity(const unsigned int colorspace,
+    const unsigned int method,float red,float green,float blue)
+  {
+    float intensity;
+
+    if (colorspace == GRAYColorspace)
+      return red;
+
+    switch (method)
+    {
+      case AveragePixelIntensityMethod:
+        {
+          intensity=(red+green+blue)/3.0;
+          break;
+        }
+      case BrightnessPixelIntensityMethod:
+        {
+          intensity=MagickMax(MagickMax(red,green),blue);
+          break;
+        }
+      case LightnessPixelIntensityMethod:
+        {
+          intensity=(MagickMin(MagickMin(red,green),blue)+
+              MagickMax(MagickMax(red,green),blue))/2.0;
+          break;
+        }
+      case MSPixelIntensityMethod:
+        {
+          intensity=(float) (((float) red*red+green*green+blue*blue)/
+              (3.0*QuantumRange));
+          break;
+        }
+      case Rec601LumaPixelIntensityMethod:
+        {
+          /*
+          if (image->colorspace == RGBColorspace)
+          {
+            red=EncodePixelGamma(red);
+            green=EncodePixelGamma(green);
+            blue=EncodePixelGamma(blue);
+          }
+          */
+          intensity=0.298839*red+0.586811*green+0.114350*blue;
+          break;
+        }
+      case Rec601LuminancePixelIntensityMethod:
+        {
+          /*
+          if (image->colorspace == sRGBColorspace)
+          {
+            red=DecodePixelGamma(red);
+            green=DecodePixelGamma(green);
+            blue=DecodePixelGamma(blue);
+          }
+          */
+          intensity=0.298839*red+0.586811*green+0.114350*blue;
+          break;
+        }
+      case Rec709LumaPixelIntensityMethod:
+      default:
+        {
+          /*
+          if (image->colorspace == RGBColorspace)
+          {
+            red=EncodePixelGamma(red);
+            green=EncodePixelGamma(green);
+            blue=EncodePixelGamma(blue);
+          }
+          */
+          intensity=0.212656*red+0.715158*green+0.072186*blue;
+          break;
+        }
+      case Rec709LuminancePixelIntensityMethod:
+        {
+          /*
+          if (image->colorspace == sRGBColorspace)
+          {
+            red=DecodePixelGamma(red);
+            green=DecodePixelGamma(green);
+            blue=DecodePixelGamma(blue);
+          }
+          */
+          intensity=0.212656*red+0.715158*green+0.072186*blue;
+          break;
+        }
+      case RMSPixelIntensityMethod:
+        {
+          intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
+              sqrt(3.0));
+          break;
+        }
+    }
+
+    return intensity;
+  }
+
+  inline int mirrorBottom(int value)
+  {
+      return (value < 0) ? - (value) : value;
+  }
+
+  inline int mirrorTop(int value, int width)
+  {
+      return (value >= width) ? (2 * width - value - 1) : value;
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A d d N o i s e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  /*
+  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
+  This is provided under BSD, full license is with the main package.
+  See http://www.doc.ic.ac.uk/~dt10/research
+  */
+
+  // Pre: a<M, b<M
+  // Post: r=(a+b) mod M
+  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
+  {
+    ulong v=a+b;
+    //if( (v>=M) || (v<a) )
+    if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
+      v=v-M;
+    return v;
+  }
+
+  // Pre: a<M,b<M
+  // Post: r=(a*b) mod M
+  // This could be done more efficently, but it is portable, and should
+  // be easy to understand. It can be replaced with any of the better
+  // modular multiplication algorithms (for example if you know you have
+  // double precision available or something).
+  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
+  {
+    ulong r=0;
+    while(a!=0){
+      if(a&1)
+        r=MWC_AddMod64(r,b,M);
+      b=MWC_AddMod64(b,b,M);
+      a=a>>1;
+    }
+    return r;
+  }
+
+  // Pre: a<M, e>=0
+  // Post: r=(a^b) mod M
+  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
+  // most architectures
+  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
+  {
+    ulong sqr=a, acc=1;
+    while(e!=0){
+      if(e&1)
+        acc=MWC_MulMod64(acc,sqr,M);
+        sqr=MWC_MulMod64(sqr,sqr,M);
+      e=e>>1;
+    }
+    return acc;
+  }
+
+  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
+  {
+    ulong m=MWC_PowMod64(A, distance, M);
+    ulong x=curr.x*(ulong)A+curr.y;
+    x=MWC_MulMod64(x, m, M);
+    return (uint2)((uint)(x/A), (uint)(x%A));
+  }
+
+  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
+  {
+    // This is an arbitrary constant for starting LCG jumping from. I didn't
+    // want to start from 1, as then you end up with the two or three first values
+    // being a bit poor in ones - once you've decided that, one constant is as
+    // good as any another. There is no deep mathematical reason for it, I just
+    // generated a random number.
+    enum{ MWC_BASEID = 4077358422479273989UL };
+
+    ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
+    ulong m=MWC_PowMod64(A, dist, M);
+
+    ulong x=MWC_MulMod64(MWC_BASEID, m, M);
+    return (uint2)((uint)(x/A), (uint)(x%A));
+  }
+
+  //! Represents the state of a particular generator
+  typedef struct{ uint x; uint c; } mwc64x_state_t;
+
+  enum{ MWC64X_A = 4294883355U };
+  enum{ MWC64X_M = 18446383549859758079UL };
+
+  void MWC64X_Step(mwc64x_state_t *s)
+  {
+    uint X=s->x, C=s->c;
+
+    uint Xn=MWC64X_A*X+C;
+    uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
+    uint Cn=mad_hi(MWC64X_A,X,carry);
+
+    s->x=Xn;
+    s->c=Cn;
+  }
+
+  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
+  {
+    uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
+    s->x=tmp.x;
+    s->c=tmp.y;
+  }
+
+  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
+  {
+    uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
+    s->x=tmp.x;
+    s->c=tmp.y;
+  }
+
+  //! Return a 32-bit integer in the range [0..2^32)
+  uint MWC64X_NextUint(mwc64x_state_t *s)
+  {
+    uint res=s->x ^ s->c;
+    MWC64X_Step(s);
+    return res;
+  }
+
+  //
+  // End of MWC64X excerpt
+  //
+
+  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
+  {
+    return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
+  }
+
+  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
+  {
+    float
+      alpha,
+      beta,
+      noise,
+      sigma;
+
+    noise = 0.0f;
+    alpha=mwcReadPseudoRandomValue(r);
+    switch(noise_type)
+    {
+      case UniformNoise:
+      default:
+        {
+          noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
+          break;
+        }
+      case GaussianNoise:
+        {
+          float
+            gamma,
+            tau;
+
+          if (alpha == 0.0f)
+            alpha=1.0f;
+          beta=mwcReadPseudoRandomValue(r);
+          gamma=sqrt(-2.0f*log(alpha));
+          sigma=gamma*cospi((2.0f*beta));
+          tau=gamma*sinpi((2.0f*beta));
+          noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
+          break;
+        }
+      case ImpulseNoise:
+      {
+        if (alpha < (SigmaImpulse/2.0f))
+          noise=0.0f;
+        else
+          if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
+            noise=QuantumRange;
+          else
+            noise=pixel;
+        break;
+      }
+      case LaplacianNoise:
+      {
+        if (alpha <= 0.5f)
+          {
+            if (alpha <= MagickEpsilon)
+              noise=(pixel-QuantumRange);
+            else
+              noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
+                0.5f);
+            break;
+          }
+        beta=1.0f-alpha;
+        if (beta <= (0.5f*MagickEpsilon))
+          noise=(pixel+QuantumRange);
+        else
+          noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
+        break;
+      }
+      case MultiplicativeGaussianNoise:
+      {
+        sigma=1.0f;
+        if (alpha > MagickEpsilon)
+          sigma=sqrt(-2.0f*log(alpha));
+        beta=mwcReadPseudoRandomValue(r);
+        noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
+          cospi((2.0f*beta))/2.0f);
+        break;
+      }
+      case PoissonNoise:
+      {
+        float 
+          poisson;
+        unsigned int i;
+        poisson=exp(-SigmaPoisson*QuantumScale*pixel);
+        for (i=0; alpha > poisson; i++)
+        {
+          beta=mwcReadPseudoRandomValue(r);
+          alpha*=beta;
+        }
+        noise=(QuantumRange*i/SigmaPoisson);
+        break;
+      }
+      case RandomNoise:
+      {
+        noise=(QuantumRange*SigmaRandom*alpha);
+        break;
+      }
+    }
+    return noise;
+  }
+
+  __kernel
+  void AddNoise(const __global CLQuantum *image,
+    const unsigned int number_channels,const ChannelType channel,
+    const unsigned int length,const unsigned int pixelsPerWorkItem,
+    const NoiseType noise_type,const float attenuate,const unsigned int seed0,
+    const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
+    __global CLQuantum *filteredImage)
+  {
+    mwc64x_state_t rng;
+    rng.x = seed0;
+    rng.c = seed1;
+
+    uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
+    uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
+    MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
+
+    uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
+    uint count = pixelsPerWorkItem;
+
+    while (count > 0 && pos < length)
+    {
+      const __global CLQuantum *p = image + pos;
+      __global CLQuantum *q = filteredImage + pos;
+
+      float red;
+      float green;
+      float blue;
+      float alpha;
+
+      ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
+
+      if ((channel & RedChannel) != 0)
+        red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
+
+      if (number_channels > 2)
+      {
+        if ((channel & GreenChannel) != 0)
+          green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
+
+        if ((channel & BlueChannel) != 0)
+          blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
+      }
+
+      if (((number_channels == 4) || (number_channels == 2)) &&
+          ((channel & AlphaChannel) != 0))
+        alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
+
+      WriteChannels(q, number_channels, channel, red, green, blue, alpha);
+
+      pos += (get_local_size(0) * number_channels);
+      count--;
+    }
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    B l u r                                                                  %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  /*
+  Reduce image noise and reduce detail levels by row
+  */
+  __kernel void BlurRow(const __global CLQuantum *image,
+    const unsigned int number_channels,const ChannelType channel,
+    __constant float *filter,const unsigned int width,
+    const unsigned int imageColumns,const unsigned int imageRows,
+    __local float4 *temp,__global float4 *tempImage)
+  {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int columns = imageColumns;
+
+    const unsigned int radius = (width-1)/2;
+    const int wsize = get_local_size(0);
+    const unsigned int loadSize = wsize+width;
+
+    //group coordinate
+    const int groupX=get_local_size(0)*get_group_id(0);
+
+    //parallel load and clamp
+    for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
+    {
+      int cx = ClampToCanvas(i + groupX - radius, columns);
+      temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
+    }
+
+    // barrier
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // only do the work if this is not a patched item
+    if (get_global_id(0) < columns)
+    {
+      // compute
+      float4 result = (float4) 0;
+
+      int i = 0;
+
+      for ( ; i+7 < width; )
+      {
+        for (int j=0; j < 8; j++)
+          result+=filter[i+j]*temp[i+j+get_local_id(0)];
+        i+=8;
+      }
+
+      for ( ; i < width; i++)
+        result+=filter[i]*temp[i+get_local_id(0)];
+
+      // write back to global
+      tempImage[y*columns+x] = result;
+    }
+  }
+  )
+
+  STRINGIFY(
+  /*
+  Reduce image noise and reduce detail levels by line
+  */
+  __kernel void BlurColumn(const __global float4 *blurRowData,
+    const unsigned int number_channels,const ChannelType channel,
+    __constant float *filter,const unsigned int width,
+    const unsigned int imageColumns,const unsigned int imageRows,
+    __local float4 *temp,__global CLQuantum *filteredImage)
+  {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int columns = imageColumns;
+    const int rows = imageRows;
+
+    unsigned int radius = (width-1)/2;
+    const int wsize = get_local_size(1);
+    const unsigned int loadSize = wsize+width;
+
+    //group coordinate
+    const int groupX=get_local_size(0)*get_group_id(0);
+    const int groupY=get_local_size(1)*get_group_id(1);
+    //notice that get_local_size(0) is 1, so
+    //groupX=get_group_id(0);
+
+    //parallel load and clamp
+    for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+      temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
+
+    // barrier
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // only do the work if this is not a patched item
+    if (get_global_id(1) < rows)
+    {
+      // compute
+      float4 result = (float4) 0;
+
+      int i = 0;
+
+      for ( ; i+7 < width; )
+      {
+        for (int j=0; j < 8; j++)
+          result+=filter[i+j]*temp[i+j+get_local_id(1)];
+        i+=8;
+      }
+
+      for ( ; i < width; i++)
+        result+=filter[i]*temp[i+get_local_id(1)];
+
+      // write back to global
+      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
+    }
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o m p o s i t e                                                        %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+    inline float ColorDodge(const float Sca,
+      const float Sa,const float Dca,const float Da)
+    {
+      /*
+        Oct 2004 SVG specification.
+      */
+      if ((Sca*Da+Dca*Sa) >= Sa*Da)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
+
+
+      /*
+        New specification, March 2009 SVG specification.  This specification was
+        also wrong of non-overlap cases.
+      */
+      /*
+      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
+        return(Sca*(1.0-Da));
+      if (fabs(Sca-Sa) < MagickEpsilon)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
+      */
+
+      /*
+        Working from first principles using the original formula:
+
+           f(Sc,Dc) = Dc/(1-Sc)
+
+        This works correctly! Looks like the 2004 model was right but just
+        required a extra condition for correct handling.
+      */
+
+      /*
+      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
+        return(Sca*(1.0-Da)+Dca*(1.0-Sa));
+      if (fabs(Sca-Sa) < MagickEpsilon)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      */
+    }
+
+    inline void CompositeColorDodge(const float4 *p,
+      const float4 *q,float4 *composite) {
+
+      float 
+      Da,
+      gamma,
+      Sa;
+
+      Sa=QuantumScale*getAlphaF4(*p);  /* simplify and speed up equations */
+      Da=QuantumScale*getAlphaF4(*q);
+      gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
+      setAlphaF4(composite,QuantumRange*gamma);
+      gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
+      setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
+        getRedF4(*q)*Da,Da));
+      setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
+        getGreenF4(*q)*Da,Da));
+      setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
+        getBlueF4(*q)*Da,Da));
+    }
+  )
+
+  STRINGIFY(
+    inline void MagickPixelCompositePlus(const float4 *p,
+      const float alpha,const float4 *q,
+      const float beta,float4 *composite)
+    {
+      float 
+        gamma;
+
+      float
+        Da,
+        Sa;
+      /*
+        Add two pixels with the given opacities.
+      */
+      Sa=QuantumScale*alpha;
+      Da=QuantumScale*beta;
+      gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
+      setAlphaF4(composite,QuantumRange*gamma);
+      gamma=PerceptibleReciprocal(gamma);
+      setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
+      setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
+      setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
+    }
+  )
+
+  STRINGIFY(
+    inline void MagickPixelCompositeBlend(const float4 *p,
+      const float alpha,const float4 *q,
+      const float beta,float4 *composite)
+    {
+      MagickPixelCompositePlus(p,(float) (alpha*
+      (getAlphaF4(*p))),q,(float) (beta*
+      (getAlphaF4(*q))),composite);
+    }
+  )
+  
+  STRINGIFY(
+    __kernel 
+    void Composite(__global CLPixelType *image,
+                   const unsigned int imageWidth, 
+                   const unsigned int imageHeight,
+                   const __global CLPixelType *compositeImage,
+                   const unsigned int compositeWidth, 
+                   const unsigned int compositeHeight,
+                   const unsigned int compose,
+                   const ChannelType channel, 
+                   const unsigned int matte,
+                   const float destination_dissolve,
+                   const float source_dissolve) {
+
+      uint2 index;
+      index.x = get_global_id(0);
+      index.y = get_global_id(1);
+
+
+      if (index.x >= imageWidth
+        || index.y >= imageHeight) {
+          return;
+      }
+      const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
+      float4 destination;
+      setRedF4(&destination,getRed(inputPixel));
+      setGreenF4(&destination,getGreen(inputPixel));
+      setBlueF4(&destination,getBlue(inputPixel));
+
+      
+      const CLPixelType compositePixel 
+        = compositeImage[index.y*imageWidth+index.x];
+      float4 source;
+      setRedF4(&source,getRed(compositePixel));
+      setGreenF4(&source,getGreen(compositePixel));
+      setBlueF4(&source,getBlue(compositePixel));
+
+      if (matte != 0) {
+        setAlphaF4(&destination,getAlpha(inputPixel));
+        setAlphaF4(&source,getAlpha(compositePixel));
+      }
+      else {
+        setAlphaF4(&destination,1.0f);
+        setAlphaF4(&source,1.0f);
+      }
+
+      float4 composite=destination;
+
+      CompositeOperator op = (CompositeOperator)compose;
+      switch (op) {
+      case ColorDodgeCompositeOp:
+        CompositeColorDodge(&source,&destination,&composite);
+        break;
+      case BlendCompositeOp:
+        MagickPixelCompositeBlend(&source,source_dissolve,&destination,
+            destination_dissolve,&composite);
+        break;
+      default:
+        // unsupported operators
+        break;
+      };
+
+      CLPixelType outputPixel;
+      setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
+      setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
+      setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
+      setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite)));
+      image[index.y*imageWidth+index.x] = outputPixel;
+    }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n t r a s t                                                          %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+
+  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
+    float3 HueSaturationBrightness;
+    HueSaturationBrightness.x = 0.0f; // Hue
+    HueSaturationBrightness.y = 0.0f; // Saturation
+    HueSaturationBrightness.z = 0.0f; // Brightness
+
+    float r=(float) getRed(pixel);
+    float g=(float) getGreen(pixel);
+    float b=(float) getBlue(pixel);
+
+    float tmin=MagickMin(MagickMin(r,g),b);
+    float tmax=MagickMax(MagickMax(r,g),b);
+
+    if (tmax!=0.0f) {
+      float delta=tmax-tmin;
+      HueSaturationBrightness.y=delta/tmax;
+      HueSaturationBrightness.z=QuantumScale*tmax;
+
+      if (delta != 0.0f) {
+  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
+  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
+        HueSaturationBrightness.x/=6.0f;
+        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
+      }
+    }
+    return HueSaturationBrightness;
+  }
+
+  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
+
+    float hue = HueSaturationBrightness.x;
+    float brightness = HueSaturationBrightness.z;
+    float saturation = HueSaturationBrightness.y;
+   
+    CLPixelType rgb;
+
+    if (saturation == 0.0f) {
+      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
+      setGreen(&rgb,getRed(rgb));
+      setBlue(&rgb,getRed(rgb));
+    }
+    else {
+
+      float h=6.0f*(hue-floor(hue));
+      float f=h-floor(h);
+      float p=brightness*(1.0f-saturation);
+      float q=brightness*(1.0f-saturation*f);
+      float t=brightness*(1.0f-(saturation*(1.0f-f)));
+ 
+      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
+      float clamped_t = ClampToQuantum(QuantumRange*t);
+      float clamped_p = ClampToQuantum(QuantumRange*p);
+      float clamped_q = ClampToQuantum(QuantumRange*q);
+      int ih = (int)h;
+      setRed(&rgb, (ih == 1)?clamped_q:
+        (ih == 2 || ih == 3)?clamped_p:
+        (ih == 4)?clamped_t:
+                 clampedBrightness);
+ 
+      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
+        (ih == 3)?clamped_q:
+        (ih == 4 || ih == 5)?clamped_p:
+                 clamped_t);
+
+      setBlue(&rgb, (ih == 2)?clamped_t:
+        (ih == 3 || ih == 4)?clampedBrightness:
+        (ih == 5)?clamped_q:
+                 clamped_p);
+    }
+    return rgb;
+  }
+
+  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
+  {
+
+    const int sign = sharpen!=0?1:-1;
+    const int x = get_global_id(0);  
+    const int y = get_global_id(1);
+    const int columns = get_global_size(0);
+    const int c = x + y * columns;
+
+    CLPixelType pixel = im[c];
+    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
+    float brightness = HueSaturationBrightness.z;
+    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
+    brightness = clamp(brightness,0.0f,1.0f);
+    HueSaturationBrightness.z = brightness;
+
+    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
+    filteredPixel.w = pixel.w;
+    im[c] = filteredPixel;
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n t r a s t S t r e t c h                                            %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+    STRINGIFY(
+    /*
+    */
+    __kernel void Histogram(__global CLPixelType * restrict im,
+      const ChannelType channel, 
+      const unsigned int colorspace,
+      const unsigned int method,
+      __global uint4 * restrict histogram)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+        if ((channel & SyncChannels) != 0)
+        {
+          float red=(float)getRed(im[c]);
+          float green=(float)getGreen(im[c]);
+          float blue=(float)getBlue(im[c]);
+
+          float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
+          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
+          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
+        }
+        else
+        {
+          // for equalizing, we always need all channels?
+          // otherwise something more
+        }
+      }
+    )
+
+    STRINGIFY(
+    /*
+    */
+    __kernel void ContrastStretch(__global CLPixelType * restrict im,
+      const ChannelType channel,  
+      __global CLPixelType * restrict stretch_map,
+      const float4 white, const float4 black)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+
+        uint ePos;
+        CLPixelType oValue, eValue;
+        CLQuantum red, green, blue, alpha;
+
+        //read from global
+        oValue=im[c];
+
+        if ((channel & RedChannel) != 0)
+        {
+          if (getRedF4(white) != getRedF4(black))
+          {
+            ePos = ScaleQuantumToMap(getRed(oValue)); 
+            eValue = stretch_map[ePos];
+            red = getRed(eValue);
+          }
+        }
+
+        if ((channel & GreenChannel) != 0)
+        {
+          if (getGreenF4(white) != getGreenF4(black))
+          {
+            ePos = ScaleQuantumToMap(getGreen(oValue)); 
+            eValue = stretch_map[ePos];
+            green = getGreen(eValue);
+          }
+        }
+
+        if ((channel & BlueChannel) != 0)
+        {
+          if (getBlueF4(white) != getBlueF4(black))
+          {
+            ePos = ScaleQuantumToMap(getBlue(oValue)); 
+            eValue = stretch_map[ePos];
+            blue = getBlue(eValue);
+          }
+        }
+
+        if ((channel & AlphaChannel) != 0)
+        {
+          if (getAlphaF4(white) != getAlphaF4(black))
+          {
+            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
+            eValue = stretch_map[ePos];
+            alpha = getAlpha(eValue);
+          }
+        }
+
+        //write back
+        im[c]=(CLPixelType)(blue, green, red, alpha);
+
+      }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n v o l v e                                                          %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+    __kernel 
+    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
+    const unsigned int imageWidth, const unsigned int imageHeight,
+    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
+    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
+
+      int2 blockID;
+      blockID.x = get_global_id(0) / get_local_size(0);
+      blockID.y = get_global_id(1) / get_local_size(1);
+
+      // image area processed by this workgroup
+      int2 imageAreaOrg;
+      imageAreaOrg.x = blockID.x * get_local_size(0);
+      imageAreaOrg.y = blockID.y * get_local_size(1);
+
+      int2 midFilterDimen;
+      midFilterDimen.x = (filterWidth-1)/2;
+      midFilterDimen.y = (filterHeight-1)/2;
+
+      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
+
+      // dimension of the local cache
+      int2 cachedAreaDimen;
+      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
+      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
+
+      // cache the pixels accessed by this workgroup in local memory
+      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
+      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
+      int groupSize = get_local_size(0) * get_local_size(1);
+      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
+
+        int2 cachedAreaIndex;
+        cachedAreaIndex.x = i % cachedAreaDimen.x;
+        cachedAreaIndex.y = i / cachedAreaDimen.x;
+
+        int2 imagePixelIndex;
+        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
+
+        // only support EdgeVirtualPixelMethod through ClampToCanvas
+        // TODO: implement other virtual pixel method
+        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
+        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
+
+        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
+      }
+
+      // cache the filter
+      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
+        filterCache[i] = filter[i];
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+
+      int2 imageIndex;
+      imageIndex.x = imageAreaOrg.x + get_local_id(0);
+      imageIndex.y = imageAreaOrg.y + get_local_id(1);
+
+      // if out-of-range, stops here and quit
+      if (imageIndex.x >= imageWidth
+        || imageIndex.y >= imageHeight) {
+          return;
+      }
+
+      int filterIndex = 0;
+      float4 sum = (float4)0.0f;
+      float gamma = 0.0f;
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float f = filterCache[filterIndex];
+
+            sum.x += f * p.x;
+            sum.y += f * p.y;
+            sum.z += f * p.z; 
+            sum.w += f * p.w;
+
+            gamma += f;
+            filterIndex++;
+            cacheIndexX++;
+          }
+          cacheIndexY++;
+        }
+      }
+      else {
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
+
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float alpha = QuantumScale*p.w;
+            float f = filterCache[filterIndex];
+            float g = alpha * f;
+
+            sum.x += g*p.x;
+            sum.y += g*p.y;
+            sum.z += g*p.z;
+            sum.w += f*p.w;
+
+            gamma += g;
+            filterIndex++;
+            cacheIndexX++;
+          }
+          cacheIndexY++;
+        }
+        gamma = PerceptibleReciprocal(gamma);
+        sum.xyz = gamma*sum.xyz;
+      }
+      CLPixelType outputPixel;
+      outputPixel.x = ClampToQuantum(sum.x);
+      outputPixel.y = ClampToQuantum(sum.y);
+      outputPixel.z = ClampToQuantum(sum.z);
+      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
+
+      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
+    }
+  )
+
+  STRINGIFY(
+    __kernel 
+    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
+                  const uint imageWidth, const uint imageHeight,
+                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
+                  const uint matte, const ChannelType channel) {
+
+      int2 imageIndex;
+      imageIndex.x = get_global_id(0);
+      imageIndex.y = get_global_id(1);
+
+      /*
+      unsigned int imageWidth = get_global_size(0);
+      unsigned int imageHeight = get_global_size(1);
+      */
+      if (imageIndex.x >= imageWidth
+          || imageIndex.y >= imageHeight)
+          return;
+
+      int2 midFilterDimen;
+      midFilterDimen.x = (filterWidth-1)/2;
+      midFilterDimen.y = (filterHeight-1)/2;
+
+      int filterIndex = 0;
+      float4 sum = (float4)0.0f;
+      float gamma = 0.0f;
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
+        for (int j = 0; j < filterHeight; j++) {
+          int2 inputPixelIndex;
+          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
+          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
+          for (int i = 0; i < filterWidth; i++) {
+            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
+            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
+        
+            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
+            float f = filter[filterIndex];
+
+            sum.x += f * p.x;
+            sum.y += f * p.y;
+            sum.z += f * p.z; 
+            sum.w += f * p.w;
+
+            gamma += f;
+
+            filterIndex++;
+          }
+        }
+      }
+      else {
+
+        for (int j = 0; j < filterHeight; j++) {
+          int2 inputPixelIndex;
+          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
+          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
+          for (int i = 0; i < filterWidth; i++) {
+            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
+            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
+        
+            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
+            float alpha = QuantumScale*p.w;
+            float f = filter[filterIndex];
+            float g = alpha * f;
+
+            sum.x += g*p.x;
+            sum.y += g*p.y;
+            sum.z += g*p.z;
+            sum.w += f*p.w;
+
+            gamma += g;
+
+
+            filterIndex++;
+          }
+        }
+        gamma = PerceptibleReciprocal(gamma);
+        sum.xyz = gamma*sum.xyz;
+      }
+
+      CLPixelType outputPixel;
+      outputPixel.x = ClampToQuantum(sum.x);
+      outputPixel.y = ClampToQuantum(sum.y);
+      outputPixel.z = ClampToQuantum(sum.z);
+      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
+
+      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
+    }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     D e s p e c k l e                                                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+
+  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
+  , const unsigned int imageWidth, const unsigned int imageHeight
+  , const int2 offset, const int polarity, const int matte) {
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    CLPixelType v = inputImage[y*imageWidth+x];
+
+    int2 neighbor;
+    neighbor.y = y + offset.y;
+    neighbor.x = x + offset.x;
+
+    int2 clampedNeighbor;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType r = (clampedNeighbor.x == neighbor.x
+                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+    int sv[4];
+    sv[0] = (int)v.x;
+    sv[1] = (int)v.y;
+    sv[2] = (int)v.z;
+    sv[3] = (int)v.w;
+
+    int sr[4];
+    sr[0] = (int)r.x;
+    sr[1] = (int)r.y;
+    sr[2] = (int)r.z;
+    sr[3] = (int)r.w;
+
+    if (polarity > 0) {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
+      }
+    }
+    else {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
+      }
+
+    }
+
+    v.x = (CLQuantum)sv[0];
+    v.y = (CLQuantum)sv[1];
+    v.z = (CLQuantum)sv[2];
+
+    if (matte!=0)
+      v.w = (CLQuantum)sv[3];
+
+    outputImage[y*imageWidth+x] = v;
+
+    }
+
+
+  )
+
+  STRINGIFY(
+
+  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
+  , const unsigned int imageWidth, const unsigned int imageHeight
+  , const int2 offset, const int polarity, const int matte) {
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    CLPixelType v = inputImage[y*imageWidth+x];
+
+    int2 neighbor, clampedNeighbor;
+
+    neighbor.y = y + offset.y;
+    neighbor.x = x + offset.x;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType r = (clampedNeighbor.x == neighbor.x
+      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+
+    neighbor.y = y - offset.y;
+    neighbor.x = x - offset.x;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType s = (clampedNeighbor.x == neighbor.x
+      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+
+    int sv[4];
+    sv[0] = (int)v.x;
+    sv[1] = (int)v.y;
+    sv[2] = (int)v.z;
+    sv[3] = (int)v.w;
+
+    int sr[4];
+    sr[0] = (int)r.x;
+    sr[1] = (int)r.y;
+    sr[2] = (int)r.z;
+    sr[3] = (int)r.w;
+
+    int ss[4];
+    ss[0] = (int)s.x;
+    ss[1] = (int)s.y;
+    ss[2] = (int)s.z;
+    ss[3] = (int)s.w;
+
+    if (polarity > 0) {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
+        //
+        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+      }
+    }
+    else {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
+        //
+        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
+        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
+      }
+    }
+
+    v.x = (CLQuantum)sv[0];
+    v.y = (CLQuantum)sv[1];
+    v.z = (CLQuantum)sv[2];
+
+    if (matte!=0)
+      v.w = (CLQuantum)sv[3];
+
+    outputImage[y*imageWidth+x] = v;
+
+    }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     E q u a l i z e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+    STRINGIFY(
+    /*
+    */
+    __kernel void Equalize(__global CLPixelType * restrict im,
+      const ChannelType channel,  
+      __global CLPixelType * restrict equalize_map,
+      const float4 white, const float4 black)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+
+        uint ePos;
+        CLPixelType oValue, eValue;
+        CLQuantum red, green, blue, alpha;
+
+        //read from global
+        oValue=im[c];
+
+        if ((channel & SyncChannels) != 0)
+        {
+          if (getRedF4(white) != getRedF4(black))
+          {
+            ePos = ScaleQuantumToMap(getRed(oValue)); 
+            eValue = equalize_map[ePos];
+            red = getRed(eValue);
+            ePos = ScaleQuantumToMap(getGreen(oValue)); 
+            eValue = equalize_map[ePos];
+            green = getRed(eValue);
+            ePos = ScaleQuantumToMap(getBlue(oValue)); 
+            eValue = equalize_map[ePos];
+            blue = getRed(eValue);
+            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
+            eValue = equalize_map[ePos];
+            alpha = getRed(eValue);
+ 
+            //write back
+            im[c]=(CLPixelType)(blue, green, red, alpha);
+          }
+
+        }
+
+        // for equalizing, we always need all channels?
+        // otherwise something more
+
+     }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     F u n c t i o n                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  /*
+  apply FunctionImageChannel(braightness-contrast)
+  */
+  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
+    const unsigned int number_parameters,__constant float *parameters)
+  {
+    float result = 0.0f;
+
+    switch (function)
+    {
+    case PolynomialFunction:
+      {
+        for (unsigned int i=0; i < number_parameters; i++)
+          result = result*QuantumScale*pixel + parameters[i];
+        result *= QuantumRange;
+        break;
+      }
+    case SinusoidFunction:
+      {
+        float  freq,phase,ampl,bias;
+        freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+        phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
+        ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
+        bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+        result = QuantumRange*(ampl*sin(2.0f*MagickPI*
+          (freq*QuantumScale*pixel + phase/360.0f)) + bias);
+        break;
+      }
+    case ArcsinFunction:
+      {
+        float  width,range,center,bias;
+        width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+
+        result = 2.0f/width*(QuantumScale*pixel - center);
+        result = range/MagickPI*asin(result)+bias;
+        result = ( result <= -1.0f ) ? bias - range/2.0f : result;
+        result = ( result >= 1.0f ) ? bias + range/2.0f : result;
+        result *= QuantumRange;
+        break;
+      }
+    case ArctanFunction:
+      {
+        float slope,range,center,bias;
+        slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+        result = MagickPI*slope*(QuantumScale*pixel-center);
+        result = QuantumRange*(range/MagickPI*atan(result) + bias);
+        break;
+      }
+    case UndefinedFunction:
+      break;
+    }
+    return(ClampToQuantum(result));
+  }
+  )
+
+  STRINGIFY(
+  /*
+  Improve brightness / contrast of the image
+  channel : define which channel is improved
+  function : the function called to enchance the brightness contrast
+  number_parameters : numbers of parameters 
+  parameters : the parameter
+  */
+  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
+    const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
+    __constant float *parameters)
+  {
+    const unsigned int x = get_global_id(0);
+    const unsigned int y = get_global_id(1);
+    const unsigned int columns = get_global_size(0);
+    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
+
+    float red;
+    float green;
+    float blue;
+    float alpha;
+
+    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
+
+    if ((channel & RedChannel) != 0)
+      red=ApplyFunction(red, function, number_parameters, parameters);
+
+    if (number_channels > 2)
+      {
+        if ((channel & GreenChannel) != 0)
+          green=ApplyFunction(green, function, number_parameters, parameters);
+
+        if ((channel & BlueChannel) != 0)
+          blue=ApplyFunction(blue, function, number_parameters, parameters);
+      }
+
+    if (((number_channels == 4) || (number_channels == 2)) &&
+        ((channel & AlphaChannel) != 0))
+      alpha=ApplyFunction(alpha, function, number_parameters, parameters);
+
+    WriteChannels(p, number_channels, channel, red, green, blue, alpha);
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     G r a y s c a l e                                                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
+    const unsigned int colorspace,const unsigned int method)
+  {
+    const unsigned int x = get_global_id(0);
+    const unsigned int y = get_global_id(1);
+    const unsigned int columns = get_global_size(0);
+    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
+
+    float
+      blue,
+      green,
+      red;
+
+    red=getPixelRed(p);
+    green=getPixelGreen(p);
+    blue=getPixelBlue(p);
+
+    CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
+
+    setPixelRed(p,intensity);
+    setPixelGreen(p,intensity);
+    setPixelBlue(p,intensity);
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     L o c a l C o n t r a s t                                               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+    STRINGIFY(
+
+      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
+          const int radius, 
+          const int imageWidth,
+          const int imageHeight)
+      {
+        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
+
+        int x = get_local_id(0);
+        int y = get_global_id(1);
+
+        global CLPixelType *src = srcImage + y * imageWidth;
+
+        for (int i = x; i < imageWidth; i += get_local_size(0)) {
+            float sum = 0.0f;
+            float weight = 1.0f;
+
+            int j = i - radius;
+            while ((j + 7) < i) {
+                for (int k = 0; k < 8; ++k) // Unroll 8x
+                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
+                weight += 8.0f;
+                j+=8;
+            }
+            while (j < i) {
+                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
+                weight += 1.0f;
+                ++j;
+            }
+
+            while ((j + 7) < radius + i) {
+                for (int k = 0; k < 8; ++k) // Unroll 8x
+                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
+                weight -= 8.0f;
+                j+=8;
+            }
+            while (j < radius + i) {
+                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
+                weight -= 1.0f;
+                ++j;
+            }
+
+            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
+        }
+      }
+    )
+
+    STRINGIFY(
+      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
+          const int radius, 
+          const float strength,
+          const int imageWidth,
+          const int imageHeight)
+      {
+        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
+
+        int x = get_global_id(0);
+        int y = get_global_id(1);
+
+        if ((x >= imageWidth) || (y >= imageHeight))
+                return;
+
+        global float *src = blurImage + x;
+
+        float sum = 0.0f;
+        float weight = 1.0f;
+
+        int j = y - radius;
+        while ((j + 7) < y) {
+            for (int k = 0; k < 8; ++k) // Unroll 8x
+                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
+            weight += 8.0f;
+            j+=8;
+        }
+        while (j < y) {
+            sum += weight * src[mirrorBottom(j) * imageWidth];
+            weight += 1.0f;
+            ++j;
+        }
+
+        while ((j + 7) < radius + y) {
+            for (int k = 0; k < 8; ++k) // Unroll 8x
+                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
+            weight -= 8.0f;
+            j+=8;
+        }
+        while (j < radius + y) {
+            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
+            weight -= 1.0f;
+            ++j;
+        }
+
+        CLPixelType pixel = srcImage[x + y * imageWidth];
+        float srcVal = dot(RGB, convert_float4(pixel));
+        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
+        mult = (srcVal + mult) / srcVal;
+
+        pixel.x = ClampToQuantum(pixel.x * mult);
+        pixel.y = ClampToQuantum(pixel.y * mult);
+        pixel.z = ClampToQuantum(pixel.z * mult);
+
+        dstImage[x + y * imageWidth] = pixel;
+      }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     M o d u l a t e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+
+  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
+    float *hue, float *saturation, float *lightness)
+  {
+  float
+    c,
+    tmax,
+    tmin;
+
+  /*
+     Convert RGB to HSL colorspace.
+     */
+  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
+  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
+
+  c=tmax-tmin;
+
+  *lightness=(tmax+tmin)/2.0;
+  if (c <= 0.0)
+  {
+    *hue=0.0;
+    *saturation=0.0;
+    return;
+  }
+
+  if (tmax == (QuantumScale*red))
+  {
+    *hue=(QuantumScale*green-QuantumScale*blue)/c;
+    if ((QuantumScale*green) < (QuantumScale*blue))
+      *hue+=6.0;
+  }
+  else
+    if (tmax == (QuantumScale*green))
+      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
+    else
+      *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
+
+  *hue*=60.0/360.0;
+  if (*lightness <= 0.5)
+    *saturation=c/(2.0*(*lightness));
+  else
+    *saturation=c/(2.0-2.0*(*lightness));
+  }
+
+  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
+      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
+  {
+    float
+      b,
+      c,
+      g,
+      h,
+      tmin,
+      r,
+      x;
+
+    /*
+       Convert HSL to RGB colorspace.
+       */
+    h=hue*360.0;
+    if (lightness <= 0.5)
+      c=2.0*lightness*saturation;
+    else
+      c=(2.0-2.0*lightness)*saturation;
+    tmin=lightness-0.5*c;
+    h-=360.0*floor(h/360.0);
+    h/=60.0;
+    x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
+    switch ((int) floor(h) % 6)
+    {
+      case 0:
+        {
+          r=tmin+c;
+          g=tmin+x;
+          b=tmin;
+          break;
+        }
+      case 1:
+        {
+          r=tmin+x;
+          g=tmin+c;
+          b=tmin;
+          break;
+        }
+      case 2:
+        {
+          r=tmin;
+          g=tmin+c;
+          b=tmin+x;
+          break;
+        }
+      case 3:
+        {
+          r=tmin;
+          g=tmin+x;
+          b=tmin+c;
+          break;
+        }
+      case 4:
+        {
+          r=tmin+x;
+          g=tmin;
+          b=tmin+c;
+          break;
+        }
+      case 5:
+        {
+          r=tmin+c;
+          g=tmin;
+          b=tmin+x;
+          break;
+        }
+      default:
+        {
+          r=0.0;
+          g=0.0;
+          b=0.0;
+        }
+    }
+    *red=ClampToQuantum(QuantumRange*r);
+    *green=ClampToQuantum(QuantumRange*g);
+    *blue=ClampToQuantum(QuantumRange*b);
+  }
+
+  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
+    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
+  {
+    float
+      hue,
+      lightness,
+      saturation;
+
+    /*
+    Increase or decrease color lightness, saturation, or hue.
+    */
+    ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
+    hue+=0.5*(0.01*percent_hue-1.0);
+    while (hue < 0.0)
+      hue+=1.0;
+    while (hue >= 1.0)
+      hue-=1.0;
+    saturation*=0.01*percent_saturation;
+    lightness*=0.01*percent_lightness;
+    ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
+  }
+
+  __kernel void Modulate(__global CLPixelType *im, 
+    const float percent_brightness, 
+    const float percent_hue, 
+    const float percent_saturation, 
+    const int colorspace)
+  {
+
+    const int x = get_global_id(0);  
+    const int y = get_global_id(1);
+    const int columns = get_global_size(0);
+    const int c = x + y * columns;
+
+    CLPixelType pixel = im[c];
+
+    CLQuantum
+        blue,
+        green,
+        red;
+
+    red=getRed(pixel);
+    green=getGreen(pixel);
+    blue=getBlue(pixel);
+
+    switch (colorspace)
+    {
+      case HSLColorspace:
+      default:
+        {
+          ModulateHSL(percent_hue, percent_saturation, percent_brightness, 
+              &red, &green, &blue);
+        }
+
+    }
+
+    CLPixelType filteredPixel;
+   
+    setRed(&filteredPixel, red);
+    setGreen(&filteredPixel, green);
+    setBlue(&filteredPixel, blue);
+    filteredPixel.w = pixel.w;
+
+    im[c] = filteredPixel;
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     M o t i o n B l u r                                                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+    __kernel 
+    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
+                    const unsigned int imageWidth, const unsigned int imageHeight,
+                    const __global float *filter, const unsigned int width, const __global int2* offset,
+                    const float4 bias,
+                    const ChannelType channel, const unsigned int matte) {
+
+      int2 currentPixel;
+      currentPixel.x = get_global_id(0);
+      currentPixel.y = get_global_id(1);
+
+      if (currentPixel.x >= imageWidth
+          || currentPixel.y >= imageHeight)
+          return;
+
+      float4 pixel;
+      pixel.x = (float)bias.x;
+      pixel.y = (float)bias.y;
+      pixel.z = (float)bias.z;
+      pixel.w = (float)bias.w;
+
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
+        
+        for (int i = 0; i < width; i++) {
+          // only support EdgeVirtualPixelMethod through ClampToCanvas
+          // TODO: implement other virtual pixel method
+          int2 samplePixel = currentPixel + offset[i];
+          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
+          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
+          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
+
+          pixel.x += (filter[i] * (float)samplePixelValue.x);
+          pixel.y += (filter[i] * (float)samplePixelValue.y);
+          pixel.z += (filter[i] * (float)samplePixelValue.z);
+          pixel.w += (filter[i] * (float)samplePixelValue.w);
+        }
+
+        CLPixelType outputPixel;
+        outputPixel.x = ClampToQuantum(pixel.x);
+        outputPixel.y = ClampToQuantum(pixel.y);
+        outputPixel.z = ClampToQuantum(pixel.z);
+        outputPixel.w = ClampToQuantum(pixel.w);
+        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+      }
+      else {
+
+        float gamma = 0.0f;
+        for (int i = 0; i < width; i++) {
+          // only support EdgeVirtualPixelMethod through ClampToCanvas
+          // TODO: implement other virtual pixel method
+          int2 samplePixel = currentPixel + offset[i];
+          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
+          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
+
+          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
+
+          float alpha = QuantumScale*samplePixelValue.w;
+          float k = filter[i];
+          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
+          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
+          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
+
+          pixel.w += k * alpha * samplePixelValue.w;
+
+          gamma+=k*alpha;
+        }
+        gamma = PerceptibleReciprocal(gamma);
+        pixel.xyz = gamma*pixel.xyz;
+
+        CLPixelType outputPixel;
+        outputPixel.x = ClampToQuantum(pixel.x);
+        outputPixel.y = ClampToQuantum(pixel.y);
+        outputPixel.z = ClampToQuantum(pixel.z);
+        outputPixel.w = ClampToQuantum(pixel.w);
+        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+      }
+    }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R e s i z e                                                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  // Based on Box from resize.c
+  float BoxResizeFilter(const float x)
+  {
+    return 1.0f;
+  }
+  )
+    
+  STRINGIFY(
+  // Based on CubicBC from resize.c
+  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
+  {
+    /*
+    Cubic Filters using B,C determined values:
+    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
+    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
+    Spline              B = 1   C = 0    B-Spline Gaussian approximation
+    Hermite             B = 0   C = 0    B-Spline interpolator
+
+    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
+    Graphics Computer Graphics, Volume 22, Number 4, August 1988
+    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
+    Mitchell.pdf.
+
+    Coefficents are determined from B,C values:
+    P0 = (  6 - 2*B       )/6 = coeff[0]
+    P1 =         0
+    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
+    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
+    Q0 = (      8*B +24*C )/6 = coeff[3]
+    Q1 = (    -12*B -48*C )/6 = coeff[4]
+    Q2 = (      6*B +30*C )/6 = coeff[5]
+    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
+
+    which are used to define the filter:
+
+    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
+    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
+
+    which ensures function is continuous in value and derivative (slope).
+    */
+    if (x < 1.0)
+      return(resizeFilterCoefficients[0]+x*(x*
+      (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
+    if (x < 2.0)
+      return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
+      (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
+    return(0.0);
+  }
+  )
+
+  STRINGIFY(
+  float Sinc(const float x)
+  {
+    if (x != 0.0f)
+    {
+      const float alpha=(float) (MagickPI*x);
+      return sinpi(x)/alpha;
+    }
+    return(1.0f);
+  }
+  )
+
+  STRINGIFY(
+  float Triangle(const float x)
+  {
+    /*
+    1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
+    a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
+    for Sinc().
+    */
+    return ((x<1.0f)?(1.0f-x):0.0f);
+  }
+  )
+
+
+  STRINGIFY(
+  float Hann(const float x)
+  {
+    /*
+    Cosine window function:
+      0.5+0.5*cos(pi*x).
+    */
+    const float cosine=cos((MagickPI*x));
+    return(0.5f+0.5f*cosine);
+  }
+  )
+
+  STRINGIFY(
+  float Hamming(const float x)
+  {
+    /*
+      Offset cosine window function:
+       .54 + .46 cos(pi x).
+    */
+    const float cosine=cos((MagickPI*x));
+    return(0.54f+0.46f*cosine);
+  }
+  )
+
+  STRINGIFY(
+  float Blackman(const float x)
+  {
+    /*
+      Blackman: 2nd order cosine windowing function:
+        0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
+
+      Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
+      five flops.
+    */
+    const float cosine=cos((MagickPI*x));
+    return(0.34f+cosine*(0.5f+cosine*0.16f));
+  }
+  )
+
+  STRINGIFY(
+  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
+  {
+    switch (filterType)
+    {
+    /* Call Sinc even for SincFast to get better precision on GPU 
+       and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
+    case SincWeightingFunction:
+    case SincFastWeightingFunction:
+      return Sinc(x);
+    case CubicBCWeightingFunction:
+      return CubicBC(x,filterCoefficients);
+    case BoxWeightingFunction:
+      return BoxResizeFilter(x);
+    case TriangleWeightingFunction:
+      return Triangle(x);
+    case HannWeightingFunction:
+      return Hann(x);
+    case HammingWeightingFunction:
+      return Hamming(x);
+    case BlackmanWeightingFunction:
+      return Blackman(x);
+
+    default:
+      return 0.0f;
+    }
+  }
+  )
+
+
+  STRINGIFY(
+  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
+           , const ResizeWeightingFunctionType resizeWindowType
+           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
+  {
+    float scale;
+    float xBlur = fabs(x/resizeFilterBlur);
+    if (resizeWindowSupport < MagickEpsilon
+        || resizeWindowType == BoxWeightingFunction)
+    {
+      scale = 1.0f;
+    }
+    else
+    {
+      scale = resizeFilterScale;
+      scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
+    }
+    float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
+    return weight;
+  }
+
+  )
+
+  ;
+  const char *accelerateKernels2 =
+
+  STRINGIFY(
+
+  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
+    return (numWorkItems/pixelPerWorkgroup);
+  }
+
+  // returns the index of the pixel for the current workitem to compute.
+  // returns -1 if this workitem doesn't need to participate in any computation
+  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
+    const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
+    int pixelIndex = itemID/numWorkItemsPerPixel;
+    pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
+    return pixelIndex;
+  }
+ 
+  )
+
+  STRINGIFY(
+  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
+    void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
+      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
+      const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
+      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
+      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
+      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
+      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
+      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
+  {
+    // calculate the range of resized image pixels computed by this workgroup
+    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
+    const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
+    const unsigned int actualNumPixelToCompute = stopX - startX;
+
+    // calculate the range of input image pixels to cache
+    float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
+    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
+    scale = PerceptibleReciprocal(scale);
+
+    const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
+    const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
+
+    // cache the input pixels into local memory
+    const unsigned int y = get_global_id(1);
+    const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
+    const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
+    event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
+    wait_group_events(1, &e);
+
+    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
+    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
+    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
+    {
+      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
+      const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
+      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
+
+      // determine which resized pixel computed by this workitem
+      const unsigned int itemID = get_local_id(0);
+      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
+
+      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
+
+      float4 filteredPixel = (float4)0.0f;
+      float density = 0.0f;
+      float gamma = 0.0f;
+      // -1 means this workitem doesn't participate in the computation
+      if (pixelIndex != -1)
+      {
+        // x coordinated of the resized pixel computed by this workitem
+        const int x = chunkStartX + pixelIndex;
+
+        // calculate how many steps required for this pixel
+        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
+        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
+        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
+        const unsigned int n = stop - start;
+
+        // calculate how many steps this workitem will contribute
+        unsigned int numStepsPerWorkItem = n / numItems;
+        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
+
+        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
+        if (startStep < n)
+        {
+          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
+
+          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
+
+          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
+          {
+            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
+              (ResizeWeightingFunctionType) resizeFilterType,
+              (ResizeWeightingFunctionType) resizeWindowType,
+              resizeFilterScale, resizeFilterWindowSupport,
+              resizeFilterBlur, scale*(start + i - bisect + 0.5));
+
+            float4 cp = (float4) 0;
+
+            __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
+            cp.x = (float) *(p);
+            if (number_channels > 2)
+            {
+              cp.y = (float) *(p + 1);
+              cp.z = (float) *(p + 2);
+            }
+
+            if (alpha_index != 0)
+            {
+              cp.w = (float) *(p + alpha_index);
+
+              float alpha = weight * QuantumScale * cp.w;
+
+              filteredPixel.x += alpha * cp.x;
+              filteredPixel.y += alpha * cp.y;
+              filteredPixel.z += alpha * cp.z;
+              filteredPixel.w += weight * cp.w;
+              gamma += alpha;
+            }
+            else
+              filteredPixel += ((float4) weight)*cp;
+
+            density += weight;
+          }
+        }
+      }
+
+      // initialize the accumulators to zero
+      if (itemID < actualNumPixelInThisChunk) {
+        outputPixelCache[itemID] = (float4)0.0f;
+        densityCache[itemID] = 0.0f;
+        if (alpha_index != 0)
+          gammaCache[itemID] = 0.0f;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // accumulatte the filtered pixel value and the density
+      for (unsigned int i = 0; i < numItems; i++) {
+        if (pixelIndex != -1) {
+          if (itemID%numItems == i) {
+            outputPixelCache[pixelIndex]+=filteredPixel;
+            densityCache[pixelIndex]+=density;
+            if (alpha_index != 0)
+              gammaCache[pixelIndex]+=gamma;
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+
+      if (itemID < actualNumPixelInThisChunk)
+      {
+        float4 filteredPixel = outputPixelCache[itemID];
+
+        float gamma = 0.0f;
+        if (alpha_index != 0)
+          gamma = gammaCache[itemID];
+
+        float density = densityCache[itemID];
+        if ((density != 0.0f) && (density != 1.0f))
+        {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4) density;
+          gamma *= density;
+        }
+
+        if (alpha_index != 0)
+        {
+          gamma = PerceptibleReciprocal(gamma);
+          filteredPixel.x *= gamma;
+          filteredPixel.y *= gamma;
+          filteredPixel.z *= gamma;
+        }
+
+        WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel);
+      }
+    }
+  }
+  )
+
+
+  STRINGIFY(
+ __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
+    void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
+      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
+      const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
+      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
+      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
+      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
+      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
+      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
+  {
+    // calculate the range of resized image pixels computed by this workgroup
+    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
+    const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
+    const unsigned int actualNumPixelToCompute = stopY - startY;
+
+    // calculate the range of input image pixels to cache
+    float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
+    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
+    scale = PerceptibleReciprocal(scale);
+
+    const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
+    const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
+
+    // cache the input pixels into local memory
+    const unsigned int x = get_global_id(0);
+    unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
+    unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
+    unsigned int stride = inputColumns * number_channels;
+    for (unsigned int i = 0; i < number_channels; i++)
+    {
+      event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
+      wait_group_events(1,&e);
+    }
+
+    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
+    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
+    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
+    {
+      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
+      const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
+      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
+
+      // determine which resized pixel computed by this workitem
+      const unsigned int itemID = get_local_id(1);
+      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
+
+      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
+
+      float4 filteredPixel = (float4)0.0f;
+      float density = 0.0f;
+      float gamma = 0.0f;
+      // -1 means this workitem doesn't participate in the computation
+      if (pixelIndex != -1)
+      {
+        // x coordinated of the resized pixel computed by this workitem
+        const int y = chunkStartY + pixelIndex;
+
+        // calculate how many steps required for this pixel
+        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
+        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
+        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
+        const unsigned int n = stop - start;
+
+        // calculate how many steps this workitem will contribute
+        unsigned int numStepsPerWorkItem = n / numItems;
+        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
+
+        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
+        if (startStep < n)
+        {
+          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
+
+          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
+          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
+          {
+            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
+              (ResizeWeightingFunctionType) resizeFilterType,
+              (ResizeWeightingFunctionType) resizeWindowType,
+              resizeFilterScale, resizeFilterWindowSupport,
+              resizeFilterBlur, scale*(start + i - bisect + 0.5));
+
+            float4 cp = (float4)0.0f;
+
+            __local CLQuantum *p = inputImageCache + cacheIndex;
+            cp.x = (float) *(p);
+            if (number_channels > 2)
+            {
+              cp.y = (float) *(p + rangeLength);
+              cp.z = (float) *(p + (rangeLength * 2));
+            }
+
+            if (alpha_index != 0)
+            {
+              cp.w = (float) *(p + (rangeLength * alpha_index));
+
+              float alpha = weight * QuantumScale * cp.w;
+
+              filteredPixel.x += alpha * cp.x;
+              filteredPixel.y += alpha * cp.y;
+              filteredPixel.z += alpha * cp.z;
+              filteredPixel.w += weight * cp.w;
+              gamma += alpha;
+            }
+            else
+              filteredPixel += ((float4) weight)*cp;
+
+            density += weight;
+          }
+        }
+      }
+
+      // initialize the accumulators to zero
+      if (itemID < actualNumPixelInThisChunk) {
+        outputPixelCache[itemID] = (float4)0.0f;
+        densityCache[itemID] = 0.0f;
+        if (alpha_index != 0)
+          gammaCache[itemID] = 0.0f;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // accumulatte the filtered pixel value and the density
+      for (unsigned int i = 0; i < numItems; i++) {
+        if (pixelIndex != -1) {
+          if (itemID%numItems == i) {
+            outputPixelCache[pixelIndex]+=filteredPixel;
+            densityCache[pixelIndex]+=density;
+            if (alpha_index != 0)
+              gammaCache[pixelIndex]+=gamma;
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+
+      if (itemID < actualNumPixelInThisChunk)
+      {
+        float4 filteredPixel = outputPixelCache[itemID];
+
+        float gamma = 0.0f;
+        if (alpha_index != 0)
+          gamma = gammaCache[itemID];
+
+        float density = densityCache[itemID];
+        if ((density != 0.0f) && (density != 1.0f))
+        {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4) density;
+          gamma *= density;
+        }
+
+        if (alpha_index != 0)
+        {
+          gamma = PerceptibleReciprocal(gamma);
+          filteredPixel.x *= gamma;
+          filteredPixel.y *= gamma;
+          filteredPixel.z *= gamma;
+        }
+
+        WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel);
+      }
+    }
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R o t a t i o n a l B l u r                                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels,
+    const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta,
+    __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage)
+  {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int columns = get_global_size(0);
+    const int rows = get_global_size(1);
+    unsigned int step = 1;
+    float center_x = (float) x - blurCenter.x;
+    float center_y = (float) y - blurCenter.y;
+    float radius = hypot(center_x, center_y);
+
+    float blur_radius = hypot(blurCenter.x, blurCenter.y);
+
+    if (radius > MagickEpsilon)
+    {
+      step = (unsigned int) (blur_radius / radius);
+      if (step == 0)
+        step = 1;
+      if (step >= cossin_theta_size)
+        step = cossin_theta_size-1;
+    }
+
+    float4 result = bias;
+
+    float normalize = 0.0f;
+    float gamma = 0.0f;
+
+    for (unsigned int i=0; i<cossin_theta_size; i+=step)
+    {
+      int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
+      int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
+
+      float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, channel);
+
+      if ((number_channels == 4) || (number_channels == 2))
+      {
+        float alpha = (float)(QuantumScale*pixel.w);
+
+        gamma += alpha;
+
+        result.x += alpha * pixel.x;
+        result.y += alpha * pixel.y;
+        result.z += alpha * pixel.z;
+        result.w += pixel.w;
+      }
+      else
+        result += pixel;
+
+      normalize += 1.0f;
+    }
+
+    normalize = PerceptibleReciprocal(normalize);
+
+    if ((number_channels == 4) || (number_channels == 2))
+    {
+      gamma = PerceptibleReciprocal(gamma);
+      result.x *= gamma;
+      result.y *= gamma;
+      result.z *= gamma;
+      result.w *= normalize;
+    }
+    else
+      result *= normalize;
+
+    WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
+  }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     U n s h a r p M a s k                                                   %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
+    const __global float4 *blurRowData,const unsigned int number_channels,
+    const ChannelType channel,const unsigned int columns,
+    const unsigned int rows,__local float4* cachedData,
+    __local float* cachedFilter,const __global float *filter,
+    const unsigned int width,const float gain, const float threshold,
+    __global CLQuantum *filteredImage)
+  {
+    const unsigned int radius = (width-1)/2;
+
+    // cache the pixel shared by the workgroup
+    const int groupX = get_group_id(0);
+    const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
+    const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
+
+    if ((groupStartY >= 0) && (groupStopY < rows))
+    {
+      event_t e = async_work_group_strided_copy(cachedData,
+        blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
+      wait_group_events(1,&e);
+    }
+    else
+    {
+      for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
+        cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    // cache the filter as well
+    event_t e = async_work_group_copy(cachedFilter,filter,width,0);
+    wait_group_events(1,&e);
+
+    // only do the work if this is not a patched item
+    const int cy = get_global_id(1);
+
+    if (cy < rows)
+    {
+      float4 blurredPixel = (float4) 0.0f;
+
+      int i = 0;
+
+      for ( ; i+7 < width; )
+      {
+        for (int j=0; j < 8; j++, i++)
+          blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
+      }
+
+      for ( ; i < width; i++)
+        blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+
+      float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
+      float4 outputPixel = inputImagePixel - blurredPixel;
+
+      float quantumThreshold = QuantumRange*threshold;
+
+      int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
+      outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
+
+      //write back
+      WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
+    }
+  }
+  )
+
+  STRINGIFY(
+  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
+    const ChannelType channel,__constant float *filter,const unsigned int width,
+    const unsigned int columns,const unsigned int rows,__local float4 *pixels,
+    const float gain,const float threshold,__global CLQuantum *filteredImage)
+  {
+    const unsigned int x = get_global_id(0);
+    const unsigned int y = get_global_id(1);
+
+    const unsigned int radius = (width - 1) / 2;
+
+    int row = y - radius;
+    int baseRow = get_group_id(1) * get_local_size(1) - radius;
+    int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
+
+    while (row < endRow) {
+      int srcy = (row < 0) ? -row : row; // mirror pad
+      srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
+
+      float4 value = 0.0f;
+
+      int ix = x - radius;
+      int i = 0;
+
+      while (i + 7 < width) {
+        for (int j = 0; j < 8; ++j) { // unrolled
+          int srcx = ix + j;
+          srcx = (srcx < 0) ? -srcx : srcx;
+          srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
+          value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
+        }
+        ix += 8;
+        i += 8;
+      }
+
+      while (i < width) {
+        int srcx = (ix < 0) ? -ix : ix; // mirror pad
+        srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
+        value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
+        ++i;
+        ++ix;
+      }
+      pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
+      row += get_local_size(1);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int px = get_local_id(0);
+    const int py = get_local_id(1);
+    const int prp = get_local_size(0);
+    float4 value = (float4)(0.0f);
+
+    int i = 0;
+    while (i + 7 < width) {
+      for (int j = 0; j < 8; ++j) // unrolled
+        value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
+      i += 8;
+    }
+    while (i < width) {
+      value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
+      ++i;
+    }
+
+    float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
+    float4 diff = srcPixel - value;
+
+    float quantumThreshold = QuantumRange*threshold;
+
+    int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
+    value = select(srcPixel + diff * gain, srcPixel, mask);
+
+    if ((x < columns) && (y < rows))
+      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
+  }
+  )
+
+  STRINGIFY(
+    __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
+    void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
+      const unsigned int number_channels,const unsigned int max_channels,
+      const float threshold,const int passes,const unsigned int imageWidth,
+      const unsigned int imageHeight)
+  {
+    const int pad = (1 << (passes - 1));
+    const int tileSize = 64;
+    const int tileRowPixels = 64;
+    const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
+
+    CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
+
+    local float buffer[64 * 64];
+
+    int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
+    int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
+
+    for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
+      int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
+                (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
+
+      for (int channel = 0; channel < max_channels; ++channel)
+        stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
+    }
+
+    for (int channel = 0; channel < max_channels; ++channel) {
+      // Load LDS
+      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+        buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
+
+      // Process
+
+      float tmp[16];
+      float accum[16];
+      float pixel;
+
+      for (int i = 0; i < 16; i++)
+        accum[i]=0.0f;
+
+      for (int pass = 0; pass < passes; ++pass) {
+        const int radius = 1 << pass;
+        const int x = get_local_id(0);
+        const float thresh = threshold * noise[pass];
+
+        // Apply horizontal hat
+        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
+          const int offset = i * tileRowPixels;
+          if (pass == 0)
+            tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
+          pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
+          barrier(CLK_LOCAL_MEM_FENCE);
+          buffer[x + offset] = pixel;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Apply vertical hat
+        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
+          pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
+          float delta = tmp[i / 4] - pixel;
+          tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
+          if (delta < -thresh)
+            delta += thresh;
+          else if (delta > thresh)
+            delta -= thresh;
+          else
+            delta = 0;
+          accum[i / 4] += delta;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if (pass < passes - 1)
+          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+            buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
+        else  // last pass
+          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+            accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+
+      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+        stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Write from stage to output
+
+    if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
+      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
+        if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
+          int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
+          for (int channel = 0; channel < max_channels; ++channel) {
+            dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
+          }
+        }
+      }
+    }
+  }
+  )
+
+  ;
+
+#endif // MAGICKCORE_OPENCL_SUPPORT
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif // _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
diff --git a/MagickCore/accelerate-private.h b/MagickCore/accelerate-private.h

index 95b2eab75bc9744adb8108b5b2f2b2817670a7dd..bd2b6c5e3c0a0333f1d8873ad0ee535d6a9d78f4 100644 (file)
--- a/MagickCore/accelerate-private.h
+++ b/MagickCore/accelerate-private.h
@@ -29,3239 +29,6 @@
  extern "C" {
  #endif
  
-#if defined(MAGICKCORE_OPENCL_SUPPORT)
-
-/*
-  Define declarations.
-*/
-#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
-#define OPENCL_ELIF(...)       "\n #""elif " #__VA_ARGS__ " \n"
-#define OPENCL_ELSE()          "\n #""else " " \n"
-#define OPENCL_ENDIF()         "\n #""endif " " \n"
-#define OPENCL_IF(...)         "\n #""if " #__VA_ARGS__ " \n"
-#define STRINGIFY(...) #__VA_ARGS__ "\n"
-
-/*
-  Typedef declarations.
-*/
-
-typedef struct _FloatPixelPacket
-{
-  MagickRealType
-    red,
-    green,
-    blue,
-    alpha,
-    black;
-} FloatPixelPacket;
-
-const char *accelerateKernels =
-
-/*
-  Define declarations.
-*/
-  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
-  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
-  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
-  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
-  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
-  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
-  OPENCL_DEFINE(SigmaRandom, (attenuate))
-  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
-  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
-  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
-
-/*
-  Typedef declarations.
-*/
-  STRINGIFY(
-    typedef enum
-  {
-    UndefinedColorspace,
-    RGBColorspace,            /* Linear RGB colorspace */
-    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
-    TransparentColorspace,
-    OHTAColorspace,
-    LabColorspace,
-    XYZColorspace,
-    YCbCrColorspace,
-    YCCColorspace,
-    YIQColorspace,
-    YPbPrColorspace,
-    YUVColorspace,
-    CMYKColorspace,           /* negared linear RGB with black separated */
-    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
-    HSBColorspace,
-    HSLColorspace,
-    HWBColorspace,
-    Rec601LumaColorspace,
-    Rec601YCbCrColorspace,
-    Rec709LumaColorspace,
-    Rec709YCbCrColorspace,
-    LogColorspace,
-    CMYColorspace,            /* negated linear RGB colorspace */
-    LuvColorspace,
-    HCLColorspace,
-    LCHColorspace,            /* alias for LCHuv */
-    LMSColorspace,
-    LCHabColorspace,          /* Cylindrical (Polar) Lab */
-    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
-    scRGBColorspace,
-    HSIColorspace,
-    HSVColorspace,            /* alias for HSB */
-    HCLpColorspace,
-    YDbDrColorspace
-  } ColorspaceType;
-  )
-
-  STRINGIFY(
-    typedef enum
-    {
-      UndefinedCompositeOp,
-      NoCompositeOp,
-      ModulusAddCompositeOp,
-      AtopCompositeOp,
-      BlendCompositeOp,
-      BumpmapCompositeOp,
-      ChangeMaskCompositeOp,
-      ClearCompositeOp,
-      ColorBurnCompositeOp,
-      ColorDodgeCompositeOp,
-      ColorizeCompositeOp,
-      CopyBlackCompositeOp,
-      CopyBlueCompositeOp,
-      CopyCompositeOp,
-      CopyCyanCompositeOp,
-      CopyGreenCompositeOp,
-      CopyMagentaCompositeOp,
-      CopyOpacityCompositeOp,
-      CopyRedCompositeOp,
-      CopyYellowCompositeOp,
-      DarkenCompositeOp,
-      DstAtopCompositeOp,
-      DstCompositeOp,
-      DstInCompositeOp,
-      DstOutCompositeOp,
-      DstOverCompositeOp,
-      DifferenceCompositeOp,
-      DisplaceCompositeOp,
-      DissolveCompositeOp,
-      ExclusionCompositeOp,
-      HardLightCompositeOp,
-      HueCompositeOp,
-      InCompositeOp,
-      LightenCompositeOp,
-      LinearLightCompositeOp,
-      LuminizeCompositeOp,
-      MinusDstCompositeOp,
-      ModulateCompositeOp,
-      MultiplyCompositeOp,
-      OutCompositeOp,
-      OverCompositeOp,
-      OverlayCompositeOp,
-      PlusCompositeOp,
-      ReplaceCompositeOp,
-      SaturateCompositeOp,
-      ScreenCompositeOp,
-      SoftLightCompositeOp,
-      SrcAtopCompositeOp,
-      SrcCompositeOp,
-      SrcInCompositeOp,
-      SrcOutCompositeOp,
-      SrcOverCompositeOp,
-      ModulusSubtractCompositeOp,
-      ThresholdCompositeOp,
-      XorCompositeOp,
-      /* These are new operators, added after the above was last sorted.
-       * The list should be re-sorted only when a new library version is
-       * created.
-       */
-      DivideDstCompositeOp,
-      DistortCompositeOp,
-      BlurCompositeOp,
-      PegtopLightCompositeOp,
-      VividLightCompositeOp,
-      PinLightCompositeOp,
-      LinearDodgeCompositeOp,
-      LinearBurnCompositeOp,
-      MathematicsCompositeOp,
-      DivideSrcCompositeOp,
-      MinusSrcCompositeOp,
-      DarkenIntensityCompositeOp,
-      LightenIntensityCompositeOp
-    } CompositeOperator;
-  )
-
-  STRINGIFY(
-     typedef enum
-     {
-       UndefinedFunction,
-       ArcsinFunction,
-       ArctanFunction,
-       PolynomialFunction,
-       SinusoidFunction
-     } MagickFunction;
-  )
-
-  STRINGIFY(
-    typedef enum
-    {
-      UndefinedNoise,
-      UniformNoise,
-      GaussianNoise,
-      MultiplicativeGaussianNoise,
-      ImpulseNoise,
-      LaplacianNoise,
-      PoissonNoise,
-      RandomNoise
-    } NoiseType;
-  )
-
-  STRINGIFY(
-    typedef enum
-  {
-    UndefinedPixelIntensityMethod = 0,
-    AveragePixelIntensityMethod,
-    BrightnessPixelIntensityMethod,
-    LightnessPixelIntensityMethod,
-    Rec601LumaPixelIntensityMethod,
-    Rec601LuminancePixelIntensityMethod,
-    Rec709LumaPixelIntensityMethod,
-    Rec709LuminancePixelIntensityMethod,
-    RMSPixelIntensityMethod,
-    MSPixelIntensityMethod
-  } PixelIntensityMethod;
-  )
-
-  STRINGIFY(
-  typedef enum {
-    BoxWeightingFunction = 0,
-    TriangleWeightingFunction,
-    CubicBCWeightingFunction,
-    HannWeightingFunction,
-    HammingWeightingFunction,
-    BlackmanWeightingFunction,
-    GaussianWeightingFunction,
-    QuadraticWeightingFunction,
-    JincWeightingFunction,
-    SincWeightingFunction,
-    SincFastWeightingFunction,
-    KaiserWeightingFunction,
-    WelshWeightingFunction,
-    BohmanWeightingFunction,
-    LagrangeWeightingFunction,
-    CosineWeightingFunction,
-  } ResizeWeightingFunctionType;
-  )
-
-  STRINGIFY(
-     typedef enum
-     {
-       UndefinedChannel = 0x0000,
-       RedChannel = 0x0001,
-       GrayChannel = 0x0001,
-       CyanChannel = 0x0001,
-       GreenChannel = 0x0002,
-       MagentaChannel = 0x0002,
-       BlueChannel = 0x0004,
-       YellowChannel = 0x0004,
-       BlackChannel = 0x0008,
-       AlphaChannel = 0x0010,
-       OpacityChannel = 0x0010,
-       IndexChannel = 0x0020,             /* Color Index Table? */
-       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
-       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
-       MetaChannel = 0x0100,              /* ???? */
-       CompositeChannels = 0x002F,
-       AllChannels = 0x7ffffff,
-       /*
-         Special purpose channel types.
-         FUTURE: are these needed any more - they are more like hacks
-         SyncChannels for example is NOT a real channel but a 'flag'
-         It really says -- "User has not defined channels"
-         Though it does have extra meaning in the "-auto-level" operator
-       */
-       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
-       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
-       GrayChannels = 0x0400,
-       SyncChannels = 0x20000,    /* channels modified as a single unit */
-       DefaultChannels = AllChannels
-     } ChannelType;  /* must correspond to PixelChannel */
-  )
-
-/*
-  Helper functions.
-*/
-
-OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
-
-  STRINGIFY(
-    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
-    {
-      return((CLQuantum) value);
-    }
-  )
-
-OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
-
-  STRINGIFY(
-    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
-    {
-      return((CLQuantum) (257.0f*value));
-    }
-  )
-
-OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
-
-  STRINGIFY(
-    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
-    {
-      return((CLQuantum) (16843009.0*value));
-    }
-  )
-
-OPENCL_ENDIF()
-
-  STRINGIFY(
-    inline int ClampToCanvas(const int offset,const int range)
-      {
-        return clamp(offset, (int)0, range-1);
-      }
-  )
-
-  STRINGIFY(
-    inline CLQuantum ClampToQuantum(const float value)
-      {
-        return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
-      }
-  )
-
-  STRINGIFY(
-    inline uint ScaleQuantumToMap(CLQuantum value)
-      {
-        if (value >= (CLQuantum) MaxMap)
-          return ((uint)MaxMap);
-        else 
-          return ((uint)value);
-      }
-  )
-
-  STRINGIFY(
-    inline float PerceptibleReciprocal(const float x)
-    {
-      float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
-      return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
-    }
-  )
-
-  STRINGIFY(
-  inline float RoundToUnity(const float value)
-   {
-     return clamp(value,0.0f,1.0f);
-   }
-  )
-
-  STRINGIFY(
-
-  inline unsigned int getPixelIndex(const unsigned int number_channels,
-    const unsigned int columns, const unsigned int x, const unsigned int y)
-  {
-    return (x * number_channels) + (y * columns * number_channels);
-  }
-
-  inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
-  inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
-  inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
-  inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
-
-  inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
-  inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
-  inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
-  inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
-
-  inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
-  inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
-  inline float getBlueF4(float4 p)                      { return p.x; }
-  inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
-
-  inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
-  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
-  inline float getGreenF4(float4 p)                     { return p.y; }
-  inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
-
-  inline CLQuantum getRed(CLPixelType p)                { return p.z; }
-  inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
-  inline float getRedF4(float4 p)                       { return p.z; }
-  inline void setRedF4(float4* p, float value)          { (*p).z = value; }
-
-  inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
-  inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
-  inline float getAlphaF4(float4 p)                     { return p.w; }
-  inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
-
-  inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
-    const ChannelType channel, float *red, float *green, float *blue, float *alpha)
-  {
-    if ((channel & RedChannel) != 0)
-      *red=getPixelRed(p);
-
-    if (number_channels > 2)
-      {
-        if ((channel & GreenChannel) != 0)
-          *green=getPixelGreen(p);
-
-        if ((channel & BlueChannel) != 0)
-          *blue=getPixelBlue(p);
-      }
-
-    if (((number_channels == 4) || (number_channels == 2)) &&
-        ((channel & AlphaChannel) != 0))
-      *alpha=getPixelAlpha(p,number_channels);
-  }
-
-  inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
-    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
-  {
-    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
-
-    float red = 0.0f;
-    float green = 0.0f;
-    float blue = 0.0f;
-    float alpha = 0.0f;
-
-    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
-    return (float4)(red, green, blue, alpha);
-  }
-
-  inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
-    const ChannelType channel, float red, float green, float blue, float alpha)
-  {
-    if ((channel & RedChannel) != 0)
-      setPixelRed(p,ClampToQuantum(red));
-
-    if (number_channels > 2)
-      {
-        if ((channel & GreenChannel) != 0)
-          setPixelGreen(p,ClampToQuantum(green));
-
-        if ((channel & BlueChannel) != 0)
-          setPixelBlue(p,ClampToQuantum(blue));
-      }
-
-    if (((number_channels == 4) || (number_channels == 2)) &&
-        ((channel & AlphaChannel) != 0))
-      setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
-  }
-
-  inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
-    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
-    float4 pixel)
-  {
-    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
-    WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
-  }
-
-  inline float GetPixelIntensity(const unsigned int colorspace,
-    const unsigned int method,float red,float green,float blue)
-  {
-    float intensity;
-
-    if (colorspace == GRAYColorspace)
-      return red;
-
-    switch (method)
-    {
-      case AveragePixelIntensityMethod:
-        {
-          intensity=(red+green+blue)/3.0;
-          break;
-        }
-      case BrightnessPixelIntensityMethod:
-        {
-          intensity=MagickMax(MagickMax(red,green),blue);
-          break;
-        }
-      case LightnessPixelIntensityMethod:
-        {
-          intensity=(MagickMin(MagickMin(red,green),blue)+
-              MagickMax(MagickMax(red,green),blue))/2.0;
-          break;
-        }
-      case MSPixelIntensityMethod:
-        {
-          intensity=(float) (((float) red*red+green*green+blue*blue)/
-              (3.0*QuantumRange));
-          break;
-        }
-      case Rec601LumaPixelIntensityMethod:
-        {
-          /*
-          if (image->colorspace == RGBColorspace)
-          {
-            red=EncodePixelGamma(red);
-            green=EncodePixelGamma(green);
-            blue=EncodePixelGamma(blue);
-          }
-          */
-          intensity=0.298839*red+0.586811*green+0.114350*blue;
-          break;
-        }
-      case Rec601LuminancePixelIntensityMethod:
-        {
-          /*
-          if (image->colorspace == sRGBColorspace)
-          {
-            red=DecodePixelGamma(red);
-            green=DecodePixelGamma(green);
-            blue=DecodePixelGamma(blue);
-          }
-          */
-          intensity=0.298839*red+0.586811*green+0.114350*blue;
-          break;
-        }
-      case Rec709LumaPixelIntensityMethod:
-      default:
-        {
-          /*
-          if (image->colorspace == RGBColorspace)
-          {
-            red=EncodePixelGamma(red);
-            green=EncodePixelGamma(green);
-            blue=EncodePixelGamma(blue);
-          }
-          */
-          intensity=0.212656*red+0.715158*green+0.072186*blue;
-          break;
-        }
-      case Rec709LuminancePixelIntensityMethod:
-        {
-          /*
-          if (image->colorspace == sRGBColorspace)
-          {
-            red=DecodePixelGamma(red);
-            green=DecodePixelGamma(green);
-            blue=DecodePixelGamma(blue);
-          }
-          */
-          intensity=0.212656*red+0.715158*green+0.072186*blue;
-          break;
-        }
-      case RMSPixelIntensityMethod:
-        {
-          intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
-              sqrt(3.0));
-          break;
-        }
-    }
-
-    return intensity;
-  }
-
-  inline int mirrorBottom(int value)
-  {
-      return (value < 0) ? - (value) : value;
-  }
-
-  inline int mirrorTop(int value, int width)
-  {
-      return (value >= width) ? (2 * width - value - 1) : value;
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     A d d N o i s e                                                         %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  /*
-  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
-  This is provided under BSD, full license is with the main package.
-  See http://www.doc.ic.ac.uk/~dt10/research
-  */
-
-  // Pre: a<M, b<M
-  // Post: r=(a+b) mod M
-  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
-  {
-    ulong v=a+b;
-    //if( (v>=M) || (v<a) )
-    if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
-      v=v-M;
-    return v;
-  }
-
-  // Pre: a<M,b<M
-  // Post: r=(a*b) mod M
-  // This could be done more efficently, but it is portable, and should
-  // be easy to understand. It can be replaced with any of the better
-  // modular multiplication algorithms (for example if you know you have
-  // double precision available or something).
-  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
-  {
-    ulong r=0;
-    while(a!=0){
-      if(a&1)
-        r=MWC_AddMod64(r,b,M);
-      b=MWC_AddMod64(b,b,M);
-      a=a>>1;
-    }
-    return r;
-  }
-
-  // Pre: a<M, e>=0
-  // Post: r=(a^b) mod M
-  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
-  // most architectures
-  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
-  {
-    ulong sqr=a, acc=1;
-    while(e!=0){
-      if(e&1)
-        acc=MWC_MulMod64(acc,sqr,M);
-        sqr=MWC_MulMod64(sqr,sqr,M);
-      e=e>>1;
-    }
-    return acc;
-  }
-
-  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
-  {
-    ulong m=MWC_PowMod64(A, distance, M);
-    ulong x=curr.x*(ulong)A+curr.y;
-    x=MWC_MulMod64(x, m, M);
-    return (uint2)((uint)(x/A), (uint)(x%A));
-  }
-
-  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
-  {
-    // This is an arbitrary constant for starting LCG jumping from. I didn't
-    // want to start from 1, as then you end up with the two or three first values
-    // being a bit poor in ones - once you've decided that, one constant is as
-    // good as any another. There is no deep mathematical reason for it, I just
-    // generated a random number.
-    enum{ MWC_BASEID = 4077358422479273989UL };
-
-    ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
-    ulong m=MWC_PowMod64(A, dist, M);
-
-    ulong x=MWC_MulMod64(MWC_BASEID, m, M);
-    return (uint2)((uint)(x/A), (uint)(x%A));
-  }
-
-  //! Represents the state of a particular generator
-  typedef struct{ uint x; uint c; } mwc64x_state_t;
-
-  enum{ MWC64X_A = 4294883355U };
-  enum{ MWC64X_M = 18446383549859758079UL };
-
-  void MWC64X_Step(mwc64x_state_t *s)
-  {
-    uint X=s->x, C=s->c;
-
-    uint Xn=MWC64X_A*X+C;
-    uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
-    uint Cn=mad_hi(MWC64X_A,X,carry);
-
-    s->x=Xn;
-    s->c=Cn;
-  }
-
-  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
-  {
-    uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
-    s->x=tmp.x;
-    s->c=tmp.y;
-  }
-
-  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
-  {
-    uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
-    s->x=tmp.x;
-    s->c=tmp.y;
-  }
-
-  //! Return a 32-bit integer in the range [0..2^32)
-  uint MWC64X_NextUint(mwc64x_state_t *s)
-  {
-    uint res=s->x ^ s->c;
-    MWC64X_Step(s);
-    return res;
-  }
-
-  //
-  // End of MWC64X excerpt
-  //
-
-  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
-  {
-    return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
-  }
-
-  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
-  {
-    float
-      alpha,
-      beta,
-      noise,
-      sigma;
-
-    noise = 0.0f;
-    alpha=mwcReadPseudoRandomValue(r);
-    switch(noise_type)
-    {
-      case UniformNoise:
-      default:
-        {
-          noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
-          break;
-        }
-      case GaussianNoise:
-        {
-          float
-            gamma,
-            tau;
-
-          if (alpha == 0.0f)
-            alpha=1.0f;
-          beta=mwcReadPseudoRandomValue(r);
-          gamma=sqrt(-2.0f*log(alpha));
-          sigma=gamma*cospi((2.0f*beta));
-          tau=gamma*sinpi((2.0f*beta));
-          noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
-          break;
-        }
-      case ImpulseNoise:
-      {
-        if (alpha < (SigmaImpulse/2.0f))
-          noise=0.0f;
-        else
-          if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
-            noise=QuantumRange;
-          else
-            noise=pixel;
-        break;
-      }
-      case LaplacianNoise:
-      {
-        if (alpha <= 0.5f)
-          {
-            if (alpha <= MagickEpsilon)
-              noise=(pixel-QuantumRange);
-            else
-              noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
-                0.5f);
-            break;
-          }
-        beta=1.0f-alpha;
-        if (beta <= (0.5f*MagickEpsilon))
-          noise=(pixel+QuantumRange);
-        else
-          noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
-        break;
-      }
-      case MultiplicativeGaussianNoise:
-      {
-        sigma=1.0f;
-        if (alpha > MagickEpsilon)
-          sigma=sqrt(-2.0f*log(alpha));
-        beta=mwcReadPseudoRandomValue(r);
-        noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
-          cospi((2.0f*beta))/2.0f);
-        break;
-      }
-      case PoissonNoise:
-      {
-        float 
-          poisson;
-        unsigned int i;
-        poisson=exp(-SigmaPoisson*QuantumScale*pixel);
-        for (i=0; alpha > poisson; i++)
-        {
-          beta=mwcReadPseudoRandomValue(r);
-          alpha*=beta;
-        }
-        noise=(QuantumRange*i/SigmaPoisson);
-        break;
-      }
-      case RandomNoise:
-      {
-        noise=(QuantumRange*SigmaRandom*alpha);
-        break;
-      }
-    }
-    return noise;
-  }
-
-  __kernel
-  void AddNoise(const __global CLQuantum *image,
-    const unsigned int number_channels,const ChannelType channel,
-    const unsigned int length,const unsigned int pixelsPerWorkItem,
-    const NoiseType noise_type,const float attenuate,const unsigned int seed0,
-    const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
-    __global CLQuantum *filteredImage)
-  {
-    mwc64x_state_t rng;
-    rng.x = seed0;
-    rng.c = seed1;
-
-    uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
-    uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
-    MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
-
-    uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
-    uint count = pixelsPerWorkItem;
-
-    while (count > 0 && pos < length)
-    {
-      const __global CLQuantum *p = image + pos;
-      __global CLQuantum *q = filteredImage + pos;
-
-      float red;
-      float green;
-      float blue;
-      float alpha;
-
-      ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
-
-      if ((channel & RedChannel) != 0)
-        red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
-
-      if (number_channels > 2)
-      {
-        if ((channel & GreenChannel) != 0)
-          green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
-
-        if ((channel & BlueChannel) != 0)
-          blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
-      }
-
-      if (((number_channels == 4) || (number_channels == 2)) &&
-          ((channel & AlphaChannel) != 0))
-        alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
-
-      WriteChannels(q, number_channels, channel, red, green, blue, alpha);
-
-      pos += (get_local_size(0) * number_channels);
-      count--;
-    }
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%    B l u r                                                                  %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  /*
-  Reduce image noise and reduce detail levels by row
-  */
-  __kernel void BlurRow(const __global CLQuantum *image,
-    const unsigned int number_channels,const ChannelType channel,
-    __constant float *filter,const unsigned int width,
-    const unsigned int imageColumns,const unsigned int imageRows,
-    __local float4 *temp,__global float4 *tempImage)
-  {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int columns = imageColumns;
-
-    const unsigned int radius = (width-1)/2;
-    const int wsize = get_local_size(0);
-    const unsigned int loadSize = wsize+width;
-
-    //group coordinate
-    const int groupX=get_local_size(0)*get_group_id(0);
-
-    //parallel load and clamp
-    for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
-    {
-      int cx = ClampToCanvas(i + groupX - radius, columns);
-      temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
-    }
-
-    // barrier
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // only do the work if this is not a patched item
-    if (get_global_id(0) < columns)
-    {
-      // compute
-      float4 result = (float4) 0;
-
-      int i = 0;
-
-      for ( ; i+7 < width; )
-      {
-        for (int j=0; j < 8; j++)
-          result+=filter[i+j]*temp[i+j+get_local_id(0)];
-        i+=8;
-      }
-
-      for ( ; i < width; i++)
-        result+=filter[i]*temp[i+get_local_id(0)];
-
-      // write back to global
-      tempImage[y*columns+x] = result;
-    }
-  }
-  )
-
-  STRINGIFY(
-  /*
-  Reduce image noise and reduce detail levels by line
-  */
-  __kernel void BlurColumn(const __global float4 *blurRowData,
-    const unsigned int number_channels,const ChannelType channel,
-    __constant float *filter,const unsigned int width,
-    const unsigned int imageColumns,const unsigned int imageRows,
-    __local float4 *temp,__global CLQuantum *filteredImage)
-  {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int columns = imageColumns;
-    const int rows = imageRows;
-
-    unsigned int radius = (width-1)/2;
-    const int wsize = get_local_size(1);
-    const unsigned int loadSize = wsize+width;
-
-    //group coordinate
-    const int groupX=get_local_size(0)*get_group_id(0);
-    const int groupY=get_local_size(1)*get_group_id(1);
-    //notice that get_local_size(0) is 1, so
-    //groupX=get_group_id(0);
-
-    //parallel load and clamp
-    for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
-      temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
-
-    // barrier
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // only do the work if this is not a patched item
-    if (get_global_id(1) < rows)
-    {
-      // compute
-      float4 result = (float4) 0;
-
-      int i = 0;
-
-      for ( ; i+7 < width; )
-      {
-        for (int j=0; j < 8; j++)
-          result+=filter[i+j]*temp[i+j+get_local_id(1)];
-        i+=8;
-      }
-
-      for ( ; i < width; i++)
-        result+=filter[i]*temp[i+get_local_id(1)];
-
-      // write back to global
-      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
-    }
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%    C o m p o s i t e                                                        %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-    inline float ColorDodge(const float Sca,
-      const float Sa,const float Dca,const float Da)
-    {
-      /*
-        Oct 2004 SVG specification.
-      */
-      if ((Sca*Da+Dca*Sa) >= Sa*Da)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
-
-
-      /*
-        New specification, March 2009 SVG specification.  This specification was
-        also wrong of non-overlap cases.
-      */
-      /*
-      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
-        return(Sca*(1.0-Da));
-      if (fabs(Sca-Sa) < MagickEpsilon)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
-      */
-
-      /*
-        Working from first principles using the original formula:
-
-           f(Sc,Dc) = Dc/(1-Sc)
-
-        This works correctly! Looks like the 2004 model was right but just
-        required a extra condition for correct handling.
-      */
-
-      /*
-      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
-        return(Sca*(1.0-Da)+Dca*(1.0-Sa));
-      if (fabs(Sca-Sa) < MagickEpsilon)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      */
-    }
-
-    inline void CompositeColorDodge(const float4 *p,
-      const float4 *q,float4 *composite) {
-
-      float 
-      Da,
-      gamma,
-      Sa;
-
-      Sa=QuantumScale*getAlphaF4(*p);  /* simplify and speed up equations */
-      Da=QuantumScale*getAlphaF4(*q);
-      gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
-      setAlphaF4(composite,QuantumRange*gamma);
-      gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
-      setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
-        getRedF4(*q)*Da,Da));
-      setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
-        getGreenF4(*q)*Da,Da));
-      setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
-        getBlueF4(*q)*Da,Da));
-    }
-  )
-
-  STRINGIFY(
-    inline void MagickPixelCompositePlus(const float4 *p,
-      const float alpha,const float4 *q,
-      const float beta,float4 *composite)
-    {
-      float 
-        gamma;
-
-      float
-        Da,
-        Sa;
-      /*
-        Add two pixels with the given opacities.
-      */
-      Sa=QuantumScale*alpha;
-      Da=QuantumScale*beta;
-      gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
-      setAlphaF4(composite,QuantumRange*gamma);
-      gamma=PerceptibleReciprocal(gamma);
-      setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
-      setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
-      setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
-    }
-  )
-
-  STRINGIFY(
-    inline void MagickPixelCompositeBlend(const float4 *p,
-      const float alpha,const float4 *q,
-      const float beta,float4 *composite)
-    {
-      MagickPixelCompositePlus(p,(float) (alpha*
-      (getAlphaF4(*p))),q,(float) (beta*
-      (getAlphaF4(*q))),composite);
-    }
-  )
-  
-  STRINGIFY(
-    __kernel 
-    void Composite(__global CLPixelType *image,
-                   const unsigned int imageWidth, 
-                   const unsigned int imageHeight,
-                   const __global CLPixelType *compositeImage,
-                   const unsigned int compositeWidth, 
-                   const unsigned int compositeHeight,
-                   const unsigned int compose,
-                   const ChannelType channel, 
-                   const unsigned int matte,
-                   const float destination_dissolve,
-                   const float source_dissolve) {
-
-      uint2 index;
-      index.x = get_global_id(0);
-      index.y = get_global_id(1);
-
-
-      if (index.x >= imageWidth
-        || index.y >= imageHeight) {
-          return;
-      }
-      const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
-      float4 destination;
-      setRedF4(&destination,getRed(inputPixel));
-      setGreenF4(&destination,getGreen(inputPixel));
-      setBlueF4(&destination,getBlue(inputPixel));
-
-      
-      const CLPixelType compositePixel 
-        = compositeImage[index.y*imageWidth+index.x];
-      float4 source;
-      setRedF4(&source,getRed(compositePixel));
-      setGreenF4(&source,getGreen(compositePixel));
-      setBlueF4(&source,getBlue(compositePixel));
-
-      if (matte != 0) {
-        setAlphaF4(&destination,getAlpha(inputPixel));
-        setAlphaF4(&source,getAlpha(compositePixel));
-      }
-      else {
-        setAlphaF4(&destination,1.0f);
-        setAlphaF4(&source,1.0f);
-      }
-
-      float4 composite=destination;
-
-      CompositeOperator op = (CompositeOperator)compose;
-      switch (op) {
-      case ColorDodgeCompositeOp:
-        CompositeColorDodge(&source,&destination,&composite);
-        break;
-      case BlendCompositeOp:
-        MagickPixelCompositeBlend(&source,source_dissolve,&destination,
-            destination_dissolve,&composite);
-        break;
-      default:
-        // unsupported operators
-        break;
-      };
-
-      CLPixelType outputPixel;
-      setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
-      setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
-      setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
-      setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite)));
-      image[index.y*imageWidth+index.x] = outputPixel;
-    }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%    C o n t r a s t                                                          %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-
-  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
-    float3 HueSaturationBrightness;
-    HueSaturationBrightness.x = 0.0f; // Hue
-    HueSaturationBrightness.y = 0.0f; // Saturation
-    HueSaturationBrightness.z = 0.0f; // Brightness
-
-    float r=(float) getRed(pixel);
-    float g=(float) getGreen(pixel);
-    float b=(float) getBlue(pixel);
-
-    float tmin=MagickMin(MagickMin(r,g),b);
-    float tmax=MagickMax(MagickMax(r,g),b);
-
-    if (tmax!=0.0f) {
-      float delta=tmax-tmin;
-      HueSaturationBrightness.y=delta/tmax;
-      HueSaturationBrightness.z=QuantumScale*tmax;
-
-      if (delta != 0.0f) {
-  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
-  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
-        HueSaturationBrightness.x/=6.0f;
-        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
-      }
-    }
-    return HueSaturationBrightness;
-  }
-
-  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
-
-    float hue = HueSaturationBrightness.x;
-    float brightness = HueSaturationBrightness.z;
-    float saturation = HueSaturationBrightness.y;
-   
-    CLPixelType rgb;
-
-    if (saturation == 0.0f) {
-      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
-      setGreen(&rgb,getRed(rgb));
-      setBlue(&rgb,getRed(rgb));
-    }
-    else {
-
-      float h=6.0f*(hue-floor(hue));
-      float f=h-floor(h);
-      float p=brightness*(1.0f-saturation);
-      float q=brightness*(1.0f-saturation*f);
-      float t=brightness*(1.0f-(saturation*(1.0f-f)));
- 
-      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
-      float clamped_t = ClampToQuantum(QuantumRange*t);
-      float clamped_p = ClampToQuantum(QuantumRange*p);
-      float clamped_q = ClampToQuantum(QuantumRange*q);
-      int ih = (int)h;
-      setRed(&rgb, (ih == 1)?clamped_q:
-        (ih == 2 || ih == 3)?clamped_p:
-        (ih == 4)?clamped_t:
-                 clampedBrightness);
- 
-      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
-        (ih == 3)?clamped_q:
-        (ih == 4 || ih == 5)?clamped_p:
-                 clamped_t);
-
-      setBlue(&rgb, (ih == 2)?clamped_t:
-        (ih == 3 || ih == 4)?clampedBrightness:
-        (ih == 5)?clamped_q:
-                 clamped_p);
-    }
-    return rgb;
-  }
-
-  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
-  {
-
-    const int sign = sharpen!=0?1:-1;
-    const int x = get_global_id(0);  
-    const int y = get_global_id(1);
-    const int columns = get_global_size(0);
-    const int c = x + y * columns;
-
-    CLPixelType pixel = im[c];
-    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
-    float brightness = HueSaturationBrightness.z;
-    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
-    brightness = clamp(brightness,0.0f,1.0f);
-    HueSaturationBrightness.z = brightness;
-
-    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
-    filteredPixel.w = pixel.w;
-    im[c] = filteredPixel;
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%    C o n t r a s t S t r e t c h                                            %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-    STRINGIFY(
-    /*
-    */
-    __kernel void Histogram(__global CLPixelType * restrict im,
-      const ChannelType channel, 
-      const unsigned int colorspace,
-      const unsigned int method,
-      __global uint4 * restrict histogram)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-        if ((channel & SyncChannels) != 0)
-        {
-          float red=(float)getRed(im[c]);
-          float green=(float)getGreen(im[c]);
-          float blue=(float)getBlue(im[c]);
-
-          float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
-          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
-          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
-        }
-        else
-        {
-          // for equalizing, we always need all channels?
-          // otherwise something more
-        }
-      }
-    )
-
-    STRINGIFY(
-    /*
-    */
-    __kernel void ContrastStretch(__global CLPixelType * restrict im,
-      const ChannelType channel,  
-      __global CLPixelType * restrict stretch_map,
-      const float4 white, const float4 black)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-
-        uint ePos;
-        CLPixelType oValue, eValue;
-        CLQuantum red, green, blue, alpha;
-
-        //read from global
-        oValue=im[c];
-
-        if ((channel & RedChannel) != 0)
-        {
-          if (getRedF4(white) != getRedF4(black))
-          {
-            ePos = ScaleQuantumToMap(getRed(oValue)); 
-            eValue = stretch_map[ePos];
-            red = getRed(eValue);
-          }
-        }
-
-        if ((channel & GreenChannel) != 0)
-        {
-          if (getGreenF4(white) != getGreenF4(black))
-          {
-            ePos = ScaleQuantumToMap(getGreen(oValue)); 
-            eValue = stretch_map[ePos];
-            green = getGreen(eValue);
-          }
-        }
-
-        if ((channel & BlueChannel) != 0)
-        {
-          if (getBlueF4(white) != getBlueF4(black))
-          {
-            ePos = ScaleQuantumToMap(getBlue(oValue)); 
-            eValue = stretch_map[ePos];
-            blue = getBlue(eValue);
-          }
-        }
-
-        if ((channel & AlphaChannel) != 0)
-        {
-          if (getAlphaF4(white) != getAlphaF4(black))
-          {
-            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
-            eValue = stretch_map[ePos];
-            alpha = getAlpha(eValue);
-          }
-        }
-
-        //write back
-        im[c]=(CLPixelType)(blue, green, red, alpha);
-
-      }
-    )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%    C o n v o l v e                                                          %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-    __kernel 
-    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
-    const unsigned int imageWidth, const unsigned int imageHeight,
-    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
-    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
-
-      int2 blockID;
-      blockID.x = get_global_id(0) / get_local_size(0);
-      blockID.y = get_global_id(1) / get_local_size(1);
-
-      // image area processed by this workgroup
-      int2 imageAreaOrg;
-      imageAreaOrg.x = blockID.x * get_local_size(0);
-      imageAreaOrg.y = blockID.y * get_local_size(1);
-
-      int2 midFilterDimen;
-      midFilterDimen.x = (filterWidth-1)/2;
-      midFilterDimen.y = (filterHeight-1)/2;
-
-      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
-
-      // dimension of the local cache
-      int2 cachedAreaDimen;
-      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
-      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
-
-      // cache the pixels accessed by this workgroup in local memory
-      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
-      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
-      int groupSize = get_local_size(0) * get_local_size(1);
-      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
-
-        int2 cachedAreaIndex;
-        cachedAreaIndex.x = i % cachedAreaDimen.x;
-        cachedAreaIndex.y = i / cachedAreaDimen.x;
-
-        int2 imagePixelIndex;
-        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
-
-        // only support EdgeVirtualPixelMethod through ClampToCanvas
-        // TODO: implement other virtual pixel method
-        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
-        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
-
-        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
-      }
-
-      // cache the filter
-      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
-        filterCache[i] = filter[i];
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-
-
-      int2 imageIndex;
-      imageIndex.x = imageAreaOrg.x + get_local_id(0);
-      imageIndex.y = imageAreaOrg.y + get_local_id(1);
-
-      // if out-of-range, stops here and quit
-      if (imageIndex.x >= imageWidth
-        || imageIndex.y >= imageHeight) {
-          return;
-      }
-
-      int filterIndex = 0;
-      float4 sum = (float4)0.0f;
-      float gamma = 0.0f;
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        int cacheIndexY = get_local_id(1);
-        for (int j = 0; j < filterHeight; j++) {
-          int cacheIndexX = get_local_id(0);
-          for (int i = 0; i < filterWidth; i++) {
-            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
-            float f = filterCache[filterIndex];
-
-            sum.x += f * p.x;
-            sum.y += f * p.y;
-            sum.z += f * p.z; 
-            sum.w += f * p.w;
-
-            gamma += f;
-            filterIndex++;
-            cacheIndexX++;
-          }
-          cacheIndexY++;
-        }
-      }
-      else {
-        int cacheIndexY = get_local_id(1);
-        for (int j = 0; j < filterHeight; j++) {
-          int cacheIndexX = get_local_id(0);
-          for (int i = 0; i < filterWidth; i++) {
-
-            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
-            float alpha = QuantumScale*p.w;
-            float f = filterCache[filterIndex];
-            float g = alpha * f;
-
-            sum.x += g*p.x;
-            sum.y += g*p.y;
-            sum.z += g*p.z;
-            sum.w += f*p.w;
-
-            gamma += g;
-            filterIndex++;
-            cacheIndexX++;
-          }
-          cacheIndexY++;
-        }
-        gamma = PerceptibleReciprocal(gamma);
-        sum.xyz = gamma*sum.xyz;
-      }
-      CLPixelType outputPixel;
-      outputPixel.x = ClampToQuantum(sum.x);
-      outputPixel.y = ClampToQuantum(sum.y);
-      outputPixel.z = ClampToQuantum(sum.z);
-      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
-
-      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
-    }
-  )
-
-  STRINGIFY(
-    __kernel 
-    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
-                  const uint imageWidth, const uint imageHeight,
-                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
-                  const uint matte, const ChannelType channel) {
-
-      int2 imageIndex;
-      imageIndex.x = get_global_id(0);
-      imageIndex.y = get_global_id(1);
-
-      /*
-      unsigned int imageWidth = get_global_size(0);
-      unsigned int imageHeight = get_global_size(1);
-      */
-      if (imageIndex.x >= imageWidth
-          || imageIndex.y >= imageHeight)
-          return;
-
-      int2 midFilterDimen;
-      midFilterDimen.x = (filterWidth-1)/2;
-      midFilterDimen.y = (filterHeight-1)/2;
-
-      int filterIndex = 0;
-      float4 sum = (float4)0.0f;
-      float gamma = 0.0f;
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        for (int j = 0; j < filterHeight; j++) {
-          int2 inputPixelIndex;
-          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
-          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
-          for (int i = 0; i < filterWidth; i++) {
-            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
-            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
-        
-            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
-            float f = filter[filterIndex];
-
-            sum.x += f * p.x;
-            sum.y += f * p.y;
-            sum.z += f * p.z; 
-            sum.w += f * p.w;
-
-            gamma += f;
-
-            filterIndex++;
-          }
-        }
-      }
-      else {
-
-        for (int j = 0; j < filterHeight; j++) {
-          int2 inputPixelIndex;
-          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
-          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
-          for (int i = 0; i < filterWidth; i++) {
-            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
-            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
-        
-            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
-            float alpha = QuantumScale*p.w;
-            float f = filter[filterIndex];
-            float g = alpha * f;
-
-            sum.x += g*p.x;
-            sum.y += g*p.y;
-            sum.z += g*p.z;
-            sum.w += f*p.w;
-
-            gamma += g;
-
-
-            filterIndex++;
-          }
-        }
-        gamma = PerceptibleReciprocal(gamma);
-        sum.xyz = gamma*sum.xyz;
-      }
-
-      CLPixelType outputPixel;
-      outputPixel.x = ClampToQuantum(sum.x);
-      outputPixel.y = ClampToQuantum(sum.y);
-      outputPixel.z = ClampToQuantum(sum.z);
-      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
-
-      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
-    }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     D e s p e c k l e                                                       %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-
-  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
-  , const unsigned int imageWidth, const unsigned int imageHeight
-  , const int2 offset, const int polarity, const int matte) {
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    CLPixelType v = inputImage[y*imageWidth+x];
-
-    int2 neighbor;
-    neighbor.y = y + offset.y;
-    neighbor.x = x + offset.x;
-
-    int2 clampedNeighbor;
-    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
-    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
-
-    CLPixelType r = (clampedNeighbor.x == neighbor.x
-                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
-    :(CLPixelType)0;
-
-    int sv[4];
-    sv[0] = (int)v.x;
-    sv[1] = (int)v.y;
-    sv[2] = (int)v.z;
-    sv[3] = (int)v.w;
-
-    int sr[4];
-    sr[0] = (int)r.x;
-    sr[1] = (int)r.y;
-    sr[2] = (int)r.z;
-    sr[3] = (int)r.w;
-
-    if (polarity > 0) {
-      \n #pragma unroll 4\n
-      for (unsigned int i = 0; i < 4; i++) {
-        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
-      }
-    }
-    else {
-      \n #pragma unroll 4\n
-      for (unsigned int i = 0; i < 4; i++) {
-        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
-      }
-
-    }
-
-    v.x = (CLQuantum)sv[0];
-    v.y = (CLQuantum)sv[1];
-    v.z = (CLQuantum)sv[2];
-
-    if (matte!=0)
-      v.w = (CLQuantum)sv[3];
-
-    outputImage[y*imageWidth+x] = v;
-
-    }
-
-
-  )
-
-  STRINGIFY(
-
-  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
-  , const unsigned int imageWidth, const unsigned int imageHeight
-  , const int2 offset, const int polarity, const int matte) {
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    CLPixelType v = inputImage[y*imageWidth+x];
-
-    int2 neighbor, clampedNeighbor;
-
-    neighbor.y = y + offset.y;
-    neighbor.x = x + offset.x;
-    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
-    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
-
-    CLPixelType r = (clampedNeighbor.x == neighbor.x
-      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
-    :(CLPixelType)0;
-
-
-    neighbor.y = y - offset.y;
-    neighbor.x = x - offset.x;
-    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
-    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
-
-    CLPixelType s = (clampedNeighbor.x == neighbor.x
-      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
-    :(CLPixelType)0;
-
-
-    int sv[4];
-    sv[0] = (int)v.x;
-    sv[1] = (int)v.y;
-    sv[2] = (int)v.z;
-    sv[3] = (int)v.w;
-
-    int sr[4];
-    sr[0] = (int)r.x;
-    sr[1] = (int)r.y;
-    sr[2] = (int)r.z;
-    sr[3] = (int)r.w;
-
-    int ss[4];
-    ss[0] = (int)s.x;
-    ss[1] = (int)s.y;
-    ss[2] = (int)s.z;
-    ss[3] = (int)s.w;
-
-    if (polarity > 0) {
-      \n #pragma unroll 4\n
-      for (unsigned int i = 0; i < 4; i++) {
-        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
-        //
-        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
-        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
-        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
-      }
-    }
-    else {
-      \n #pragma unroll 4\n
-      for (unsigned int i = 0; i < 4; i++) {
-        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
-        //
-        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
-        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
-      }
-    }
-
-    v.x = (CLQuantum)sv[0];
-    v.y = (CLQuantum)sv[1];
-    v.z = (CLQuantum)sv[2];
-
-    if (matte!=0)
-      v.w = (CLQuantum)sv[3];
-
-    outputImage[y*imageWidth+x] = v;
-
-    }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     E q u a l i z e                                                         %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-    STRINGIFY(
-    /*
-    */
-    __kernel void Equalize(__global CLPixelType * restrict im,
-      const ChannelType channel,  
-      __global CLPixelType * restrict equalize_map,
-      const float4 white, const float4 black)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-
-        uint ePos;
-        CLPixelType oValue, eValue;
-        CLQuantum red, green, blue, alpha;
-
-        //read from global
-        oValue=im[c];
-
-        if ((channel & SyncChannels) != 0)
-        {
-          if (getRedF4(white) != getRedF4(black))
-          {
-            ePos = ScaleQuantumToMap(getRed(oValue)); 
-            eValue = equalize_map[ePos];
-            red = getRed(eValue);
-            ePos = ScaleQuantumToMap(getGreen(oValue)); 
-            eValue = equalize_map[ePos];
-            green = getRed(eValue);
-            ePos = ScaleQuantumToMap(getBlue(oValue)); 
-            eValue = equalize_map[ePos];
-            blue = getRed(eValue);
-            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
-            eValue = equalize_map[ePos];
-            alpha = getRed(eValue);
- 
-            //write back
-            im[c]=(CLPixelType)(blue, green, red, alpha);
-          }
-
-        }
-
-        // for equalizing, we always need all channels?
-        // otherwise something more
-
-     }
-    )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     F u n c t i o n                                                         %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  /*
-  apply FunctionImageChannel(braightness-contrast)
-  */
-  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
-    const unsigned int number_parameters,__constant float *parameters)
-  {
-    float result = 0.0f;
-
-    switch (function)
-    {
-    case PolynomialFunction:
-      {
-        for (unsigned int i=0; i < number_parameters; i++)
-          result = result*QuantumScale*pixel + parameters[i];
-        result *= QuantumRange;
-        break;
-      }
-    case SinusoidFunction:
-      {
-        float  freq,phase,ampl,bias;
-        freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-        phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
-        ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
-        bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
-        result = QuantumRange*(ampl*sin(2.0f*MagickPI*
-          (freq*QuantumScale*pixel + phase/360.0f)) + bias);
-        break;
-      }
-    case ArcsinFunction:
-      {
-        float  width,range,center,bias;
-        width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
-        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
-        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
-
-        result = 2.0f/width*(QuantumScale*pixel - center);
-        result = range/MagickPI*asin(result)+bias;
-        result = ( result <= -1.0f ) ? bias - range/2.0f : result;
-        result = ( result >= 1.0f ) ? bias + range/2.0f : result;
-        result *= QuantumRange;
-        break;
-      }
-    case ArctanFunction:
-      {
-        float slope,range,center,bias;
-        slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
-        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
-        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
-        result = MagickPI*slope*(QuantumScale*pixel-center);
-        result = QuantumRange*(range/MagickPI*atan(result) + bias);
-        break;
-      }
-    case UndefinedFunction:
-      break;
-    }
-    return(ClampToQuantum(result));
-  }
-  )
-
-  STRINGIFY(
-  /*
-  Improve brightness / contrast of the image
-  channel : define which channel is improved
-  function : the function called to enchance the brightness contrast
-  number_parameters : numbers of parameters 
-  parameters : the parameter
-  */
-  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
-    const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
-    __constant float *parameters)
-  {
-    const unsigned int x = get_global_id(0);
-    const unsigned int y = get_global_id(1);
-    const unsigned int columns = get_global_size(0);
-    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
-
-    float red;
-    float green;
-    float blue;
-    float alpha;
-
-    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
-
-    if ((channel & RedChannel) != 0)
-      red=ApplyFunction(red, function, number_parameters, parameters);
-
-    if (number_channels > 2)
-      {
-        if ((channel & GreenChannel) != 0)
-          green=ApplyFunction(green, function, number_parameters, parameters);
-
-        if ((channel & BlueChannel) != 0)
-          blue=ApplyFunction(blue, function, number_parameters, parameters);
-      }
-
-    if (((number_channels == 4) || (number_channels == 2)) &&
-        ((channel & AlphaChannel) != 0))
-      alpha=ApplyFunction(alpha, function, number_parameters, parameters);
-
-    WriteChannels(p, number_channels, channel, red, green, blue, alpha);
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     G r a y s c a l e                                                       %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
-    const unsigned int colorspace,const unsigned int method)
-  {
-    const unsigned int x = get_global_id(0);
-    const unsigned int y = get_global_id(1);
-    const unsigned int columns = get_global_size(0);
-    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
-
-    float
-      blue,
-      green,
-      red;
-
-    red=getPixelRed(p);
-    green=getPixelGreen(p);
-    blue=getPixelBlue(p);
-
-    CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
-
-    setPixelRed(p,intensity);
-    setPixelGreen(p,intensity);
-    setPixelBlue(p,intensity);
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     L o c a l C o n t r a s t                                               %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-    STRINGIFY(
-
-      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
-          const int radius, 
-          const int imageWidth,
-          const int imageHeight)
-      {
-        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
-
-        int x = get_local_id(0);
-        int y = get_global_id(1);
-
-        global CLPixelType *src = srcImage + y * imageWidth;
-
-        for (int i = x; i < imageWidth; i += get_local_size(0)) {
-            float sum = 0.0f;
-            float weight = 1.0f;
-
-            int j = i - radius;
-            while ((j + 7) < i) {
-                for (int k = 0; k < 8; ++k) // Unroll 8x
-                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
-                weight += 8.0f;
-                j+=8;
-            }
-            while (j < i) {
-                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
-                weight += 1.0f;
-                ++j;
-            }
-
-            while ((j + 7) < radius + i) {
-                for (int k = 0; k < 8; ++k) // Unroll 8x
-                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
-                weight -= 8.0f;
-                j+=8;
-            }
-            while (j < radius + i) {
-                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
-                weight -= 1.0f;
-                ++j;
-            }
-
-            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
-        }
-      }
-    )
-
-    STRINGIFY(
-      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
-          const int radius, 
-          const float strength,
-          const int imageWidth,
-          const int imageHeight)
-      {
-        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
-
-        int x = get_global_id(0);
-        int y = get_global_id(1);
-
-        if ((x >= imageWidth) || (y >= imageHeight))
-                return;
-
-        global float *src = blurImage + x;
-
-        float sum = 0.0f;
-        float weight = 1.0f;
-
-        int j = y - radius;
-        while ((j + 7) < y) {
-            for (int k = 0; k < 8; ++k) // Unroll 8x
-                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
-            weight += 8.0f;
-            j+=8;
-        }
-        while (j < y) {
-            sum += weight * src[mirrorBottom(j) * imageWidth];
-            weight += 1.0f;
-            ++j;
-        }
-
-        while ((j + 7) < radius + y) {
-            for (int k = 0; k < 8; ++k) // Unroll 8x
-                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
-            weight -= 8.0f;
-            j+=8;
-        }
-        while (j < radius + y) {
-            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
-            weight -= 1.0f;
-            ++j;
-        }
-
-        CLPixelType pixel = srcImage[x + y * imageWidth];
-        float srcVal = dot(RGB, convert_float4(pixel));
-        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
-        mult = (srcVal + mult) / srcVal;
-
-        pixel.x = ClampToQuantum(pixel.x * mult);
-        pixel.y = ClampToQuantum(pixel.y * mult);
-        pixel.z = ClampToQuantum(pixel.z * mult);
-
-        dstImage[x + y * imageWidth] = pixel;
-      }
-    )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     M o d u l a t e                                                         %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-
-  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
-    float *hue, float *saturation, float *lightness)
-  {
-  float
-    c,
-    tmax,
-    tmin;
-
-  /*
-     Convert RGB to HSL colorspace.
-     */
-  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
-  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
-
-  c=tmax-tmin;
-
-  *lightness=(tmax+tmin)/2.0;
-  if (c <= 0.0)
-  {
-    *hue=0.0;
-    *saturation=0.0;
-    return;
-  }
-
-  if (tmax == (QuantumScale*red))
-  {
-    *hue=(QuantumScale*green-QuantumScale*blue)/c;
-    if ((QuantumScale*green) < (QuantumScale*blue))
-      *hue+=6.0;
-  }
-  else
-    if (tmax == (QuantumScale*green))
-      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
-    else
-      *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
-
-  *hue*=60.0/360.0;
-  if (*lightness <= 0.5)
-    *saturation=c/(2.0*(*lightness));
-  else
-    *saturation=c/(2.0-2.0*(*lightness));
-  }
-
-  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
-      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
-  {
-    float
-      b,
-      c,
-      g,
-      h,
-      tmin,
-      r,
-      x;
-
-    /*
-       Convert HSL to RGB colorspace.
-       */
-    h=hue*360.0;
-    if (lightness <= 0.5)
-      c=2.0*lightness*saturation;
-    else
-      c=(2.0-2.0*lightness)*saturation;
-    tmin=lightness-0.5*c;
-    h-=360.0*floor(h/360.0);
-    h/=60.0;
-    x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
-    switch ((int) floor(h) % 6)
-    {
-      case 0:
-        {
-          r=tmin+c;
-          g=tmin+x;
-          b=tmin;
-          break;
-        }
-      case 1:
-        {
-          r=tmin+x;
-          g=tmin+c;
-          b=tmin;
-          break;
-        }
-      case 2:
-        {
-          r=tmin;
-          g=tmin+c;
-          b=tmin+x;
-          break;
-        }
-      case 3:
-        {
-          r=tmin;
-          g=tmin+x;
-          b=tmin+c;
-          break;
-        }
-      case 4:
-        {
-          r=tmin+x;
-          g=tmin;
-          b=tmin+c;
-          break;
-        }
-      case 5:
-        {
-          r=tmin+c;
-          g=tmin;
-          b=tmin+x;
-          break;
-        }
-      default:
-        {
-          r=0.0;
-          g=0.0;
-          b=0.0;
-        }
-    }
-    *red=ClampToQuantum(QuantumRange*r);
-    *green=ClampToQuantum(QuantumRange*g);
-    *blue=ClampToQuantum(QuantumRange*b);
-  }
-
-  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
-    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
-  {
-    float
-      hue,
-      lightness,
-      saturation;
-
-    /*
-    Increase or decrease color lightness, saturation, or hue.
-    */
-    ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
-    hue+=0.5*(0.01*percent_hue-1.0);
-    while (hue < 0.0)
-      hue+=1.0;
-    while (hue >= 1.0)
-      hue-=1.0;
-    saturation*=0.01*percent_saturation;
-    lightness*=0.01*percent_lightness;
-    ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
-  }
-
-  __kernel void Modulate(__global CLPixelType *im, 
-    const float percent_brightness, 
-    const float percent_hue, 
-    const float percent_saturation, 
-    const int colorspace)
-  {
-
-    const int x = get_global_id(0);  
-    const int y = get_global_id(1);
-    const int columns = get_global_size(0);
-    const int c = x + y * columns;
-
-    CLPixelType pixel = im[c];
-
-    CLQuantum
-        blue,
-        green,
-        red;
-
-    red=getRed(pixel);
-    green=getGreen(pixel);
-    blue=getBlue(pixel);
-
-    switch (colorspace)
-    {
-      case HSLColorspace:
-      default:
-        {
-          ModulateHSL(percent_hue, percent_saturation, percent_brightness, 
-              &red, &green, &blue);
-        }
-
-    }
-
-    CLPixelType filteredPixel;
-   
-    setRed(&filteredPixel, red);
-    setGreen(&filteredPixel, green);
-    setBlue(&filteredPixel, blue);
-    filteredPixel.w = pixel.w;
-
-    im[c] = filteredPixel;
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     M o t i o n B l u r                                                     %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-    __kernel 
-    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
-                    const unsigned int imageWidth, const unsigned int imageHeight,
-                    const __global float *filter, const unsigned int width, const __global int2* offset,
-                    const float4 bias,
-                    const ChannelType channel, const unsigned int matte) {
-
-      int2 currentPixel;
-      currentPixel.x = get_global_id(0);
-      currentPixel.y = get_global_id(1);
-
-      if (currentPixel.x >= imageWidth
-          || currentPixel.y >= imageHeight)
-          return;
-
-      float4 pixel;
-      pixel.x = (float)bias.x;
-      pixel.y = (float)bias.y;
-      pixel.z = (float)bias.z;
-      pixel.w = (float)bias.w;
-
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        
-        for (int i = 0; i < width; i++) {
-          // only support EdgeVirtualPixelMethod through ClampToCanvas
-          // TODO: implement other virtual pixel method
-          int2 samplePixel = currentPixel + offset[i];
-          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
-          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
-          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
-
-          pixel.x += (filter[i] * (float)samplePixelValue.x);
-          pixel.y += (filter[i] * (float)samplePixelValue.y);
-          pixel.z += (filter[i] * (float)samplePixelValue.z);
-          pixel.w += (filter[i] * (float)samplePixelValue.w);
-        }
-
-        CLPixelType outputPixel;
-        outputPixel.x = ClampToQuantum(pixel.x);
-        outputPixel.y = ClampToQuantum(pixel.y);
-        outputPixel.z = ClampToQuantum(pixel.z);
-        outputPixel.w = ClampToQuantum(pixel.w);
-        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
-      }
-      else {
-
-        float gamma = 0.0f;
-        for (int i = 0; i < width; i++) {
-          // only support EdgeVirtualPixelMethod through ClampToCanvas
-          // TODO: implement other virtual pixel method
-          int2 samplePixel = currentPixel + offset[i];
-          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
-          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
-
-          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
-
-          float alpha = QuantumScale*samplePixelValue.w;
-          float k = filter[i];
-          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
-          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
-          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
-
-          pixel.w += k * alpha * samplePixelValue.w;
-
-          gamma+=k*alpha;
-        }
-        gamma = PerceptibleReciprocal(gamma);
-        pixel.xyz = gamma*pixel.xyz;
-
-        CLPixelType outputPixel;
-        outputPixel.x = ClampToQuantum(pixel.x);
-        outputPixel.y = ClampToQuantum(pixel.y);
-        outputPixel.z = ClampToQuantum(pixel.z);
-        outputPixel.w = ClampToQuantum(pixel.w);
-        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
-      }
-    }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     R e s i z e                                                             %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  // Based on Box from resize.c
-  float BoxResizeFilter(const float x)
-  {
-    return 1.0f;
-  }
-  )
-    
-  STRINGIFY(
-  // Based on CubicBC from resize.c
-  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
-  {
-    /*
-    Cubic Filters using B,C determined values:
-    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
-    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
-    Spline              B = 1   C = 0    B-Spline Gaussian approximation
-    Hermite             B = 0   C = 0    B-Spline interpolator
-
-    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
-    Graphics Computer Graphics, Volume 22, Number 4, August 1988
-    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
-    Mitchell.pdf.
-
-    Coefficents are determined from B,C values:
-    P0 = (  6 - 2*B       )/6 = coeff[0]
-    P1 =         0
-    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
-    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
-    Q0 = (      8*B +24*C )/6 = coeff[3]
-    Q1 = (    -12*B -48*C )/6 = coeff[4]
-    Q2 = (      6*B +30*C )/6 = coeff[5]
-    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
-
-    which are used to define the filter:
-
-    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
-    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
-
-    which ensures function is continuous in value and derivative (slope).
-    */
-    if (x < 1.0)
-      return(resizeFilterCoefficients[0]+x*(x*
-      (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
-    if (x < 2.0)
-      return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
-      (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
-    return(0.0);
-  }
-  )
-
-  STRINGIFY(
-  float Sinc(const float x)
-  {
-    if (x != 0.0f)
-    {
-      const float alpha=(float) (MagickPI*x);
-      return sinpi(x)/alpha;
-    }
-    return(1.0f);
-  }
-  )
-
-  STRINGIFY(
-  float Triangle(const float x)
-  {
-    /*
-    1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
-    a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
-    for Sinc().
-    */
-    return ((x<1.0f)?(1.0f-x):0.0f);
-  }
-  )
-
-
-  STRINGIFY(
-  float Hann(const float x)
-  {
-    /*
-    Cosine window function:
-      0.5+0.5*cos(pi*x).
-    */
-    const float cosine=cos((MagickPI*x));
-    return(0.5f+0.5f*cosine);
-  }
-  )
-
-  STRINGIFY(
-  float Hamming(const float x)
-  {
-    /*
-      Offset cosine window function:
-       .54 + .46 cos(pi x).
-    */
-    const float cosine=cos((MagickPI*x));
-    return(0.54f+0.46f*cosine);
-  }
-  )
-
-  STRINGIFY(
-  float Blackman(const float x)
-  {
-    /*
-      Blackman: 2nd order cosine windowing function:
-        0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
-
-      Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
-      five flops.
-    */
-    const float cosine=cos((MagickPI*x));
-    return(0.34f+cosine*(0.5f+cosine*0.16f));
-  }
-  )
-
-  STRINGIFY(
-  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
-  {
-    switch (filterType)
-    {
-    /* Call Sinc even for SincFast to get better precision on GPU 
-       and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
-    case SincWeightingFunction:
-    case SincFastWeightingFunction:
-      return Sinc(x);
-    case CubicBCWeightingFunction:
-      return CubicBC(x,filterCoefficients);
-    case BoxWeightingFunction:
-      return BoxResizeFilter(x);
-    case TriangleWeightingFunction:
-      return Triangle(x);
-    case HannWeightingFunction:
-      return Hann(x);
-    case HammingWeightingFunction:
-      return Hamming(x);
-    case BlackmanWeightingFunction:
-      return Blackman(x);
-
-    default:
-      return 0.0f;
-    }
-  }
-  )
-
-
-  STRINGIFY(
-  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
-           , const ResizeWeightingFunctionType resizeWindowType
-           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
-  {
-    float scale;
-    float xBlur = fabs(x/resizeFilterBlur);
-    if (resizeWindowSupport < MagickEpsilon
-        || resizeWindowType == BoxWeightingFunction)
-    {
-      scale = 1.0f;
-    }
-    else
-    {
-      scale = resizeFilterScale;
-      scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
-    }
-    float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
-    return weight;
-  }
-
-  )
-
-  ;
-  const char *accelerateKernels2 =
-
-  STRINGIFY(
-
-  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
-    return (numWorkItems/pixelPerWorkgroup);
-  }
-
-  // returns the index of the pixel for the current workitem to compute.
-  // returns -1 if this workitem doesn't need to participate in any computation
-  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
-    const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
-    int pixelIndex = itemID/numWorkItemsPerPixel;
-    pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
-    return pixelIndex;
-  }
- 
-  )
-
-  STRINGIFY(
-  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
-    void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
-      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
-      const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
-      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
-      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
-      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
-      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
-      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
-  {
-    // calculate the range of resized image pixels computed by this workgroup
-    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
-    const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
-    const unsigned int actualNumPixelToCompute = stopX - startX;
-
-    // calculate the range of input image pixels to cache
-    float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
-    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
-    scale = PerceptibleReciprocal(scale);
-
-    const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
-    const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
-
-    // cache the input pixels into local memory
-    const unsigned int y = get_global_id(1);
-    const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
-    const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
-    event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
-    wait_group_events(1, &e);
-
-    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
-    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
-    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
-    {
-      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
-      const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
-      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
-
-      // determine which resized pixel computed by this workitem
-      const unsigned int itemID = get_local_id(0);
-      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
-
-      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
-
-      float4 filteredPixel = (float4)0.0f;
-      float density = 0.0f;
-      float gamma = 0.0f;
-      // -1 means this workitem doesn't participate in the computation
-      if (pixelIndex != -1)
-      {
-        // x coordinated of the resized pixel computed by this workitem
-        const int x = chunkStartX + pixelIndex;
-
-        // calculate how many steps required for this pixel
-        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
-        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
-        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
-        const unsigned int n = stop - start;
-
-        // calculate how many steps this workitem will contribute
-        unsigned int numStepsPerWorkItem = n / numItems;
-        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
-
-        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
-        if (startStep < n)
-        {
-          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
-
-          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
-
-          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
-          {
-            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
-              (ResizeWeightingFunctionType) resizeFilterType,
-              (ResizeWeightingFunctionType) resizeWindowType,
-              resizeFilterScale, resizeFilterWindowSupport,
-              resizeFilterBlur, scale*(start + i - bisect + 0.5));
-
-            float4 cp = (float4) 0;
-
-            __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
-            cp.x = (float) *(p);
-            if (number_channels > 2)
-            {
-              cp.y = (float) *(p + 1);
-              cp.z = (float) *(p + 2);
-            }
-
-            if (alpha_index != 0)
-            {
-              cp.w = (float) *(p + alpha_index);
-
-              float alpha = weight * QuantumScale * cp.w;
-
-              filteredPixel.x += alpha * cp.x;
-              filteredPixel.y += alpha * cp.y;
-              filteredPixel.z += alpha * cp.z;
-              filteredPixel.w += weight * cp.w;
-              gamma += alpha;
-            }
-            else
-              filteredPixel += ((float4) weight)*cp;
-
-            density += weight;
-          }
-        }
-      }
-
-      // initialize the accumulators to zero
-      if (itemID < actualNumPixelInThisChunk) {
-        outputPixelCache[itemID] = (float4)0.0f;
-        densityCache[itemID] = 0.0f;
-        if (alpha_index != 0)
-          gammaCache[itemID] = 0.0f;
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-
-      // accumulatte the filtered pixel value and the density
-      for (unsigned int i = 0; i < numItems; i++) {
-        if (pixelIndex != -1) {
-          if (itemID%numItems == i) {
-            outputPixelCache[pixelIndex]+=filteredPixel;
-            densityCache[pixelIndex]+=density;
-            if (alpha_index != 0)
-              gammaCache[pixelIndex]+=gamma;
-          }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-
-      if (itemID < actualNumPixelInThisChunk)
-      {
-        float4 filteredPixel = outputPixelCache[itemID];
-
-        float gamma = 0.0f;
-        if (alpha_index != 0)
-          gamma = gammaCache[itemID];
-
-        float density = densityCache[itemID];
-        if ((density != 0.0f) && (density != 1.0f))
-        {
-          density = PerceptibleReciprocal(density);
-          filteredPixel *= (float4) density;
-          gamma *= density;
-        }
-
-        if (alpha_index != 0)
-        {
-          gamma = PerceptibleReciprocal(gamma);
-          filteredPixel.x *= gamma;
-          filteredPixel.y *= gamma;
-          filteredPixel.z *= gamma;
-        }
-
-        WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel);
-      }
-    }
-  }
-  )
-
-
-  STRINGIFY(
- __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
-    void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
-      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
-      const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
-      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
-      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
-      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
-      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
-      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
-  {
-    // calculate the range of resized image pixels computed by this workgroup
-    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
-    const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
-    const unsigned int actualNumPixelToCompute = stopY - startY;
-
-    // calculate the range of input image pixels to cache
-    float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
-    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
-    scale = PerceptibleReciprocal(scale);
-
-    const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
-    const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
-
-    // cache the input pixels into local memory
-    const unsigned int x = get_global_id(0);
-    unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
-    unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
-    unsigned int stride = inputColumns * number_channels;
-    for (unsigned int i = 0; i < number_channels; i++)
-    {
-      event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
-      wait_group_events(1,&e);
-    }
-
-    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
-    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
-    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
-    {
-      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
-      const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
-      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
-
-      // determine which resized pixel computed by this workitem
-      const unsigned int itemID = get_local_id(1);
-      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
-
-      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
-
-      float4 filteredPixel = (float4)0.0f;
-      float density = 0.0f;
-      float gamma = 0.0f;
-      // -1 means this workitem doesn't participate in the computation
-      if (pixelIndex != -1)
-      {
-        // x coordinated of the resized pixel computed by this workitem
-        const int y = chunkStartY + pixelIndex;
-
-        // calculate how many steps required for this pixel
-        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
-        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
-        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
-        const unsigned int n = stop - start;
-
-        // calculate how many steps this workitem will contribute
-        unsigned int numStepsPerWorkItem = n / numItems;
-        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
-
-        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
-        if (startStep < n)
-        {
-          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
-
-          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
-          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
-          {
-            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
-              (ResizeWeightingFunctionType) resizeFilterType,
-              (ResizeWeightingFunctionType) resizeWindowType,
-              resizeFilterScale, resizeFilterWindowSupport,
-              resizeFilterBlur, scale*(start + i - bisect + 0.5));
-
-            float4 cp = (float4)0.0f;
-
-            __local CLQuantum *p = inputImageCache + cacheIndex;
-            cp.x = (float) *(p);
-            if (number_channels > 2)
-            {
-              cp.y = (float) *(p + rangeLength);
-              cp.z = (float) *(p + (rangeLength * 2));
-            }
-
-            if (alpha_index != 0)
-            {
-              cp.w = (float) *(p + (rangeLength * alpha_index));
-
-              float alpha = weight * QuantumScale * cp.w;
-
-              filteredPixel.x += alpha * cp.x;
-              filteredPixel.y += alpha * cp.y;
-              filteredPixel.z += alpha * cp.z;
-              filteredPixel.w += weight * cp.w;
-              gamma += alpha;
-            }
-            else
-              filteredPixel += ((float4) weight)*cp;
-
-            density += weight;
-          }
-        }
-      }
-
-      // initialize the accumulators to zero
-      if (itemID < actualNumPixelInThisChunk) {
-        outputPixelCache[itemID] = (float4)0.0f;
-        densityCache[itemID] = 0.0f;
-        if (alpha_index != 0)
-          gammaCache[itemID] = 0.0f;
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-
-      // accumulatte the filtered pixel value and the density
-      for (unsigned int i = 0; i < numItems; i++) {
-        if (pixelIndex != -1) {
-          if (itemID%numItems == i) {
-            outputPixelCache[pixelIndex]+=filteredPixel;
-            densityCache[pixelIndex]+=density;
-            if (alpha_index != 0)
-              gammaCache[pixelIndex]+=gamma;
-          }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-
-      if (itemID < actualNumPixelInThisChunk)
-      {
-        float4 filteredPixel = outputPixelCache[itemID];
-
-        float gamma = 0.0f;
-        if (alpha_index != 0)
-          gamma = gammaCache[itemID];
-
-        float density = densityCache[itemID];
-        if ((density != 0.0f) && (density != 1.0f))
-        {
-          density = PerceptibleReciprocal(density);
-          filteredPixel *= (float4) density;
-          gamma *= density;
-        }
-
-        if (alpha_index != 0)
-        {
-          gamma = PerceptibleReciprocal(gamma);
-          filteredPixel.x *= gamma;
-          filteredPixel.y *= gamma;
-          filteredPixel.z *= gamma;
-        }
-
-        WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel);
-      }
-    }
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     R o t a t i o n a l B l u r                                             %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels,
-    const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta,
-    __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage)
-  {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int columns = get_global_size(0);
-    const int rows = get_global_size(1);
-    unsigned int step = 1;
-    float center_x = (float) x - blurCenter.x;
-    float center_y = (float) y - blurCenter.y;
-    float radius = hypot(center_x, center_y);
-
-    float blur_radius = hypot(blurCenter.x, blurCenter.y);
-
-    if (radius > MagickEpsilon)
-    {
-      step = (unsigned int) (blur_radius / radius);
-      if (step == 0)
-        step = 1;
-      if (step >= cossin_theta_size)
-        step = cossin_theta_size-1;
-    }
-
-    float4 result = bias;
-
-    float normalize = 0.0f;
-    float gamma = 0.0f;
-
-    for (unsigned int i=0; i<cossin_theta_size; i+=step)
-    {
-      int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
-      int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
-
-      float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, channel);
-
-      if ((number_channels == 4) || (number_channels == 2))
-      {
-        float alpha = (float)(QuantumScale*pixel.w);
-
-        gamma += alpha;
-
-        result.x += alpha * pixel.x;
-        result.y += alpha * pixel.y;
-        result.z += alpha * pixel.z;
-        result.w += pixel.w;
-      }
-      else
-        result += pixel;
-
-      normalize += 1.0f;
-    }
-
-    normalize = PerceptibleReciprocal(normalize);
-
-    if ((number_channels == 4) || (number_channels == 2))
-    {
-      gamma = PerceptibleReciprocal(gamma);
-      result.x *= gamma;
-      result.y *= gamma;
-      result.z *= gamma;
-      result.w *= normalize;
-    }
-    else
-      result *= normalize;
-
-    WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
-  }
-  )
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     U n s h a r p M a s k                                                   %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-*/
-
-  STRINGIFY(
-  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
-    const __global float4 *blurRowData,const unsigned int number_channels,
-    const ChannelType channel,const unsigned int columns,
-    const unsigned int rows,__local float4* cachedData,
-    __local float* cachedFilter,const __global float *filter,
-    const unsigned int width,const float gain, const float threshold,
-    __global CLQuantum *filteredImage)
-  {
-    const unsigned int radius = (width-1)/2;
-
-    // cache the pixel shared by the workgroup
-    const int groupX = get_group_id(0);
-    const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
-    const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
-
-    if ((groupStartY >= 0) && (groupStopY < rows))
-    {
-      event_t e = async_work_group_strided_copy(cachedData,
-        blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
-      wait_group_events(1,&e);
-    }
-    else
-    {
-      for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
-        cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
-
-      barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    // cache the filter as well
-    event_t e = async_work_group_copy(cachedFilter,filter,width,0);
-    wait_group_events(1,&e);
-
-    // only do the work if this is not a patched item
-    const int cy = get_global_id(1);
-
-    if (cy < rows)
-    {
-      float4 blurredPixel = (float4) 0.0f;
-
-      int i = 0;
-
-      for ( ; i+7 < width; )
-      {
-        for (int j=0; j < 8; j++, i++)
-          blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
-      }
-
-      for ( ; i < width; i++)
-        blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
-
-      float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
-      float4 outputPixel = inputImagePixel - blurredPixel;
-
-      float quantumThreshold = QuantumRange*threshold;
-
-      int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
-      outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
-
-      //write back
-      WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
-    }
-  }
-  )
-
-  STRINGIFY(
-  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
-    const ChannelType channel,__constant float *filter,const unsigned int width,
-    const unsigned int columns,const unsigned int rows,__local float4 *pixels,
-    const float gain,const float threshold,__global CLQuantum *filteredImage)
-  {
-    const unsigned int x = get_global_id(0);
-    const unsigned int y = get_global_id(1);
-
-    const unsigned int radius = (width - 1) / 2;
-
-    int row = y - radius;
-    int baseRow = get_group_id(1) * get_local_size(1) - radius;
-    int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
-
-    while (row < endRow) {
-      int srcy = (row < 0) ? -row : row; // mirror pad
-      srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
-
-      float4 value = 0.0f;
-
-      int ix = x - radius;
-      int i = 0;
-
-      while (i + 7 < width) {
-        for (int j = 0; j < 8; ++j) { // unrolled
-          int srcx = ix + j;
-          srcx = (srcx < 0) ? -srcx : srcx;
-          srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
-          value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
-        }
-        ix += 8;
-        i += 8;
-      }
-
-      while (i < width) {
-        int srcx = (ix < 0) ? -ix : ix; // mirror pad
-        srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
-        value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
-        ++i;
-        ++ix;
-      }
-      pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
-      row += get_local_size(1);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    const int px = get_local_id(0);
-    const int py = get_local_id(1);
-    const int prp = get_local_size(0);
-    float4 value = (float4)(0.0f);
-
-    int i = 0;
-    while (i + 7 < width) {
-      for (int j = 0; j < 8; ++j) // unrolled
-        value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
-      i += 8;
-    }
-    while (i < width) {
-      value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
-      ++i;
-    }
-
-    float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
-    float4 diff = srcPixel - value;
-
-    float quantumThreshold = QuantumRange*threshold;
-
-    int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
-    value = select(srcPixel + diff * gain, srcPixel, mask);
-
-    if ((x < columns) && (y < rows))
-      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
-  }
-  )
-
-  STRINGIFY(
-    __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
-    void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
-      const unsigned int number_channels,const unsigned int max_channels,
-      const float threshold,const int passes,const unsigned int imageWidth,
-      const unsigned int imageHeight)
-  {
-    const int pad = (1 << (passes - 1));
-    const int tileSize = 64;
-    const int tileRowPixels = 64;
-    const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
-
-    CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
-
-    local float buffer[64 * 64];
-
-    int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
-    int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
-
-    for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
-      int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
-                (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
-
-      for (int channel = 0; channel < max_channels; ++channel)
-        stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
-    }
-
-    for (int channel = 0; channel < max_channels; ++channel) {
-      // Load LDS
-      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
-        buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
-
-      // Process
-
-      float tmp[16];
-      float accum[16];
-      float pixel;
-
-      for (int i = 0; i < 16; i++)
-        accum[i]=0.0f;
-
-      for (int pass = 0; pass < passes; ++pass) {
-        const int radius = 1 << pass;
-        const int x = get_local_id(0);
-        const float thresh = threshold * noise[pass];
-
-        // Apply horizontal hat
-        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
-          const int offset = i * tileRowPixels;
-          if (pass == 0)
-            tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
-          pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
-          barrier(CLK_LOCAL_MEM_FENCE);
-          buffer[x + offset] = pixel;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Apply vertical hat
-        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
-          pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
-          float delta = tmp[i / 4] - pixel;
-          tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
-          if (delta < -thresh)
-            delta += thresh;
-          else if (delta > thresh)
-            delta -= thresh;
-          else
-            delta = 0;
-          accum[i / 4] += delta;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (pass < passes - 1)
-          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
-            buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
-        else  // last pass
-          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
-            accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-
-      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
-        stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
-
-      barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    // Write from stage to output
-
-    if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
-      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
-        if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
-          int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
-          for (int channel = 0; channel < max_channels; ++channel) {
-            dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
-          }
-        }
-      }
-    }
-  }
-  )
-
-  ;
-
-#endif // MAGICKCORE_OPENCL_SUPPORT
-
  extern MagickPrivate Image
    *AccelerateAddNoiseImage(const Image*,const NoiseType,ExceptionInfo *),
    *AccelerateBlurImage(const Image *,const double,const double,ExceptionInfo *),
diff --git a/MagickCore/accelerate.c b/MagickCore/accelerate.c

index 507012867092e44eabe169070fbd11b3498f81ff..92daf1e8b89f06eef75116280e2a542908c18875 100644 (file)
--- a/MagickCore/accelerate.c
+++ b/MagickCore/accelerate.c
@@ -41,6 +41,7 @@ Include declarations.
  */
  #include "MagickCore/studio.h"
  #include "MagickCore/accelerate-private.h"
+#include "MagickCore/accelerate-kernels-private.h"
  #include "MagickCore/artifact.h"
  #include "MagickCore/cache.h"
  #include "MagickCore/cache-private.h"
author	dirk <dirk@git.imagemagick.org>
	Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)
committer	dirk <dirk@git.imagemagick.org>
	Sun, 29 May 2016 19:26:07 +0000 (21:26 +0200)
MagickCore/accelerate-kernels-private.h	[new file with mode: 0644]	patch \| blob
MagickCore/accelerate-private.h		patch \| blob \| history
MagickCore/accelerate.c		patch \| blob \| history