]> granicus.if.org Git - imagemagick/commitdiff
Placed the accelerated methods in alphabetic order.
authordirk <dirk@git.imagemagick.org>
Thu, 12 Nov 2015 20:36:56 +0000 (21:36 +0100)
committerdirk <dirk@git.imagemagick.org>
Thu, 12 Nov 2015 20:36:56 +0000 (21:36 +0100)
MagickCore/accelerate-private.h
MagickCore/accelerate.c
MagickCore/accelerate.h

index 5710fa7670aebfa9bdd3998eb654d2d492a3636f..ce5297e608cb720459fde725a2bf7b11ad45f281 100644 (file)
 extern "C" {
 #endif
 
-
 #if defined(MAGICKCORE_OPENCL_SUPPORT)
 
+/*
+  Define declarations.
+*/
 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
 #define OPENCL_ELIF(...)       "\n #""elif " #__VA_ARGS__ " \n"
 #define OPENCL_ELSE()          "\n #""else " " \n"
@@ -33,6 +35,10 @@ extern "C" {
 #define OPENCL_IF(...)         "\n #""if " #__VA_ARGS__ " \n"
 #define STRINGIFY(...) #__VA_ARGS__ "\n"
 
+/*
+  Typedef declarations.
+*/
+
 typedef struct _FloatPixelPacket
 {
   MagickRealType
@@ -44,6 +50,189 @@ typedef struct _FloatPixelPacket
 } FloatPixelPacket;
 
 const char* accelerateKernels =
+
+/*
+  Define declarations.
+*/
+  OPENCL_DEFINE(GetPixelAlpha(pixel), pixel.w)
+  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
+  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
+  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
+  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
+  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
+  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
+  OPENCL_DEFINE(SigmaRandom, (attenuate))
+  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
+
+/*
+  Typedef declarations.
+*/
+  STRINGIFY(
+    typedef enum
+  {
+    UndefinedColorspace,
+    RGBColorspace,            /* Linear RGB colorspace */
+    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
+    TransparentColorspace,
+    OHTAColorspace,
+    LabColorspace,
+    XYZColorspace,
+    YCbCrColorspace,
+    YCCColorspace,
+    YIQColorspace,
+    YPbPrColorspace,
+    YUVColorspace,
+    CMYKColorspace,           /* negated linear RGB with black separated */
+    sRGBColorspace,           /* Default: non-linear sRGB colorspace */
+    HSBColorspace,
+    HSLColorspace,
+    HWBColorspace,
+    Rec601LumaColorspace,
+    Rec601YCbCrColorspace,
+    Rec709LumaColorspace,
+    Rec709YCbCrColorspace,
+    LogColorspace,
+    CMYColorspace,            /* negated linear RGB colorspace */
+    LuvColorspace,
+    HCLColorspace,
+    LCHColorspace,            /* alias for LCHuv */
+    LMSColorspace,
+    LCHabColorspace,          /* Cylindrical (Polar) Lab */
+    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
+    scRGBColorspace,
+    HSIColorspace,
+    HSVColorspace,            /* alias for HSB */
+    HCLpColorspace,
+    YDbDrColorspace
+  } ColorspaceType;
+  )
+
+  STRINGIFY(
+    typedef enum
+    {
+      UndefinedCompositeOp,
+      NoCompositeOp,
+      ModulusAddCompositeOp,
+      AtopCompositeOp,
+      BlendCompositeOp,
+      BumpmapCompositeOp,
+      ChangeMaskCompositeOp,
+      ClearCompositeOp,
+      ColorBurnCompositeOp,
+      ColorDodgeCompositeOp,
+      ColorizeCompositeOp,
+      CopyBlackCompositeOp,
+      CopyBlueCompositeOp,
+      CopyCompositeOp,
+      CopyCyanCompositeOp,
+      CopyGreenCompositeOp,
+      CopyMagentaCompositeOp,
+      CopyOpacityCompositeOp,
+      CopyRedCompositeOp,
+      CopyYellowCompositeOp,
+      DarkenCompositeOp,
+      DstAtopCompositeOp,
+      DstCompositeOp,
+      DstInCompositeOp,
+      DstOutCompositeOp,
+      DstOverCompositeOp,
+      DifferenceCompositeOp,
+      DisplaceCompositeOp,
+      DissolveCompositeOp,
+      ExclusionCompositeOp,
+      HardLightCompositeOp,
+      HueCompositeOp,
+      InCompositeOp,
+      LightenCompositeOp,
+      LinearLightCompositeOp,
+      LuminizeCompositeOp,
+      MinusDstCompositeOp,
+      ModulateCompositeOp,
+      MultiplyCompositeOp,
+      OutCompositeOp,
+      OverCompositeOp,
+      OverlayCompositeOp,
+      PlusCompositeOp,
+      ReplaceCompositeOp,
+      SaturateCompositeOp,
+      ScreenCompositeOp,
+      SoftLightCompositeOp,
+      SrcAtopCompositeOp,
+      SrcCompositeOp,
+      SrcInCompositeOp,
+      SrcOutCompositeOp,
+      SrcOverCompositeOp,
+      ModulusSubtractCompositeOp,
+      ThresholdCompositeOp,
+      XorCompositeOp,
+      /* These are new operators, added after the above was last sorted.
+       * The list should be re-sorted only when a new library version is
+       * created.
+       */
+      DivideDstCompositeOp,
+      DistortCompositeOp,
+      BlurCompositeOp,
+      PegtopLightCompositeOp,
+      VividLightCompositeOp,
+      PinLightCompositeOp,
+      LinearDodgeCompositeOp,
+      LinearBurnCompositeOp,
+      MathematicsCompositeOp,
+      DivideSrcCompositeOp,
+      MinusSrcCompositeOp,
+      DarkenIntensityCompositeOp,
+      LightenIntensityCompositeOp
+    } CompositeOperator;
+  )
+
+  STRINGIFY(
+     typedef enum
+     {
+       UndefinedFunction,
+       PolynomialFunction,
+       SinusoidFunction,
+       ArcsinFunction,
+       ArctanFunction
+     } MagickFunction;
+  )
+
+  STRINGIFY(
+    typedef enum
+  {
+    UndefinedPixelIntensityMethod = 0,
+    AveragePixelIntensityMethod,
+    BrightnessPixelIntensityMethod,
+    LightnessPixelIntensityMethod,
+    Rec601LumaPixelIntensityMethod,
+    Rec601LuminancePixelIntensityMethod,
+    Rec709LumaPixelIntensityMethod,
+    Rec709LuminancePixelIntensityMethod,
+    RMSPixelIntensityMethod,
+    MSPixelIntensityMethod
+  } PixelIntensityMethod;
+  )
+
+  STRINGIFY(
+  typedef enum {
+    BoxWeightingFunction = 0,
+    TriangleWeightingFunction,
+    CubicBCWeightingFunction,
+    HannWeightingFunction,
+    HammingWeightingFunction,
+    BlackmanWeightingFunction,
+    GaussianWeightingFunction,
+    QuadraticWeightingFunction,
+    JincWeightingFunction,
+    SincWeightingFunction,
+    SincFastWeightingFunction,
+    KaiserWeightingFunction,
+    WelshWeightingFunction,
+    BohmanWeightingFunction,
+    LagrangeWeightingFunction,
+    CosineWeightingFunction,
+  } ResizeWeightingFunctionType;
+  )
+
   STRINGIFY(
      typedef enum
      {
@@ -79,7 +268,11 @@ const char* accelerateKernels =
      } ChannelType;  /* must correspond to PixelChannel */
   )
 
-  OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
+/*
+  Helper functions.
+*/
+
+OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
 
   STRINGIFY(
     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
@@ -88,7 +281,7 @@ const char* accelerateKernels =
     }
   )
 
-  OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
+OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
 
   STRINGIFY(
     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
@@ -97,7 +290,7 @@ const char* accelerateKernels =
     }
   )
 
-  OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
+OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
 
   STRINGIFY(
     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
@@ -106,8 +299,7 @@ const char* accelerateKernels =
     }
   )
 
-  OPENCL_ENDIF()
-
+OPENCL_ENDIF()
 
   STRINGIFY(
     inline int ClampToCanvas(const int offset,const int range)
@@ -148,64 +340,6 @@ const char* accelerateKernels =
     }
   )
 
-  OPENCL_DEFINE(GetPixelAlpha(pixel),pixel.w)
-
-  STRINGIFY(
-  typedef enum
-  {
-    UndefinedPixelIntensityMethod = 0,
-    AveragePixelIntensityMethod,
-    BrightnessPixelIntensityMethod,
-    LightnessPixelIntensityMethod,
-    Rec601LumaPixelIntensityMethod,
-    Rec601LuminancePixelIntensityMethod,
-    Rec709LumaPixelIntensityMethod,
-    Rec709LuminancePixelIntensityMethod,
-    RMSPixelIntensityMethod,
-    MSPixelIntensityMethod
-  } PixelIntensityMethod;
-  )
-
-  STRINGIFY(
-  typedef enum
-  {
-    UndefinedColorspace,
-    RGBColorspace,            /* Linear RGB colorspace */
-    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
-    TransparentColorspace,
-    OHTAColorspace,
-    LabColorspace,
-    XYZColorspace,
-    YCbCrColorspace,
-    YCCColorspace,
-    YIQColorspace,
-    YPbPrColorspace,
-    YUVColorspace,
-    CMYKColorspace,           /* negared linear RGB with black separated */
-    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
-    HSBColorspace,
-    HSLColorspace,
-    HWBColorspace,
-    Rec601LumaColorspace,
-    Rec601YCbCrColorspace,
-    Rec709LumaColorspace,
-    Rec709YCbCrColorspace,
-    LogColorspace,
-    CMYColorspace,            /* negated linear RGB colorspace */
-    LuvColorspace,
-    HCLColorspace,
-    LCHColorspace,            /* alias for LCHuv */
-    LMSColorspace,
-    LCHabColorspace,          /* Cylindrical (Polar) Lab */
-    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
-    scRGBColorspace,
-    HSIColorspace,
-    HSVColorspace,            /* alias for HSB */
-    HCLpColorspace,
-    YDbDrColorspace
-  } ColorspaceType;
-  )
-
   STRINGIFY(
   inline float RoundToUnity(const float value)
    {
@@ -338,589 +472,562 @@ const char* accelerateKernels =
   }
   )
 
-  STRINGIFY(
-    __kernel 
-    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
-    const unsigned int imageWidth, const unsigned int imageHeight,
-    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
-    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
-
-      int2 blockID;
-      blockID.x = get_group_id(0);
-      blockID.y = get_group_id(1);
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A d d N o i s e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-      // image area processed by this workgroup
-      int2 imageAreaOrg;
-      imageAreaOrg.x = blockID.x * get_local_size(0);
-      imageAreaOrg.y = blockID.y * get_local_size(1);
+    STRINGIFY(
 
-      int2 midFilterDimen;
-      midFilterDimen.x = (filterWidth-1)/2;
-      midFilterDimen.y = (filterHeight-1)/2;
+/*
+Part of MWC64X by David Thomas, dt10@imperial.ac.uk
+This is provided under BSD, full license is with the main package.
+See http://www.doc.ic.ac.uk/~dt10/research
+*/
 
-      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
+// Pre: a<M, b<M
+// Post: r=(a+b) mod M
+ulong MWC_AddMod64(ulong a, ulong b, ulong M)
+{
+       ulong v=a+b;
+       //if( (v>=M) || (v<a) )
+       if( (v>=M) || (convert_float(v) < convert_float(a)) )   // workaround for what appears to be an optimizer bug.
+               v=v-M;
+       return v;
+}
 
-      // dimension of the local cache
-      int2 cachedAreaDimen;
-      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
-      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
+// Pre: a<M,b<M
+// Post: r=(a*b) mod M
+// This could be done more efficiently, but it is portable, and should
+// be easy to understand. It can be replaced with any of the better
+// modular multiplication algorithms (for example if you know you have
+// double precision available or something).
+ulong MWC_MulMod64(ulong a, ulong b, ulong M)
+{      
+       ulong r=0;
+       while(a!=0){
+               if(a&1)
+                       r=MWC_AddMod64(r,b,M);
+               b=MWC_AddMod64(b,b,M);
+               a=a>>1;
+       }
+       return r;
+}
 
-      // cache the pixels accessed by this workgroup in local memory
-      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
-      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
-      int groupSize = get_local_size(0) * get_local_size(1);
-      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
 
-        int2 cachedAreaIndex;
-        cachedAreaIndex.x = i % cachedAreaDimen.x;
-        cachedAreaIndex.y = i / cachedAreaDimen.x;
+// Pre: a<M, e>=0
+// Post: r=(a^e) mod M
+// This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
+// most architectures
+ulong MWC_PowMod64(ulong a, ulong e, ulong M)
+{
+       ulong sqr=a, acc=1;
+       while(e!=0){
+               if(e&1)
+                       acc=MWC_MulMod64(acc,sqr,M);
+               sqr=MWC_MulMod64(sqr,sqr,M);
+               e=e>>1;
+       }
+       return acc;
+}
 
-        int2 imagePixelIndex;
-        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
+uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
+{
+       ulong m=MWC_PowMod64(A, distance, M);
+       ulong x=curr.x*(ulong)A+curr.y;
+       x=MWC_MulMod64(x, m, M);
+       return (uint2)((uint)(x/A), (uint)(x%A));
+}
 
-        // only support EdgeVirtualPixelMethod through ClampToCanvas
-        // TODO: implement other virtual pixel method
-        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
-        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
+uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
+{
+       // This is an arbitrary constant for starting LCG jumping from. I didn't
+       // want to start from 1, as then you end up with the two or three first values
+       // being a bit poor in ones - once you've decided that, one constant is as
+       // good as any other. There is no deep mathematical reason for it, I just
+       // generated a random number.
+       enum{ MWC_BASEID = 4077358422479273989UL };
+       
+       ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
+       ulong m=MWC_PowMod64(A, dist, M);
+       
+       ulong x=MWC_MulMod64(MWC_BASEID, m, M);
+       return (uint2)((uint)(x/A), (uint)(x%A));
+}
 
-        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
-      }
+//! Represents the state of a particular generator
+typedef struct{ uint x; uint c; } mwc64x_state_t;
 
-      // cache the filter
-      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
-        filterCache[i] = filter[i];
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
+enum{ MWC64X_A = 4294883355U };
+enum{ MWC64X_M = 18446383549859758079UL };
 
+void MWC64X_Step(mwc64x_state_t *s)
+{
+       uint X=s->x, C=s->c;
+       
+       uint Xn=MWC64X_A*X+C;
+       uint carry=(uint)(Xn<C);                                // The (Xn<C) will be zero or one for scalar
+       uint Cn=mad_hi(MWC64X_A,X,carry);  
+       
+       s->x=Xn;
+       s->c=Cn;
+}
 
-      int2 imageIndex;
-      imageIndex.x = imageAreaOrg.x + get_local_id(0);
-      imageIndex.y = imageAreaOrg.y + get_local_id(1);
+void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
+{
+       uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
+       s->x=tmp.x;
+       s->c=tmp.y;
+}
 
-      // if out-of-range, stops here and quit
-      if (imageIndex.x >= imageWidth
-        || imageIndex.y >= imageHeight) {
-          return;
-      }
+void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
+{
+       uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
+       s->x=tmp.x;
+       s->c=tmp.y;
+}
 
-      int filterIndex = 0;
-      float4 sum = (float4)0.0f;
-      float gamma = 0.0f;
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        int cacheIndexY = get_local_id(1);
-        for (int j = 0; j < filterHeight; j++) {
-          int cacheIndexX = get_local_id(0);
-          for (int i = 0; i < filterWidth; i++) {
-            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
-            float f = filterCache[filterIndex];
+//! Return a 32-bit integer in the range [0..2^32)
+uint MWC64X_NextUint(mwc64x_state_t *s)
+{
+       uint res=s->x ^ s->c;
+       MWC64X_Step(s);
+       return res;
+}
 
-            sum.x += f * p.x;
-            sum.y += f * p.y;
-            sum.z += f * p.z; 
-            sum.w += f * p.w;
+//
+// End of MWC64X excerpt
+//
 
-            gamma += f;
-            filterIndex++;
-            cacheIndexX++;
-          }
-          cacheIndexY++;
-        }
-      }
-      else {
-        int cacheIndexY = get_local_id(1);
-        for (int j = 0; j < filterHeight; j++) {
-          int cacheIndexX = get_local_id(0);
-          for (int i = 0; i < filterWidth; i++) {
+  typedef enum
+  {
+    UndefinedNoise,
+    UniformNoise,
+    GaussianNoise,
+    MultiplicativeGaussianNoise,
+    ImpulseNoise,
+    LaplacianNoise,
+    PoissonNoise,
+    RandomNoise
+  } NoiseType;
 
-            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
-            float alpha = QuantumScale*p.w;
-            float f = filterCache[filterIndex];
-            float g = alpha * f;
+  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
+       return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff);     // normalized to 1.0
+  }
 
-            sum.x += g*p.x;
-            sum.y += g*p.y;
-            sum.z += g*p.z;
-            sum.w += f*p.w;
+  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
+    float 
+      alpha,
+      beta,
+      noise,
+      sigma;
 
-            gamma += g;
-            filterIndex++;
-            cacheIndexX++;
-          }
-          cacheIndexY++;
-        }
-        gamma = PerceptibleReciprocal(gamma);
-        sum.xyz = gamma*sum.xyz;
+    noise = 0.0f;
+    alpha=mwcReadPseudoRandomValue(r);
+    switch(noise_type) {
+    case UniformNoise:
+    default:
+      {
+        noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
+        break;
       }
-      CLPixelType outputPixel;
-      outputPixel.x = ClampToQuantum(sum.x);
-      outputPixel.y = ClampToQuantum(sum.y);
-      outputPixel.z = ClampToQuantum(sum.z);
-      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
-
-      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
-    }
-  )
-
-  STRINGIFY(
-    __kernel 
-    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
-                  const uint imageWidth, const uint imageHeight,
-                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
-                  const uint matte, const ChannelType channel) {
+    case GaussianNoise:
+      {
+        float
+          gamma,
+          tau;
 
-      int2 imageIndex;
-      imageIndex.x = get_global_id(0);
-      imageIndex.y = get_global_id(1);
+        if (alpha == 0.0f)
+          alpha=1.0f;
+        beta=mwcReadPseudoRandomValue(r);
+        gamma=sqrt(-2.0f*log(alpha));
+        sigma=gamma*cospi((2.0f*beta));
+        tau=gamma*sinpi((2.0f*beta));
+        noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
+                      QuantumRange*TauGaussian*tau);
+        break;
+      }
 
-      /*
-      unsigned int imageWidth = get_global_size(0);
-      unsigned int imageHeight = get_global_size(1);
-      */
-      if (imageIndex.x >= imageWidth
-          || imageIndex.y >= imageHeight)
-          return;
 
-      int2 midFilterDimen;
-      midFilterDimen.x = (filterWidth-1)/2;
-      midFilterDimen.y = (filterHeight-1)/2;
-
-      int filterIndex = 0;
-      float4 sum = (float4)0.0f;
-      float gamma = 0.0f;
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        for (int j = 0; j < filterHeight; j++) {
-          int2 inputPixelIndex;
-          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
-          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
-          for (int i = 0; i < filterWidth; i++) {
-            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
-            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
-        
-            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
-            float f = filter[filterIndex];
-
-            sum.x += f * p.x;
-            sum.y += f * p.y;
-            sum.z += f * p.z; 
-            sum.w += f * p.w;
-
-            gamma += f;
-
-            filterIndex++;
-          }
+    case ImpulseNoise:
+    {
+      if (alpha < (SigmaImpulse/2.0f))
+        noise=0.0f;
+      else
+        if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
+          noise=(float)QuantumRange;
+        else
+          noise=(float)pixel;
+      break;
+    }
+    case LaplacianNoise:
+    {
+      if (alpha <= 0.5f)
+        {
+          if (alpha <= MagickEpsilon)
+            noise=(float) (pixel-QuantumRange);
+          else
+            noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
+              0.5f);
+          break;
         }
+      beta=1.0f-alpha;
+      if (beta <= (0.5f*MagickEpsilon))
+        noise=(float) (pixel+QuantumRange);
+      else
+        noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
+      break;
+    }
+    case MultiplicativeGaussianNoise:
+    {
+      sigma=1.0f;
+      if (alpha > MagickEpsilon)
+        sigma=sqrt(-2.0f*log(alpha));
+      beta=mwcReadPseudoRandomValue(r);
+      noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
+        cospi((float) (2.0f*beta))/2.0f);
+      break;
+    }
+    case PoissonNoise:
+    {
+      float 
+        poisson;
+      unsigned int i;
+      poisson=exp(-SigmaPoisson*QuantumScale*pixel);
+      for (i=0; alpha > poisson; i++)
+      {
+        beta=mwcReadPseudoRandomValue(r);
+        alpha*=beta;
       }
-      else {
-
-        for (int j = 0; j < filterHeight; j++) {
-          int2 inputPixelIndex;
-          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
-          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
-          for (int i = 0; i < filterWidth; i++) {
-            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
-            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
-        
-            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
-            float alpha = QuantumScale*p.w;
-            float f = filter[filterIndex];
-            float g = alpha * f;
-
-            sum.x += g*p.x;
-            sum.y += g*p.y;
-            sum.z += g*p.z;
-            sum.w += f*p.w;
-
-            gamma += g;
-
+      noise=(float) (QuantumRange*i/SigmaPoisson);
+      break;
+    }
+    case RandomNoise:
+    {
+      noise=(float) (QuantumRange*SigmaRandom*alpha);
+      break;
+    }
 
-            filterIndex++;
-          }
-        }
-        gamma = PerceptibleReciprocal(gamma);
-        sum.xyz = gamma*sum.xyz;
-      }
+    };
+    return noise;
+  }
 
-      CLPixelType outputPixel;
-      outputPixel.x = ClampToQuantum(sum.x);
-      outputPixel.y = ClampToQuantum(sum.y);
-      outputPixel.z = ClampToQuantum(sum.z);
-      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
+  __kernel
+  void AddNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
+                    ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
+                    ,const ChannelType channel 
+                    ,const NoiseType noise_type, const float attenuate
+                    ,const unsigned int seed0, const unsigned int seed1
+                                       ,const unsigned int numRandomNumbersPerPixel) {
 
-      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
-    }
-  )
+       mwc64x_state_t rng;
+       rng.x = seed0;
+       rng.c = seed1;
 
-  STRINGIFY(
-     typedef enum
-     {
-       UndefinedFunction,
-       PolynomialFunction,
-       SinusoidFunction,
-       ArcsinFunction,
-       ArctanFunction
-     } MagickFunction;
-  )
+       uint span = pixelsPerWorkItem * numRandomNumbersPerPixel;       // length of RNG substream each workitem will use
+       uint offset = span * get_local_size(0) * get_group_id(0);       // offset of this workgroup's RNG substream (in master stream);
 
-  STRINGIFY(
+       MWC64X_SeedStreams(&rng, offset, span);                                         // Seed the RNG streams
 
-    /*
-    apply FunctionImageChannel(braightness-contrast)
-    */
-    CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
-        const unsigned int number_parameters,
-        __constant float *parameters)
-      {
-        float4 result = (float4) 0.0f;
-        switch (function)
-        {
-        case PolynomialFunction:
-          {
-            for (unsigned int i=0; i < number_parameters; i++)
-              result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
-            result *= (float4)QuantumRange;
-            break;
-          }
-        case SinusoidFunction:
-          {
-            float  freq,phase,ampl,bias;
-            freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-            phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
-            ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
-            bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
-            result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
-              (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
-            result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
-              (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
-            result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
-              (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
-            result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
-              (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
-            break;
-          }
-        case ArcsinFunction:
-          {
-            float  width,range,center,bias;
-            width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
-            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
-            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+       uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0);   // pixel to process
 
-            result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
-            result.x = range/MagickPI*asin(result.x)+bias;
-            result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
-            result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
+       uint count = pixelsPerWorkItem;
 
-            result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
-            result.y = range/MagickPI*asin(result.y)+bias;
-            result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
-            result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
+       while (count > 0) {
+               if (pos < inputPixelCount) {
+                       CLPixelType p = inputImage[pos];
 
-            result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
-            result.z = range/MagickPI*asin(result.z)+bias;
-            result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
-            result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
+                       if ((channel&RedChannel)!=0) {
+                         setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
+                       }
+    
+                       if ((channel&GreenChannel)!=0) {
+                         setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
+                       }
 
+                       if ((channel&BlueChannel)!=0) {
+                         setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
+                       }
 
-            result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
-            result.w = range/MagickPI*asin(result.w)+bias;
-            result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
-            result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
+                       if ((channel & AlphaChannel) != 0) {
+                         setAlpha(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getAlpha(p),noise_type,attenuate)));
+                       }
 
-            result *= (float4)QuantumRange;
-            break;
-          }
-        case ArctanFunction:
-          {
-            float slope,range,center,bias;
-            slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
-            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
-            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
-            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
-            result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
-            result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
-            break;
-          }
-        case UndefinedFunction:
-          break;
-        }
-        return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
-          ClampToQuantum(result.z), ClampToQuantum(result.w));
-      }
-    )
+                       filteredImage[pos] = p;
+                       //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
+               }
+               pos += get_local_size(0);
+               --count;
+       }
+  }
+  )
 
-    STRINGIFY(
-    /*
-    Improve brightness / contrast of the image
-    channel : define which channel is improved
-    function : the function called to enchance the brightness contrast
-    number_parameters : numbers of parameters 
-    parameters : the parameter
-    */
-    __kernel void FunctionImage(__global CLPixelType *im,
-                                        const ChannelType channel, const MagickFunction function,
-                                        const unsigned int number_parameters, __constant float *parameters)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-        im[c] = ApplyFunction(im[c], function, number_parameters, parameters); 
-      }
-    )
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    B l u r                                                                  %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
     STRINGIFY(
-    /*
-    */
-    __kernel void Stretch(__global CLPixelType * restrict im,
-      const ChannelType channel,  
-      __global CLPixelType * restrict stretch_map,
-      const float4 white, const float4 black)
+      /*
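+      Column (vertical) pass of the sectioned blur: reduces noise and detail along each image column.
+      blurRowData : input float4 pixels (advanced by imageColumns*radius*section)   filtered_im : output pixels (advanced by imageColumns*offsetRows)
+      filter : convolution kernel   width : convolution kernel size
+      channel : channel selector (not referenced in the kernel body)
+      temp : local scratch buffer; get_local_size(1)+width entries are stored in it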
+      */
+      __kernel void BlurSectionColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+                                const ChannelType channel, __constant float *filter,
+                                const unsigned int width, 
+                                const unsigned int imageColumns, const unsigned int imageRows,
+                                __local float4 *temp, 
+                                const unsigned int offsetRows, const unsigned int section)
       {
         const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
+        const int y = get_global_id(1);
 
-        uint ePos;
-        CLPixelType oValue, eValue;
-        CLQuantum red, green, blue, alpha;
+        //const int columns = get_global_size(0);
+        //const int rows = get_global_size(1);  
+        const int columns = imageColumns;  
+        const int rows = imageRows;  
 
-        //read from global
-        oValue=im[c];
+        unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(1);  
+        const unsigned int loadSize = wsize+width;
 
-        if ((channel & RedChannel) != 0)
-        {
-          if (getRedF4(white) != getRedF4(black))
-          {
-            ePos = ScaleQuantumToMap(getRed(oValue)); 
-            eValue = stretch_map[ePos];
-            red = getRed(eValue);
-          }
-        }
+        //group coordinate
+        const int groupX=get_local_size(0)*get_group_id(0);
+        const int groupY=get_local_size(1)*get_group_id(1);
+        //notice that get_local_size(0) is 1, so
+        //groupX=get_group_id(0);
+       
+        // offset the input data
+        blurRowData += imageColumns * radius * section;
 
-        if ((channel & GreenChannel) != 0)
+        //parallel load and clamp
+        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
         {
-          if (getGreenF4(white) != getGreenF4(black))
-          {
-            ePos = ScaleQuantumToMap(getGreen(oValue)); 
-            eValue = stretch_map[ePos];
-            green = getGreen(eValue);
-          }
+          int pos = ClampToCanvasWithHalo(i+groupY-radius, rows, radius, section) * columns + groupX;
+          temp[i] = *(blurRowData+pos);
         }
+        
+        // barrier        
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        if ((channel & BlueChannel) != 0)
+        // only do the work if this is not a patched item
+        if (get_global_id(1) < rows)
         {
-          if (getBlueF4(white) != getBlueF4(black))
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+          
+          for ( ; i+UFACTOR < width; ) 
           {
-            ePos = ScaleQuantumToMap(getBlue(oValue)); 
-            eValue = stretch_map[ePos];
-            blue = getBlue(eValue);
+            \n #pragma unroll UFACTOR \n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*temp[i+get_local_id(1)];
+            }
           }
-        }
-
-        if ((channel & AlphaChannel) != 0)
-        {
-          if (getAlphaF4(white) != getAlphaF4(black))
+          for ( ; i < width; i++)
           {
-            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
-            eValue = stretch_map[ePos];
-            alpha = getAlpha(eValue);
+            result+=filter[i]*temp[i+get_local_id(1)];
           }
-        }
 
-        //write back
-        im[c]=(CLPixelType)(blue, green, red, alpha);
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // offset the output data
+          filtered_im += imageColumns * offsetRows;
+
+          // write back to global
+          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
+        }
 
       }
     )
 
     STRINGIFY(
-    /*
-    */
-    __kernel void Equalize(__global CLPixelType * restrict im,
-      const ChannelType channel,  
-      __global CLPixelType * restrict equalize_map,
-      const float4 white, const float4 black)
+      /*
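+      Row (horizontal) pass of the sectioned blur: reduces noise and detail along each image row.
+      im : input pixels (advanced by imageColumns*(offsetRows-radius*section))   filtered_im : output float4 pixels
+      filter : convolution kernel   width : convolution kernel size
+      channel : channel selector (not referenced in the kernel body)
+      temp : local scratch buffer; get_local_size(0)+width entries are stored in it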
+      */
+      __kernel void BlurSectionRow(__global CLPixelType *im, __global float4 *filtered_im,
+                         const ChannelType channel, __constant float *filter,
+                         const unsigned int width, 
+                         const unsigned int imageColumns, const unsigned int imageRows,
+                         __local CLPixelType *temp, 
+                         const unsigned int offsetRows, const unsigned int section)
       {
         const int x = get_global_id(0);  
         const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-
-        uint ePos;
-        CLPixelType oValue, eValue;
-        CLQuantum red, green, blue, alpha;
-
-        //read from global
-        oValue=im[c];
 
-        if ((channel & SyncChannels) != 0)
-        {
-          if (getRedF4(white) != getRedF4(black))
-          {
-            ePos = ScaleQuantumToMap(getRed(oValue)); 
-            eValue = equalize_map[ePos];
-            red = getRed(eValue);
-            ePos = ScaleQuantumToMap(getGreen(oValue)); 
-            eValue = equalize_map[ePos];
-            green = getRed(eValue);
-            ePos = ScaleQuantumToMap(getBlue(oValue)); 
-            eValue = equalize_map[ePos];
-            blue = getRed(eValue);
-            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
-            eValue = equalize_map[ePos];
-            alpha = getRed(eValue);
-            //write back
-            im[c]=(CLPixelType)(blue, green, red, alpha);
-          }
+        const int columns = imageColumns;  
 
-        }
+        const unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(0);  
+        const unsigned int loadSize = wsize+width;
 
-        // for equalizing, we always need all channels?
-        // otherwise something more
+        //group coordinate
+        const int groupX=get_local_size(0)*get_group_id(0);
+        const int groupY=get_local_size(1)*get_group_id(1);
 
-     }
-    )
+        //offset the input data, assuming section is 0, 1 
+        im += imageColumns * (offsetRows - radius * section);
 
-    STRINGIFY(
-    /*
-    */
-    __kernel void Histogram(__global CLPixelType * restrict im,
-      const ChannelType channel, 
-      const int method,
-      const int colorspace,
-      __global uint4 * restrict histogram)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
-        const int columns = get_global_size(0);  
-        const int c = x + y * columns;
-        if ((channel & SyncChannels) != 0)
-        {
-          float intensity = GetPixelIntensity(method, colorspace,im[c]);
-          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
-          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
-        }
-        else
+        //parallel load and clamp
+        for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
         {
-          // for equalizing, we always need all channels?
-          // otherwise something more
-        }
-      }
-    )
-
-    STRINGIFY(
-      inline int mirrorBottom(int value)
-      {
-          return (value < 0) ? - (value) : value;
-      }
-      inline int mirrorTop(int value, int width)
-      {
-          return (value >= width) ? (2 * width - value - 1) : value;
-      }
+          //int cx = ClampToCanvas(groupX+i, columns);
+          temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
 
-      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
-          const int radius, 
-          const int imageWidth,
-          const int imageHeight)
-      {
-        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
+          /*if (0 && y==0 && get_group_id(1) == 0)
+          {
+            printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
+          }*/
+        }
 
-        int x = get_local_id(0);
-        int y = get_global_id(1);
+        // barrier        
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        global CLPixelType *src = srcImage + y * imageWidth;
+        // only do the work if this is not a patched item
+        if (get_global_id(0) < columns) 
+        {
+          // compute
+          float4 result = (float4) 0;
 
-        for (int i = x; i < imageWidth; i += get_local_size(0)) {
-            float sum = 0.0f;
-            float weight = 1.0f;
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
 
-            int j = i - radius;
-            while ((j + 7) < i) {
-                for (int k = 0; k < 8; ++k) // Unroll 8x
-                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
-                weight += 8.0f;
-                j+=8;
-            }
-            while (j < i) {
-                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
-                weight += 1.0f;
-                ++j;
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR\n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
             }
+          }
 
-            while ((j + 7) < radius + i) {
-                for (int k = 0; k < 8; ++k) // Unroll 8x
-                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
-                weight -= 8.0f;
-                j+=8;
-            }
-            while (j < radius + i) {
-                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
-                weight -= 1.0f;
-                ++j;
-            }
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
 
-            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
+          // write back to global
+          filtered_im[y*columns+x] = result;
         }
+
       }
     )
 
     STRINGIFY(
-      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
-          const int radius, 
-          const float strength,
-          const int imageWidth,
-          const int imageHeight)
+      /*
+      Column (vertical) pass of the blur: reduces noise and detail along each image column.
+      blurRowData : input float4 pixels   filtered_im : output pixels
+      filter : convolution kernel   width : convolution kernel size
+      channel : channel selector (not referenced in the kernel body)
+      temp : local scratch buffer; get_local_size(1)+width entries are stored in it
+      */
+      __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+                                const ChannelType channel, __constant float *filter,
+                                const unsigned int width, 
+                                const unsigned int imageColumns, const unsigned int imageRows,
+                                __local float4 *temp)
       {
-        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);
 
-        int x = get_global_id(0);
-        int y = get_global_id(1);
+        //const int columns = get_global_size(0);
+        //const int rows = get_global_size(1);  
+        const int columns = imageColumns;  
+        const int rows = imageRows;  
 
-        if ((x >= imageWidth) || (y >= imageHeight))
-                return;
+        unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(1);  
+        const unsigned int loadSize = wsize+width;
 
-        global float *src = blurImage + x;
+        //group coordinate
+        const int groupX=get_local_size(0)*get_group_id(0);
+        const int groupY=get_local_size(1)*get_group_id(1);
+        //notice that get_local_size(0) is 1, so
+        //groupX=get_group_id(0);
+        
+        //parallel load and clamp
+        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+        {
+          temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
+        }
+        
+        // barrier        
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        float sum = 0.0f;
-        float weight = 1.0f;
+        // only do the work if this is not a patched item
+        if (get_global_id(1) < rows)
+        {
+          // compute
+          float4 result = (float4) 0;
 
-        int j = y - radius;
-        while ((j + 7) < y) {
-            for (int k = 0; k < 8; ++k) // Unroll 8x
-                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
-            weight += 8.0f;
-            j+=8;
-        }
-        while (j < y) {
-            sum += weight * src[mirrorBottom(j) * imageWidth];
-            weight += 1.0f;
-            ++j;
-        }
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+          
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*temp[i+get_local_id(1)];
+            }
+          }
 
-        while ((j + 7) < radius + y) {
-            for (int k = 0; k < 8; ++k) // Unroll 8x
-                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
-            weight -= 8.0f;
-            j+=8;
-        }
-        while (j < radius + y) {
-            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
-            weight -= 1.0f;
-            ++j;
-        }
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*temp[i+get_local_id(1)];
+          }
 
-        CLPixelType pixel = srcImage[x + y * imageWidth];
-        float srcVal = dot(RGB, convert_float4(pixel));
-        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
-        mult = (srcVal + mult) / srcVal;
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
 
-        pixel.x = ClampToQuantum(pixel.x * mult);
-        pixel.y = ClampToQuantum(pixel.y * mult);
-        pixel.z = ClampToQuantum(pixel.z * mult);
+          // write back to global
+          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
+        }
 
-        dstImage[x + y * imageWidth] = pixel;
       }
     )
 
@@ -988,552 +1095,678 @@ const char* accelerateKernels =
           }*/
         }
 
-        // barrier        
-        barrier(CLK_LOCAL_MEM_FENCE);
+        // barrier        
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // only do the work if this is not a patched item
+        if (get_global_id(0) < columns) 
+        {
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR\n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+            }
+          }
+
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // write back to global
+          filtered_im[y*columns+x] = result;
+        }
+      }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o m p o s i t e                                                        %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+    inline float ColorDodge(const float Sca,
+      const float Sa,const float Dca,const float Da)
+    {
+      /*
+        Color-dodge blend of a single channel, Oct 2004 SVG specification.
+        Sca and Dca are the source and destination channel values already
+        multiplied by their alphas Sa and Da (that is how CompositeColorDodge
+        calls this helper).  Only the two return statements directly below
+        are live code; every later block in this function is commented out.
+        When Sca == Sa and Dca*Sa >= 0 the first branch is taken, so the
+        division by (Sa-Sca) is not reached in that case.
+        NOTE(review): the 1.0 literals are unsuffixed; confirm whether the
+        program is built so that they are treated as single precision.
+      */
+      if ((Sca*Da+Dca*Sa) >= Sa*Da)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
+
+
+      /*
+        New specification, March 2009 SVG specification.  This specification was
+        also wrong for non-overlap cases.
+      */
+      /*
+      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
+        return(Sca*(1.0-Da));
+      if (fabs(Sca-Sa) < MagickEpsilon)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
+      */
+
+      /*
+        Working from first principles using the original formula:
+
+           f(Sc,Dc) = Dc/(1-Sc)
+
+        This works correctly! Looks like the 2004 model was right but just
+        required an extra condition for correct handling.
+      */
+
+      /*
+      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
+        return(Sca*(1.0-Da)+Dca*(1.0-Sa));
+      if (fabs(Sca-Sa) < MagickEpsilon)
+        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      */
+    }
+
+    inline void CompositeColorDodge(const float4 *p,
+      const float4 *q,float4 *composite) { /* p: source pixel, q: destination pixel (argument order used by the Composite kernel); all four channels of *composite are written */
+
+      float 
+      Da,
+      gamma,
+      Sa;
+
+      Sa=QuantumScale*getAlphaF4(*p);  /* simplify and speed up equations */
+      Da=QuantumScale*getAlphaF4(*q);  /* both alphas are multiplied by QuantumScale before blending */
+      gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
+      setAlphaF4(composite,QuantumRange*gamma);  /* result alpha, multiplied back by QuantumRange */
+      gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);  /* gamma now holds QuantumRange/alpha; a near-zero alpha is replaced by MagickEpsilon to avoid dividing by ~0 */
+      setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
+        getRedF4(*q)*Da,Da));  /* each channel is scaled and multiplied by its alpha before ColorDodge */
+      setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
+        getGreenF4(*q)*Da,Da));
+      setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
+        getBlueF4(*q)*Da,Da));
+    }
+  )
+
+  STRINGIFY(
+    inline void MagickPixelCompositePlus(const float4 *p,
+      const float alpha,const float4 *q,
+      const float beta,float4 *composite)
+    {
+      float 
+        gamma;
+
+      float
+        Da,
+        Sa;
+      /*
+        Add two pixels with the given opacities: alpha weights *p and beta
+        weights *q, both multiplied by QuantumScale first.  The alpha channel
+        of *composite is QuantumRange*RoundToUnity(Sa+Da); the red, green and
+        blue channels are the weighted sum Sa*p+Da*q multiplied by
+        PerceptibleReciprocal of that rounded sum.
+      */
+      Sa=QuantumScale*alpha;
+      Da=QuantumScale*beta;
+      gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
+      setAlphaF4(composite,(float) QuantumRange*gamma);
+      gamma=PerceptibleReciprocal(gamma);  /* gamma is reused as the normalisation factor for the colour channels */
+      setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
+      setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
+      setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
+    }
+  )
+
+  STRINGIFY(
+    inline void MagickPixelCompositeBlend(const float4 *p,
+      const float alpha,const float4 *q,
+      const float beta,float4 *composite)
+    { /* Blend: forwards to MagickPixelCompositePlus with each weight (alpha, beta) multiplied by that pixel's own alpha channel */
+      MagickPixelCompositePlus(p,(float) (alpha*
+      (getAlphaF4(*p))),q,(float) (beta*
+      (getAlphaF4(*q))),composite);
+    }
+  )
+  
+  STRINGIFY(
+    __kernel 
+    void Composite(__global CLPixelType *image, /* destination image: read at this work-item's pixel and overwritten with the blended result */
+                   const unsigned int imageWidth, 
+                   const unsigned int imageHeight,
+                   const __global CLPixelType *compositeImage, /* source image blended onto image */
+                   const unsigned int compositeWidth, /* not referenced in the kernel body */
+                   const unsigned int compositeHeight, /* not referenced in the kernel body */
+                   const unsigned int compose, /* cast to CompositeOperator for the switch below */
+                   const ChannelType channel, /* not referenced in the kernel body */
+                   const unsigned int matte, /* non-zero: use the pixels' own alpha values */
+                   const float destination_dissolve,
+                   const float source_dissolve) {
+
+      uint2 index;
+      index.x = get_global_id(0);
+      index.y = get_global_id(1);
 
-        // only do the work if this is not a patched item
-        if (get_global_id(0) < columns) 
-        {
-          // compute
-          float4 result = (float4) 0;
 
-          int i = 0;
-          
-          \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
+      if (index.x >= imageWidth
+        || index.y >= imageHeight) {
+          return; /* work-items outside the image do nothing */
+      }
+      const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
+      float4 destination;
+      setRedF4(&destination,getRed(inputPixel));
+      setGreenF4(&destination,getGreen(inputPixel));
+      setBlueF4(&destination,getBlue(inputPixel));
 
-          for ( ; i+UFACTOR < width; ) 
-          {
-            \n #pragma unroll UFACTOR\n
-            for (int j=0; j < UFACTOR; j++, i++)
-            {
-              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
-            }
-          }
+      
+      const CLPixelType compositePixel 
+        = compositeImage[index.y*imageWidth+index.x]; /* NOTE(review): the source row stride is imageWidth, not compositeWidth -- presumably the host only runs this kernel when both images have the same size; confirm */
+      float4 source;
+      setRedF4(&source,getRed(compositePixel));
+      setGreenF4(&source,getGreen(compositePixel));
+      setBlueF4(&source,getBlue(compositePixel));
 
-          for ( ; i < width; i++)
-          {
-            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
-          }
+      if (matte != 0) {
+        setAlphaF4(&destination,getAlpha(inputPixel));
+        setAlphaF4(&source,getAlpha(compositePixel));
+      }
+      else {
+        setAlphaF4(&destination,1.0f); /* NOTE(review): the blend helpers multiply alpha by QuantumScale, so 1.0f is on a different scale than getAlpha() values -- confirm the intended opaque value */
+        setAlphaF4(&source,1.0f);
+      }
 
-          result.x = ClampToQuantum(result.x);
-          result.y = ClampToQuantum(result.y);
-          result.z = ClampToQuantum(result.z);
-          result.w = ClampToQuantum(result.w);
+      float4 composite=destination; /* starting value; operators not handled by the switch below keep it */
 
-          // write back to global
-          filtered_im[y*columns+x] = result;
-        }
-      }
-    )
+      CompositeOperator op = (CompositeOperator)compose;
+      switch (op) {
+      case ColorDodgeCompositeOp:
+        CompositeColorDodge(&source,&destination,&composite);
+        break;
+      case BlendCompositeOp:
+        MagickPixelCompositeBlend(&source,source_dissolve,&destination,
+            destination_dissolve,&composite);
+        break;
+      default:
+        // unsupported operators
+        break;
+      };
 
-    STRINGIFY(
-      /*
-      Reduce image noise and reduce detail levels by row
-      im: input pixels filtered_in  filtered_im: output pixels
-      filter : convolve kernel  width: convolve kernel size
-      channel : define which channel is blured
-      is_RGBA_BGRA : define the input is RGBA or BGRA
-      */
-      __kernel void BlurRowSection(__global CLPixelType *im, __global float4 *filtered_im,
-                         const ChannelType channel, __constant float *filter,
-                         const unsigned int width, 
-                         const unsigned int imageColumns, const unsigned int imageRows,
-                         __local CLPixelType *temp, 
-                         const unsigned int offsetRows, const unsigned int section)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);  
+      CLPixelType outputPixel;
+      setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
+      setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
+      setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
+      setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite)));
+      image[index.y*imageWidth+index.x] = outputPixel; /* the result replaces the destination pixel in place */
+    }
+  )
 
-        const int columns = imageColumns;  
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n t r a s t                                                          %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-        const unsigned int radius = (width-1)/2;
-        const int wsize = get_local_size(0);  
-        const unsigned int loadSize = wsize+width;
+  STRINGIFY(
 
-        //group coordinate
-        const int groupX=get_local_size(0)*get_group_id(0);
-        const int groupY=get_local_size(1)*get_group_id(1);
+  inline float3 ConvertRGBToHSB(CLPixelType pixel) { /* returns (hue, saturation, brightness) in .x/.y/.z; all three stay 0 when the largest channel is 0 */
+    float3 HueSaturationBrightness;
+    HueSaturationBrightness.x = 0.0f; // Hue
+    HueSaturationBrightness.y = 0.0f; // Saturation
+    HueSaturationBrightness.z = 0.0f; // Brightness
 
-        //offset the input data, assuming section is 0, 1 
-        im += imageColumns * (offsetRows - radius * section);
+    float r=(float) getRed(pixel);
+    float g=(float) getGreen(pixel);
+    float b=(float) getBlue(pixel);
 
-        //parallel load and clamp
-        for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
-        {
-          //int cx = ClampToCanvas(groupX+i, columns);
-          temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
+    float tmin=min(min(r,g),b);
+    float tmax=max(max(r,g),b);
 
-          /*if (0 && y==0 && get_group_id(1) == 0)
-          {
-            printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
-          }*/
-        }
+    if (tmax!=0.0f) {
+      float delta=tmax-tmin;
+      HueSaturationBrightness.y=delta/tmax; /* saturation: channel spread relative to the largest channel */
+      HueSaturationBrightness.z=QuantumScale*tmax; /* brightness: largest channel multiplied by QuantumScale */
 
-        // barrier        
-        barrier(CLK_LOCAL_MEM_FENCE);
+      if (delta != 0.0f) { /* hue is only computed when the channels differ */
+  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); /* base sector chosen by which channel is the maximum */
+  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
+        HueSaturationBrightness.x/=6.0f;
+        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; /* NOTE(review): adds 1 when the hue is already non-negative and nothing when it is negative, which looks inverted for wrapping a negative hue -- confirm; ConvertHSBToRGB only uses hue-floor(hue) */
+      }
+    }
+    return HueSaturationBrightness;
+  }
 
-        // only do the work if this is not a patched item
-        if (get_global_id(0) < columns) 
-        {
-          // compute
-          float4 result = (float4) 0;
+  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { /* input is (hue, saturation, brightness) in .x/.y/.z; only red, green and blue of the returned pixel are assigned here */
 
-          int i = 0;
-          
-          \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
+    float hue = HueSaturationBrightness.x;
+    float brightness = HueSaturationBrightness.z;
+    float saturation = HueSaturationBrightness.y;
+   
+    CLPixelType rgb;
 
-          for ( ; i+UFACTOR < width; ) 
-          {
-            \n #pragma unroll UFACTOR\n
-            for (int j=0; j < UFACTOR; j++, i++)
-            {
-              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
-            }
-          }
+    if (saturation == 0.0f) { /* no saturation: all three channels get the same clamped QuantumRange*brightness */
+      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
+      setGreen(&rgb,getRed(rgb));
+      setBlue(&rgb,getRed(rgb));
+    }
+    else {
 
-          for ( ; i < width; i++)
-          {
-            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
-          }
+      float h=6.0f*(hue-floor(hue)); /* fractional part of the hue, scaled to six sectors */
+      float f=h-floor(h); /* position inside the current sector */
+      float p=brightness*(1.0f-saturation);
+      float q=brightness*(1.0f-saturation*f);
+      float t=brightness*(1.0f-(saturation*(1.0f-f)));
+      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
+      float clamped_t = ClampToQuantum(QuantumRange*t);
+      float clamped_p = ClampToQuantum(QuantumRange*p);
+      float clamped_q = ClampToQuantum(QuantumRange*q);
+      int ih = (int)h; /* sector index; selects which of brightness/p/q/t goes to each channel below */
+      setRed(&rgb, (ih == 1)?clamped_q:
+        (ih == 2 || ih == 3)?clamped_p:
+        (ih == 4)?clamped_t:
+                 clampedBrightness);
+      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
+        (ih == 3)?clamped_q:
+        (ih == 4 || ih == 5)?clamped_p:
+                 clamped_t);
 
-          result.x = ClampToQuantum(result.x);
-          result.y = ClampToQuantum(result.y);
-          result.z = ClampToQuantum(result.z);
-          result.w = ClampToQuantum(result.w);
+      setBlue(&rgb, (ih == 2)?clamped_t:
+        (ih == 3 || ih == 4)?clampedBrightness:
+        (ih == 5)?clamped_q:
+                 clamped_p);
+    }
+    return rgb;
+  }
 
-          // write back to global
-          filtered_im[y*columns+x] = result;
-        }
+  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) /* adjusts the HSB brightness of one pixel per work-item, in place; sharpen != 0 applies the adjustment with a positive sign, 0 with a negative sign */
+  {
 
-      }
-    )
+    const int sign = sharpen!=0?1:-1;
+    const int x = get_global_id(0);  
+    const int y = get_global_id(1);
+    const int columns = get_global_size(0); /* assumes the global work size in dimension 0 equals the image width -- TODO confirm against the host code */
+    const int c = x + y * columns;
 
-    STRINGIFY(
-      /*
-      Reduce image noise and reduce detail levels by line
-      im: input pixels filtered_in  filtered_im: output pixels
-      filter : convolve kernel  width: convolve kernel size
-      channel : define which channel is blured\
-      is_RGBA_BGRA : define the input is RGBA or BGRA
-      */
-      __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
-                                const ChannelType channel, __constant float *filter,
-                                const unsigned int width, 
-                                const unsigned int imageColumns, const unsigned int imageRows,
-                                __local float4 *temp)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);
+    CLPixelType pixel = im[c];
+    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
+    float brightness = HueSaturationBrightness.z;
+    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); /* moves brightness half the distance toward (sign > 0) or away from (sign < 0) the curve 0.5*(sinpi(b-0.5)+1) */
+    brightness = clamp(brightness,0.0f,1.0f);
+    HueSaturationBrightness.z = brightness;
 
-        //const int columns = get_global_size(0);
-        //const int rows = get_global_size(1);  
-        const int columns = imageColumns;  
-        const int rows = imageRows;  
+    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
+    filteredPixel.w = pixel.w; /* the fourth component is carried over unchanged from the input pixel */
+    im[c] = filteredPixel;
+  }
+  )
 
-        unsigned int radius = (width-1)/2;
-        const int wsize = get_local_size(1);  
-        const unsigned int loadSize = wsize+width;
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n t r a s t S t r e t c h                                            %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-        //group coordinate
-        const int groupX=get_local_size(0)*get_group_id(0);
-        const int groupY=get_local_size(1)*get_group_id(1);
-        //notice that get_local_size(0) is 1, so
-        //groupX=get_group_id(0);
-        
-        //parallel load and clamp
-        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+    STRINGIFY(
+    /* Histogram: one work-item per pixel of im. When channel includes SyncChannels the pixel intensity
+       (GetPixelIntensity, clamped, then ScaleQuantumToMap) selects a bin whose third uint is atomically incremented. */
+    __kernel void Histogram(__global CLPixelType * restrict im,
+      const ChannelType channel, 
+      const int method,
+      const int colorspace,
+      __global uint4 * restrict histogram)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;  // linear index of this work-item's pixel
+        if ((channel & SyncChannels) != 0)
         {
-          temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
+          float intensity = GetPixelIntensity(method, colorspace,im[c]);
+          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));  // bin index into histogram
+          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position: third uint (+2) of the uint4 bin
         }
-        
-        // barrier        
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // only do the work if this is not a patched item
-        if (get_global_id(1) < rows)
+        else
         {
-          // compute
-          float4 result = (float4) 0;
-
-          int i = 0;
-          
-          \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
-          
-          for ( ; i+UFACTOR < width; ) 
-          {
-            \n #pragma unroll UFACTOR \n
-            for (int j=0; j < UFACTOR; j++, i++)
-            {
-              result+=filter[i]*temp[i+get_local_id(1)];
-            }
-          }
-
-          for ( ; i < width; i++)
-          {
-            result+=filter[i]*temp[i+get_local_id(1)];
-          }
-
-          result.x = ClampToQuantum(result.x);
-          result.y = ClampToQuantum(result.y);
-          result.z = ClampToQuantum(result.z);
-          result.w = ClampToQuantum(result.w);
-
-          // write back to global
-          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
+          // for equalizing, we always need all channels?
+          // otherwise something more (this branch is currently empty: nothing is counted)
         }
-
       }
     )
 
-
     STRINGIFY(
-      /*
-      Reduce image noise and reduce detail levels by line
-      im: input pixels filtered_in  filtered_im: output pixels
-      filter : convolve kernel  width: convolve kernel size
-      channel : define which channel is blured\
-      is_RGBA_BGRA : define the input is RGBA or BGRA
-      */
-      __kernel void BlurColumnSection(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
-                                const ChannelType channel, __constant float *filter,
-                                const unsigned int width, 
-                                const unsigned int imageColumns, const unsigned int imageRows,
-                                __local float4 *temp, 
-                                const unsigned int offsetRows, const unsigned int section)
+    /*
+    */
+    __kernel void ContrastStretch(__global CLPixelType * restrict im,
+      const ChannelType channel,  
+      __global CLPixelType * restrict stretch_map,
+      const float4 white, const float4 black)
       {
         const int x = get_global_id(0);  
-        const int y = get_global_id(1);
-
-        //const int columns = get_global_size(0);
-        //const int rows = get_global_size(1);  
-        const int columns = imageColumns;  
-        const int rows = imageRows;  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
 
-        unsigned int radius = (width-1)/2;
-        const int wsize = get_local_size(1);  
-        const unsigned int loadSize = wsize+width;
+        uint ePos;
+        CLPixelType oValue, eValue;
+        CLQuantum red, green, blue, alpha;
 
-        //group coordinate
-        const int groupX=get_local_size(0)*get_group_id(0);
-        const int groupY=get_local_size(1)*get_group_id(1);
-        //notice that get_local_size(0) is 1, so
-        //groupX=get_group_id(0);
-       
-        // offset the input data
-        blurRowData += imageColumns * radius * section;
+        //read from global
+        oValue=im[c];
 
-        //parallel load and clamp
-        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+        if ((channel & RedChannel) != 0)
         {
-          int pos = ClampToCanvasWithHalo(i+groupY-radius, rows, radius, section) * columns + groupX;
-          temp[i] = *(blurRowData+pos);
+          if (getRedF4(white) != getRedF4(black))
+          {
+            ePos = ScaleQuantumToMap(getRed(oValue)); 
+            eValue = stretch_map[ePos];
+            red = getRed(eValue);
+          }
         }
-        
-        // barrier        
-        barrier(CLK_LOCAL_MEM_FENCE);
 
-        // only do the work if this is not a patched item
-        if (get_global_id(1) < rows)
+        if ((channel & GreenChannel) != 0)
         {
-          // compute
-          float4 result = (float4) 0;
-
-          int i = 0;
-          
-          \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
-          
-          for ( ; i+UFACTOR < width; ) 
+          if (getGreenF4(white) != getGreenF4(black))
           {
-            \n #pragma unroll UFACTOR \n
-            for (int j=0; j < UFACTOR; j++, i++)
-            {
-              result+=filter[i]*temp[i+get_local_id(1)];
-            }
+            ePos = ScaleQuantumToMap(getGreen(oValue)); 
+            eValue = stretch_map[ePos];
+            green = getGreen(eValue);
           }
-          for ( ; i < width; i++)
+        }
+
+        if ((channel & BlueChannel) != 0)
+        {
+          if (getBlueF4(white) != getBlueF4(black))
           {
-            result+=filter[i]*temp[i+get_local_id(1)];
+            ePos = ScaleQuantumToMap(getBlue(oValue)); 
+            eValue = stretch_map[ePos];
+            blue = getBlue(eValue);
           }
-
-          result.x = ClampToQuantum(result.x);
-          result.y = ClampToQuantum(result.y);
-          result.z = ClampToQuantum(result.z);
-          result.w = ClampToQuantum(result.w);
-
-          // offset the output data
-          filtered_im += imageColumns * offsetRows;
-
-          // write back to global
-          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
         }
 
-      }
-    )
-
-
-    STRINGIFY(
-    __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage, 
-          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
-          const unsigned int imageColumns, const unsigned int imageRows, 
-          __local float4* cachedData, __local float* cachedFilter,
-          const ChannelType channel, const __global float *filter, const unsigned int width, 
-          const float gain, const float threshold)
-    {
-      const unsigned int radius = (width-1)/2;
-
-      // cache the pixel shared by the workgroup
-      const int groupX = get_group_id(0);
-      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
-      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
-
-      if (groupStartY >= 0
-          && groupStopY < imageRows) {
-        event_t e = async_work_group_strided_copy(cachedData
-                                                ,blurRowData+groupStartY*imageColumns+groupX
-                                                ,groupStopY-groupStartY,imageColumns,0);
-        wait_group_events(1,&e);
-      }
-      else {
-        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
-          cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
+        if ((channel & AlphaChannel) != 0)
+        {
+          if (getAlphaF4(white) != getAlphaF4(black))
+          {
+            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
+            eValue = stretch_map[ePos];
+            alpha = getAlpha(eValue);
+          }
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-      // cache the filter as well
-      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
-      wait_group_events(1,&e);
 
-      // only do the work if this is not a patched item
-      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
-      const int cy = get_global_id(1);
-
-      if (cy < imageRows) {
-        float4 blurredPixel = (float4) 0.0f;
-
-        int i = 0;
+        //write back
+        im[c]=(CLPixelType)(blue, green, red, alpha);
 
-        \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
+      }
+    )
 
-          for ( ; i+UFACTOR < width; ) 
-          {
-            \n #pragma unroll UFACTOR \n
-              for (int j=0; j < UFACTOR; j++, i++)
-              {
-                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
-              }
-          }
 
-        for ( ; i < width; i++)
-        {
-          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
-        }
+  STRINGIFY(
+    __kernel  // ConvolveOptimized: 2D convolution that first copies the work-group's pixels and the filter into local caches
+    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
+    const unsigned int imageWidth, const unsigned int imageHeight,
+    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
+    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
 
-        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
-                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
+      int2 blockID;
+      blockID.x = get_group_id(0);
+      blockID.y = get_group_id(1);
 
-        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
-        float4 outputPixel = inputImagePixel - blurredPixel;
+      // image area processed by this workgroup
+      int2 imageAreaOrg;
+      imageAreaOrg.x = blockID.x * get_local_size(0);
+      imageAreaOrg.y = blockID.y * get_local_size(1);
 
-        float quantumThreshold = QuantumRange*threshold;
+      int2 midFilterDimen;  // filter half-size in each dimension
+      midFilterDimen.x = (filterWidth-1)/2;
+      midFilterDimen.y = (filterHeight-1)/2;
 
-        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
-        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
+      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;  // work-group origin shifted back by the filter half-size
 
-        //write back
-        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
-                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
+      // dimension of the local cache
+      int2 cachedAreaDimen;
+      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
+      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
 
-      }
-    }
+      // cache the pixels accessed by this workgroup in local memory
+      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
+      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
+      int groupSize = get_local_size(0) * get_local_size(1);
+      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {  // work-items share the copy, stepping by the group size
 
-    __kernel void UnsharpMaskBlurColumnSection(const __global CLPixelType* inputImage, 
-          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
-          const unsigned int imageColumns, const unsigned int imageRows, 
-          __local float4* cachedData, __local float* cachedFilter,
-          const ChannelType channel, const __global float *filter, const unsigned int width, 
-          const float gain, const float threshold, 
-          const unsigned int offsetRows, const unsigned int section)
-    {
-      const unsigned int radius = (width-1)/2;
+        int2 cachedAreaIndex;  // 2D position of cache slot i
+        cachedAreaIndex.x = i % cachedAreaDimen.x;
+        cachedAreaIndex.y = i / cachedAreaDimen.x;
 
-      // cache the pixel shared by the workgroup
-      const int groupX = get_group_id(0);
-      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
-      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
+        int2 imagePixelIndex;
+        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
 
-      // offset the input data
-      blurRowData += imageColumns * radius * section;
+        // only support EdgeVirtualPixelMethod through ClampToCanvas
+        // TODO: implement other virtual pixel method
+        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
+        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
 
-      if (groupStartY >= 0
-          && groupStopY < imageRows) {
-        event_t e = async_work_group_strided_copy(cachedData
-                                                ,blurRowData+groupStartY*imageColumns+groupX
-                                                ,groupStopY-groupStartY,imageColumns,0);
-        wait_group_events(1,&e);
+        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
       }
-      else {
-        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
-          int pos = ClampToCanvasWithHalo(groupStartY+i,imageRows, radius, section)*imageColumns+ groupX;
-          cachedData[i] = *(blurRowData + pos);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
+
+      // cache the filter
+      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
+        filterCache[i] = filter[i];
       }
-      // cache the filter as well
-      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
-      wait_group_events(1,&e);
+      barrier(CLK_LOCAL_MEM_FENCE);  // wait until the work-group has finished filling both caches
 
-      // only do the work if this is not a patched item
-      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
-      const int cy = get_global_id(1);
 
-      if (cy < imageRows) {
-        float4 blurredPixel = (float4) 0.0f;
+      int2 imageIndex;  // image coordinates of the pixel this work-item produces
+      imageIndex.x = imageAreaOrg.x + get_local_id(0);
+      imageIndex.y = imageAreaOrg.y + get_local_id(1);
 
-        int i = 0;
+      // if out-of-range, stops here and quit
+      if (imageIndex.x >= imageWidth
+        || imageIndex.y >= imageHeight) {
+          return;
+      }
 
-        \n #ifndef UFACTOR   \n 
-          \n #define UFACTOR 8 \n 
-          \n #endif                  \n 
+      int filterIndex = 0;
+      float4 sum = (float4)0.0f;
+      float gamma = 0.0f;
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {  // alpha not selected or matte is 0: plain weighted sum
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float f = filterCache[filterIndex];
 
-          for ( ; i+UFACTOR < width; ) 
-          {
-            \n #pragma unroll UFACTOR \n
-              for (int j=0; j < UFACTOR; j++, i++)
-              {
-                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
-              }
+            sum.x += f * p.x;
+            sum.y += f * p.y;
+            sum.z += f * p.z; 
+            sum.w += f * p.w;
+
+            gamma += f;  // accumulated here but not applied to sum in this branch
+            filterIndex++;
+            cacheIndexX++;
           }
+          cacheIndexY++;
+        }
+      }
+      else {  // alpha-weighted: x/y/z terms use f*alpha, the w term uses f alone
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
 
-        for ( ; i < width; i++)
-        {
-          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float alpha = QuantumScale*p.w;
+            float f = filterCache[filterIndex];
+            float g = alpha * f;
+
+            sum.x += g*p.x;
+            sum.y += g*p.y;
+            sum.z += g*p.z;
+            sum.w += f*p.w;
+
+            gamma += g;
+            filterIndex++;
+            cacheIndexX++;
+          }
+          cacheIndexY++;
         }
+        gamma = PerceptibleReciprocal(gamma);  // scale x/y/z by the reciprocal of the summed alpha weights
+        sum.xyz = gamma*sum.xyz;
+      }
+      CLPixelType outputPixel;
+      outputPixel.x = ClampToQuantum(sum.x);
+      outputPixel.y = ClampToQuantum(sum.y);
+      outputPixel.z = ClampToQuantum(sum.z);
+      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
 
-        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
-                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
+      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
+    }
+  )
 
-        // offset the output data
-        inputImage += imageColumns * offsetRows; 
-        filtered_im += imageColumns * offsetRows;
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%    C o n v o l v e                                                          %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
-        float4 outputPixel = inputImagePixel - blurredPixel;
+  STRINGIFY(
+    __kernel  // Convolve: 2D convolution reading every tap from input directly, with coordinates passed through ClampToCanvas
+    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
+                  const uint imageWidth, const uint imageHeight,
+                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
+                  const uint matte, const ChannelType channel) {
 
-        float quantumThreshold = QuantumRange*threshold;
+      int2 imageIndex;  // image coordinates of the pixel this work-item produces
+      imageIndex.x = get_global_id(0);
+      imageIndex.y = get_global_id(1);
 
-        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
-        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
+      /*
+      unsigned int imageWidth = get_global_size(0);
+      unsigned int imageHeight = get_global_size(1);
+      */
+      if (imageIndex.x >= imageWidth
+          || imageIndex.y >= imageHeight)
+          return;  // work-items outside the image do nothing
 
-        //write back
-        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
-                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
+      int2 midFilterDimen;  // filter half-size in each dimension
+      midFilterDimen.x = (filterWidth-1)/2;
+      midFilterDimen.y = (filterHeight-1)/2;
 
-      }
-     
-    }
-    )
+      int filterIndex = 0;
+      float4 sum = (float4)0.0f;
+      float gamma = 0.0f;
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {  // alpha not selected or matte is 0: plain weighted sum
+        for (int j = 0; j < filterHeight; j++) {
+          int2 inputPixelIndex;
+          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
+          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
+          for (int i = 0; i < filterWidth; i++) {
+            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
+            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
+        
+            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
+            float f = filter[filterIndex];
 
+            sum.x += f * p.x;
+            sum.y += f * p.y;
+            sum.z += f * p.z; 
+            sum.w += f * p.w;
 
-    STRINGIFY(
-      __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
-                         __constant float *filter,
-                         const unsigned int width, 
-                         const unsigned int imageColumns, const unsigned int imageRows,
-                         __local float4 *pixels, 
-                         const float gain, const float threshold, const unsigned int justBlur)
-      {
-        const int x = get_global_id(0);
-        const int y = get_global_id(1);
+            gamma += f;  // accumulated here but not applied to sum in this branch
 
-        const unsigned int radius = (width - 1) / 2;
-                               
-               int row = y - radius;
-               int baseRow = get_group_id(1) * get_local_size(1) - radius;
-               int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
-                               
-               while (row < endRow) {
-                       int srcy =  (row < 0) ? -row : row;                     // mirror pad
-                       srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
-                                       
-                       float4 value = 0.0f;
-                                       
-                       int ix = x - radius;
-                       int i = 0;
+            filterIndex++;
+          }
+        }
+      }
+      else {  // alpha-weighted: x/y/z terms use f*alpha, the w term uses f alone
 
-                       while (i + 7 < width) {
-                               for (int j = 0; j < 8; ++j) {           // unrolled
-                                       int srcx = ix + j;
-                                       srcx = (srcx < 0) ? -srcx : srcx;
-                                       srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
-                                       value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
-                               }
-                               ix += 8;
-                               i += 8;
-                       }
+        for (int j = 0; j < filterHeight; j++) {
+          int2 inputPixelIndex;
+          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
+          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
+          for (int i = 0; i < filterWidth; i++) {
+            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
+            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
+        
+            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
+            float alpha = QuantumScale*p.w;
+            float f = filter[filterIndex];
+            float g = alpha * f;
 
-                       while (i < width) {
-                               int srcx = (ix < 0) ? -ix : ix;                 // mirror pad
-                               srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
-                               value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
-                               ++i;
-                               ++ix;
-                       }       
-                       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
-                       row += get_local_size(1);
-               }
-                               
-                       
-               barrier(CLK_LOCAL_MEM_FENCE);
+            sum.x += g*p.x;
+            sum.y += g*p.y;
+            sum.z += g*p.z;
+            sum.w += f*p.w;
 
-                                               
-               const int px = get_local_id(0);
-               const int py = get_local_id(1);
-               const int prp = get_local_size(0);
-               float4 value = (float4)(0.0f);
-                       
-               int i = 0;
-               while (i + 7 < width) {                 // unrolled
-                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
-                       value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
-                       i += 8;
-               }
-               while (i < width) {
-                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
-                       ++i;
-               }
+            gamma += g;
 
-               if (justBlur == 0) {            // apply sharpening
-                       float4 srcPixel = convert_float4(im[x + y * imageColumns]);
-                       float4 diff = srcPixel - value;
 
-                       float quantumThreshold = QuantumRange*threshold;
+            filterIndex++;
+          }
+        }
+        gamma = PerceptibleReciprocal(gamma);  // scale x/y/z by the reciprocal of the summed alpha weights
+        sum.xyz = gamma*sum.xyz;
+      }
 
-                       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
-                       value = select(srcPixel + diff * gain, srcPixel, mask);
-               }
-       
-               if ((x < imageColumns) && (y < imageRows))
-                       filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
-               }       
-       )
+      CLPixelType outputPixel;
+      outputPixel.x = ClampToQuantum(sum.x);
+      outputPixel.y = ClampToQuantum(sum.y);
+      outputPixel.z = ClampToQuantum(sum.z);
+      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;  // w is convolved only when alpha is selected
 
+      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
+    }
+  )
 
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     D e s p e c k l e                                                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
   STRINGIFY(
 
@@ -1598,8 +1831,6 @@ const char* accelerateKernels =
 
   )
 
-
-
   STRINGIFY(
 
   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
@@ -1675,189 +1906,463 @@ const char* accelerateKernels =
     v.y = (CLQuantum)sv[1];
     v.z = (CLQuantum)sv[2];
 
-    if (matte!=0)
-      v.w = (CLQuantum)sv[3];
+    if (matte!=0)
+      v.w = (CLQuantum)sv[3];
+
+    outputImage[y*imageWidth+x] = v;
+
+    }
+  )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     E q u a l i z e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+    STRINGIFY(
+    /* Equalize: remap each pixel in place through equalize_map. Only the SyncChannels path is
+       implemented; each channel is looked up separately and takes the red entry of the map. */
+    __kernel void Equalize(__global CLPixelType * restrict im,
+      const ChannelType channel,  
+      __global CLPixelType * restrict equalize_map,
+      const float4 white, const float4 black)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+
+        uint ePos;
+        CLPixelType oValue, eValue;
+        CLQuantum red, green, blue, alpha;
+
+        // read the source pixel from global memory
+        oValue=im[c];
+
+        if ((channel & SyncChannels) != 0)
+        {
+          if (getRedF4(white) != getRedF4(black))
+          {
+            ePos = ScaleQuantumToMap(getRed(oValue)); 
+            eValue = equalize_map[ePos];
+            red = getRed(eValue);
+            ePos = ScaleQuantumToMap(getGreen(oValue)); 
+            eValue = equalize_map[ePos];
+            green = getRed(eValue);
+            ePos = ScaleQuantumToMap(getBlue(oValue)); 
+            eValue = equalize_map[ePos];
+            blue = getRed(eValue);
+            ePos = ScaleQuantumToMap(getAlpha(oValue)); 
+            eValue = equalize_map[ePos];
+            alpha = getRed(eValue);
+            // write back; components are passed in blue, green, red, alpha order
+            im[c]=(CLPixelType)(blue, green, red, alpha);
+          }
+
+        }
+
+        // NOTE(review): without SyncChannels, or when white and black have equal red, the pixel is
+        // left unchanged; per-channel equalization is not handled by this kernel.
+
+     }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     F u n c t i o n                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+
+    /* Evaluate function on every component of pixel (each scaled by QuantumScale first).
+       Polynomial uses all parameters as Horner coefficients; the other functions fall back to the
+       defaults shown per case. The result is scaled by QuantumRange and clamped per component. */
+    CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
+        const unsigned int number_parameters,
+        __constant float *parameters)
+      {
+        float4 result = (float4) 0.0f;
+        switch (function)
+        {
+        case PolynomialFunction:
+          {
+            for (unsigned int i=0; i < number_parameters; i++)
+              result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
+            result *= (float4)QuantumRange;
+            break;
+          }
+        case SinusoidFunction:
+          {
+            float  freq,phase,ampl,bias;
+            freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
+            ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
+            bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
+              (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
+            result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
+              (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
+            result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
+              (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
+            result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
+              (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
+            break;
+          }
+        case ArcsinFunction:
+          {
+            float  width,range,center,bias;
+            width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            // asin() arguments outside (-1,1) are pegged to bias -/+ range/2 instead of evaluated
+            result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
+            result.x = ( result.x <= -1.0f ) ? bias - range/2.0f :
+                       ( result.x >= 1.0f ) ? bias + range/2.0f :
+                       range/MagickPI*asin(result.x)+bias;
+
+            result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
+            result.y = ( result.y <= -1.0f ) ? bias - range/2.0f :
+                       ( result.y >= 1.0f ) ? bias + range/2.0f :
+                       range/MagickPI*asin(result.y)+bias;
+
+            result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
+            result.z = ( result.z <= -1.0f ) ? bias - range/2.0f :
+                       ( result.z >= 1.0f ) ? bias + range/2.0f :
+                       range/MagickPI*asin(result.z)+bias;
+
+
+            result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
+            result.w = ( result.w <= -1.0f ) ? bias - range/2.0f :
+                       ( result.w >= 1.0f ) ? bias + range/2.0f :
+                       range/MagickPI*asin(result.w)+bias;
+
+            result *= (float4)QuantumRange;
+            break;
+          }
+        case ArctanFunction:
+          {
+            float slope,range,center,bias;
+            slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
+            result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
+            break;
+          }
+        case UndefinedFunction:
+          break;
+        }
+        return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
+          ClampToQuantum(result.z), ClampToQuantum(result.w));
+      }
+    )
+
+    STRINGIFY(
+    /*
+    Apply ApplyFunction() in place to every pixel of the image.
+    channel : channel mask; accepted but not used by this kernel
+    function : the MagickFunction to evaluate
+    number_parameters : number of entries in parameters
+    parameters : the function parameters, forwarded to ApplyFunction()
+    */
+    __kernel void FunctionImage(__global CLPixelType *im,
+                                        const ChannelType channel, const MagickFunction function,
+                                        const unsigned int number_parameters, __constant float *parameters)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+        im[c] = ApplyFunction(im[c], function, number_parameters, parameters); 
+      }
+    )
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     G r a y s c a l e                                                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+  STRINGIFY(
+  __kernel void Grayscale(__global CLPixelType *im, 
+    const int method, const int colorspace)
+  {
+    // In-place grayscale: intensity formula is chosen by method; colorspace is only referenced in commented-out code.
+    const int x = get_global_id(0);  
+    const int y = get_global_id(1);
+    const int columns = get_global_size(0);
+    const int c = x + y * columns;
 
-    outputImage[y*imageWidth+x] = v;
+    CLPixelType pixel = im[c];
 
-    }
+    float
+        blue,
+        green,
+        intensity,
+        red;
 
+    red=(float)getRed(pixel);
+    green=(float)getGreen(pixel);
+    blue=(float)getBlue(pixel);
 
-  )
+    intensity=0.0;
 
+    CLPixelType filteredPixel;
 
-  STRINGIFY(
-    __kernel void RotationalBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
-                                 const float4 bias,
-                                 const unsigned int channel, const unsigned int matte,
-                                 const float2 blurCenter,
-                                 __constant float *cos_theta, __constant float *sin_theta, 
-                                 const unsigned int cossin_theta_size)
-      {
-        const int x = get_global_id(0);  
-        const int y = get_global_id(1);
-        const int columns = get_global_size(0);
-        const int rows = get_global_size(1);  
-        unsigned int step = 1;
-        float center_x = (float) x - blurCenter.x;
-        float center_y = (float) y - blurCenter.y;
-        float radius = hypot(center_x, center_y);
-        
-        //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
-        float blur_radius = hypot(blurCenter.x, blurCenter.y);
-
-        if (radius > MagickEpsilon)
+    switch (method)
+    {
+      case AveragePixelIntensityMethod:
         {
-          step = (unsigned int) (blur_radius / radius);
-          if (step == 0)
-            step = 1;
-          if (step >= cossin_theta_size)
-            step = cossin_theta_size-1;
+          intensity=(red+green+blue)/3.0;
+          break;
         }
-
-        float4 result;
-        result.x = (float)bias.x;
-        result.y = (float)bias.y;
-        result.z = (float)bias.z;
-        result.w = (float)bias.w;
-        float normalize = 0.0f;
-
-        if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+      case BrightnessPixelIntensityMethod:
+        {
+          intensity=max(max(red,green),blue);
+          break;
+        }
+      case LightnessPixelIntensityMethod:
+        {
+          intensity=(min(min(red,green),blue)+
+              max(max(red,green),blue))/2.0;
+          break;
+        }
+      case MSPixelIntensityMethod:
+        {
+          intensity=(float) (((float) red*red+green*green+
+                blue*blue)/(3.0*QuantumRange));
+          break;
+        }
+      case Rec601LumaPixelIntensityMethod:
+        {
+          /*
+          if (colorspace == RGBColorspace)
           {
-            result += convert_float4(im[
-              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
-                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
-              normalize += 1.0f;
+            red=EncodePixelGamma(red);
+            green=EncodePixelGamma(green);
+            blue=EncodePixelGamma(blue);
           }
-          normalize = PerceptibleReciprocal(normalize);
-          result = result * normalize;
+          */
+          intensity=0.298839*red+0.586811*green+0.114350*blue;
+          break;
         }
-        else {
-          float gamma = 0.0f;
-          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+      case Rec601LuminancePixelIntensityMethod:
+        {
+          /*
+          if (image->colorspace == sRGBColorspace)
           {
-            float4 p = convert_float4(im[
-              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
-                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
-            
-            float alpha = (float)(QuantumScale*p.w);
-            result.x += alpha * p.x;
-            result.y += alpha * p.y;
-            result.z += alpha * p.z;
-            result.w += p.w;
-            gamma+=alpha;
-            normalize += 1.0f;
+            red=DecodePixelGamma(red);
+            green=DecodePixelGamma(green);
+            blue=DecodePixelGamma(blue);
           }
-          gamma = PerceptibleReciprocal(gamma);
-          normalize = PerceptibleReciprocal(normalize);
-          result.x = gamma*result.x;
-          result.y = gamma*result.y;
-          result.z = gamma*result.z;
-          result.w = normalize*result.w;
+          */
+          intensity=0.298839*red+0.586811*green+0.114350*blue;
+          break;
         }
-        filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
-          ClampToQuantum(result.z), ClampToQuantum(result.w)); 
-      }
+      case Rec709LumaPixelIntensityMethod:
+      default: // any method value without its own case uses the Rec709Luma weights
+        {
+          /*
+          if (image->colorspace == RGBColorspace)
+          {
+            red=EncodePixelGamma(red);
+            green=EncodePixelGamma(green);
+            blue=EncodePixelGamma(blue);
+          }
+          */
+          intensity=0.212656*red+0.715158*green+0.072186*blue;
+          break;
+        }
+      case Rec709LuminancePixelIntensityMethod:
+        {
+          /*
+          if (image->colorspace == sRGBColorspace)
+          {
+            red=DecodePixelGamma(red);
+            green=DecodePixelGamma(green);
+            blue=DecodePixelGamma(blue);
+          }
+          */
+          intensity=0.212656*red+0.715158*green+0.072186*blue;
+          break;
+        }
+      case RMSPixelIntensityMethod:
+        {
+          intensity=(float) (sqrt((float) red*red+green*green+
+                blue*blue)/sqrt(3.0));
+          break;
+        }
+
+    }
+    // store the clamped intensity through setGray() and carry over the source alpha (w)
+    setGray(&filteredPixel, ClampToQuantum(intensity));
+
+    filteredPixel.w = pixel.w;
+
+    im[c] = filteredPixel;
+  }
   )
-  STRINGIFY(
 
-  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
-    float3 HueSaturationBrightness;
-    HueSaturationBrightness.x = 0.0f; // Hue
-    HueSaturationBrightness.y = 0.0f; // Saturation
-    HueSaturationBrightness.z = 0.0f; // Brightness
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     L o c a l C o n t r a s t                                               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-    float r=(float) getRed(pixel);
-    float g=(float) getGreen(pixel);
-    float b=(float) getBlue(pixel);
+    STRINGIFY(
+      inline int mirrorBottom(int value) // negative index -> its absolute value
+      {
+          return (value < 0) ? - (value) : value;
+      }
+      inline int mirrorTop(int value, int width) // index >= width -> 2*width - index - 1
+      {
+          return (value >= width) ? (2 * width - value - 1) : value;
+      }
 
-    float tmin=min(min(r,g),b);
-    float tmax=max(max(r,g),b);
+      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
+          const int radius, 
+          const int imageWidth,
+          const int imageHeight)
+      { // Row pass: ramp-weighted sum of dot(RGB, pixel) along row y, written to tmpImage; dstImage and imageHeight are unused here.
+        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
 
-    if (tmax!=0.0f) {
-      float delta=tmax-tmin;
-      HueSaturationBrightness.y=delta/tmax;
-      HueSaturationBrightness.z=QuantumScale*tmax;
+        int x = get_local_id(0);
+        int y = get_global_id(1);
 
-      if (delta != 0.0f) {
-  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
-  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
-        HueSaturationBrightness.x/=6.0f;
-        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
+        global CLPixelType *src = srcImage + y * imageWidth;
+        // each work-item handles columns x, x + local size, x + 2 * local size, ... of row y
+        for (int i = x; i < imageWidth; i += get_local_size(0)) {
+            float sum = 0.0f;
+            float weight = 1.0f;
+
+            int j = i - radius;
+            while ((j + 7) < i) {
+                for (int k = 0; k < 8; ++k) // Unroll 8x
+                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
+                weight += 8.0f;
+                j+=8;
+            }
+            while (j < i) {
+                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
+                weight += 1.0f;
+                ++j;
+            }
+            // second half: the weight falls by one per sample; indices past the row end go through mirrorTop
+            while ((j + 7) < radius + i) {
+                for (int k = 0; k < 8; ++k) // Unroll 8x
+                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
+                weight -= 8.0f;
+                j+=8;
+            }
+            while (j < radius + i) {
+                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
+                weight -= 1.0f;
+                ++j;
+            }
+
+            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
+        }
       }
-    }
-    return HueSaturationBrightness;
-  }
+    )
 
-  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
+    STRINGIFY(
+      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
+          const int radius, 
+          const float strength,
+          const int imageWidth,
+          const int imageHeight)
+      { // Column pass: ramp-weighted sum of blurImage down column x, then scale the source pixel's x/y/z by the resulting multiplier.
+        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
 
-    float hue = HueSaturationBrightness.x;
-    float brightness = HueSaturationBrightness.z;
-    float saturation = HueSaturationBrightness.y;
-   
-    CLPixelType rgb;
+        int x = get_global_id(0);
+        int y = get_global_id(1);
 
-    if (saturation == 0.0f) {
-      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
-      setGreen(&rgb,getRed(rgb));
-      setBlue(&rgb,getRed(rgb));
-    }
-    else {
+        if ((x >= imageWidth) || (y >= imageHeight))
+                return;
 
-      float h=6.0f*(hue-floor(hue));
-      float f=h-floor(h);
-      float p=brightness*(1.0f-saturation);
-      float q=brightness*(1.0f-saturation*f);
-      float t=brightness*(1.0f-(saturation*(1.0f-f)));
-      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
-      float clamped_t = ClampToQuantum(QuantumRange*t);
-      float clamped_p = ClampToQuantum(QuantumRange*p);
-      float clamped_q = ClampToQuantum(QuantumRange*q);
-      int ih = (int)h;
-      setRed(&rgb, (ih == 1)?clamped_q:
-        (ih == 2 || ih == 3)?clamped_p:
-        (ih == 4)?clamped_t:
-                 clampedBrightness);
-      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
-        (ih == 3)?clamped_q:
-        (ih == 4 || ih == 5)?clamped_p:
-                 clamped_t);
+        global float *src = blurImage + x;
 
-      setBlue(&rgb, (ih == 2)?clamped_t:
-        (ih == 3 || ih == 4)?clampedBrightness:
-        (ih == 5)?clamped_q:
-                 clamped_p);
-    }
-    return rgb;
-  }
+        float sum = 0.0f;
+        float weight = 1.0f;
 
-  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
-  {
+        int j = y - radius;
+        while ((j + 7) < y) {
+            for (int k = 0; k < 8; ++k) // Unroll 8x
+                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
+            weight += 8.0f;
+            j+=8;
+        }
+        while (j < y) {
+            sum += weight * src[mirrorBottom(j) * imageWidth];
+            weight += 1.0f;
+            ++j;
+        }
+        // second half: the weight falls by one per sample; rows past the last one go through mirrorTop
-    const int sign = sharpen!=0?1:-1;
-    const int x = get_global_id(0);  
-    const int y = get_global_id(1);
-    const int columns = get_global_size(0);
-    const int c = x + y * columns;
+        while ((j + 7) < radius + y) {
+            for (int k = 0; k < 8; ++k) // Unroll 8x
+                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
+            weight -= 8.0f;
+            j+=8;
+        }
+        while (j < radius + y) {
+            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
+            weight -= 1.0f;
+            ++j;
+        }
+        // NOTE(review): srcVal is used as a divisor below; a pixel whose weighted RGB sum is zero divides by zero - confirm handling.
-    CLPixelType pixel = im[c];
-    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
-    float brightness = HueSaturationBrightness.z;
-    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
-    brightness = clamp(brightness,0.0f,1.0f);
-    HueSaturationBrightness.z = brightness;
+        CLPixelType pixel = srcImage[x + y * imageWidth];
+        float srcVal = dot(RGB, convert_float4(pixel));
+        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
+        mult = (srcVal + mult) / srcVal;
 
-    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
-    filteredPixel.w = pixel.w;
-    im[c] = filteredPixel;
-  }
+        pixel.x = ClampToQuantum(pixel.x * mult);
+        pixel.y = ClampToQuantum(pixel.y * mult);
+        pixel.z = ClampToQuantum(pixel.z * mult);
+        // pixel.w is written back unchanged
+        dstImage[x + y * imageWidth] = pixel;
+      }
+    )
 
-  )
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     M o d u l a t e                                                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
   STRINGIFY(
 
@@ -2051,126 +2556,183 @@ const char* accelerateKernels =
   }
   )
 
-  STRINGIFY(
-  __kernel void Grayscale(__global CLPixelType *im, 
-    const int method, const int colorspace)
-  {
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     M o t i o n B l u r                                                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-    const int x = get_global_id(0);  
-    const int y = get_global_id(1);
-    const int columns = get_global_size(0);
-    const int c = x + y * columns;
+  STRINGIFY(
+    __kernel 
+    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
+                    const unsigned int imageWidth, const unsigned int imageHeight,
+                    const __global float *filter, const unsigned int width, const __global int2* offset,
+                    const float4 bias,
+                    const ChannelType channel, const unsigned int matte) { // one output pixel per work-item; samples input at currentPixel + offset[i], i in [0, width)
 
-    CLPixelType pixel = im[c];
+      int2 currentPixel;
+      currentPixel.x = get_global_id(0);
+      currentPixel.y = get_global_id(1);
 
-    float
-        blue,
-        green,
-        intensity,
-        red;
+      if (currentPixel.x >= imageWidth
+          || currentPixel.y >= imageHeight)
+          return;
 
-    red=(float)getRed(pixel);
-    green=(float)getGreen(pixel);
-    blue=(float)getBlue(pixel);
+      float4 pixel;
+      pixel.x = (float)bias.x;
+      pixel.y = (float)bias.y;
+      pixel.z = (float)bias.z;
+      pixel.w = (float)bias.w;
 
-    intensity=0.0;
+      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
+        // no alpha weighting: add filter[i] * sample to all four components on top of bias
+        for (int i = 0; i < width; i++) {
+          // only support EdgeVirtualPixelMethod through ClampToCanvas
+          // TODO: implement other virtual pixel method
+          int2 samplePixel = currentPixel + offset[i];
+          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
+          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
+          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
 
-    CLPixelType filteredPixel;
-    switch (method)
-    {
-      case AveragePixelIntensityMethod:
-        {
-          intensity=(red+green+blue)/3.0;
-          break;
-        }
-      case BrightnessPixelIntensityMethod:
-        {
-          intensity=max(max(red,green),blue);
-          break;
-        }
-      case LightnessPixelIntensityMethod:
-        {
-          intensity=(min(min(red,green),blue)+
-              max(max(red,green),blue))/2.0;
-          break;
-        }
-      case MSPixelIntensityMethod:
-        {
-          intensity=(float) (((float) red*red+green*green+
-                blue*blue)/(3.0*QuantumRange));
-          break;
-        }
-      case Rec601LumaPixelIntensityMethod:
-        {
-          /*
-          if (colorspace == RGBColorspace)
-          {
-            red=EncodePixelGamma(red);
-            green=EncodePixelGamma(green);
-            blue=EncodePixelGamma(blue);
-          }
-          */
-          intensity=0.298839*red+0.586811*green+0.114350*blue;
-          break;
-        }
-      case Rec601LuminancePixelIntensityMethod:
-        {
-          /*
-          if (image->colorspace == sRGBColorspace)
-          {
-            red=DecodePixelGamma(red);
-            green=DecodePixelGamma(green);
-            blue=DecodePixelGamma(blue);
-          }
-          */
-          intensity=0.298839*red+0.586811*green+0.114350*blue;
-          break;
-        }
-      case Rec709LumaPixelIntensityMethod:
-      default:
-        {
-          /*
-          if (image->colorspace == RGBColorspace)
-          {
-            red=EncodePixelGamma(red);
-            green=EncodePixelGamma(green);
-            blue=EncodePixelGamma(blue);
-          }
-          */
-          intensity=0.212656*red+0.715158*green+0.072186*blue;
-          break;
-        }
-      case Rec709LuminancePixelIntensityMethod:
-        {
-          /*
-          if (image->colorspace == sRGBColorspace)
-          {
-            red=DecodePixelGamma(red);
-            green=DecodePixelGamma(green);
-            blue=DecodePixelGamma(blue);
-          }
-          */
-          intensity=0.212656*red+0.715158*green+0.072186*blue;
-          break;
+          pixel.x += (filter[i] * (float)samplePixelValue.x);
+          pixel.y += (filter[i] * (float)samplePixelValue.y);
+          pixel.z += (filter[i] * (float)samplePixelValue.z);
+          pixel.w += (filter[i] * (float)samplePixelValue.w);
         }
-      case RMSPixelIntensityMethod:
-        {
-          intensity=(float) (sqrt((float) red*red+green*green+
-                blue*blue)/sqrt(3.0));
-          break;
+
+        CLPixelType outputPixel;
+        outputPixel.x = ClampToQuantum(pixel.x);
+        outputPixel.y = ClampToQuantum(pixel.y);
+        outputPixel.z = ClampToQuantum(pixel.z);
+        outputPixel.w = ClampToQuantum(pixel.w);
+        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+      }
+      else {
+        // alpha-weighted path: x/y/z are weighted by filter[i] * alpha and rescaled by PerceptibleReciprocal(gamma) afterwards
+        float gamma = 0.0f;
+        for (int i = 0; i < width; i++) {
+          // only support EdgeVirtualPixelMethod through ClampToCanvas
+          // TODO: implement other virtual pixel method
+          int2 samplePixel = currentPixel + offset[i];
+          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
+          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
+
+          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
+
+          float alpha = QuantumScale*samplePixelValue.w;
+          float k = filter[i];
+          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
+          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
+          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
+          // NOTE(review): w also gets the extra alpha factor and is not rescaled by gamma below - confirm this is intended
+          pixel.w += k * alpha * samplePixelValue.w;
+
+          gamma+=k*alpha;
         }
+        gamma = PerceptibleReciprocal(gamma);
+        pixel.xyz = gamma*pixel.xyz;
 
+        CLPixelType outputPixel;
+        outputPixel.x = ClampToQuantum(pixel.x);
+        outputPixel.y = ClampToQuantum(pixel.y);
+        outputPixel.z = ClampToQuantum(pixel.z);
+        outputPixel.w = ClampToQuantum(pixel.w);
+        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+      }
     }
+  )
 
-    setGray(&filteredPixel, ClampToQuantum(intensity));
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R a n d o m                                                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-    filteredPixel.w = pixel.w;
+STRINGIFY(
 
-    im[c] = filteredPixel;
+  inline float GetPseudoRandomValue(uint4* seed, const float normalizeRand) { // advance the 4-word state once, return normalizeRand * new x word
+    uint4 s = *seed;
+    do {
+      unsigned int alpha = (unsigned int)(s.y ^ (s.y << 11));
+      s.y = s.z;
+      s.z = s.w;
+      s.w = s.x;
+      s.x = (s.x ^ (s.x >> 19)) ^ (alpha ^ (alpha >> 8));
+    } while (s.x == ~0U); // retry on the all-ones word; the literal must be 32-bit (~0UL is 64-bit in OpenCL C and never equals a uint)
+    *seed = s;
+    return (normalizeRand*s.x);
+  }
+  /* Fill randomNumbers with up to numRandomNumbers values; seeds holds four state words per work-item. */
+  __kernel void RandomNumberGenerator(__global uint* seeds, const float normalizeRand
+    , __global float* randomNumbers, const uint init
+    , const uint numRandomNumbers) {
+
+    unsigned int id = get_global_id(0);
+    unsigned int seed[4];
+    // init != 0: only the first word is read from seeds, the other three start from fixed constants
+    if (init != 0) {
+      seed[0] = seeds[id * 4];
+      seed[1] = 0x50a7f451;
+      seed[2] = 0x5365417e;
+      seed[3] = 0xc3a4171a;
+    }
+    else {
+      seed[0] = seeds[id * 4];
+      seed[1] = seeds[id * 4 + 1];
+      seed[2] = seeds[id * 4 + 2];
+      seed[3] = seeds[id * 4 + 3];
+    }
+    // numbers per work-item, rounded up so that all numRandomNumbers slots are covered
+    unsigned int numRandomNumbersPerItem = (numRandomNumbers + get_global_size(0) - 1) / get_global_size(0);
+    for (unsigned int i = 0; i < numRandomNumbersPerItem; i++) {
+      do
+      {
+        unsigned int alpha = (unsigned int)(seed[1] ^ (seed[1] << 11));
+        seed[1] = seed[2];
+        seed[2] = seed[3];
+        seed[3] = seed[0];
+        seed[0] = (seed[0] ^ (seed[0] >> 19)) ^ (alpha ^ (alpha >> 8));
+      } while (seed[0] == ~0U); // 32-bit all-ones literal, see GetPseudoRandomValue
+      unsigned int pos = (get_group_id(0)*get_local_size(0)*numRandomNumbersPerItem)
+        + get_local_size(0) * i + get_local_id(0);
+
+      if (pos >= numRandomNumbers)
+        break;
+      randomNumbers[pos] = normalizeRand*seed[0];
+    }
+
+    /* save the state so that a later launch with init == 0 continues the sequence */
+    seeds[id * 4] = seed[0];
+    seeds[id * 4 + 1] = seed[1];
+    seeds[id * 4 + 2] = seed[2];
+    seeds[id * 4 + 3] = seed[3];
   }
   )
 
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R e s i z e                                                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
   STRINGIFY(
   // Based on Box from resize.c
   float BoxResizeFilter(const float x)
@@ -2286,28 +2848,6 @@ const char* accelerateKernels =
   }
   )
 
-
-  STRINGIFY(
-  typedef enum {
-    BoxWeightingFunction = 0,
-    TriangleWeightingFunction,
-    CubicBCWeightingFunction,
-    HannWeightingFunction,
-    HammingWeightingFunction,
-    BlackmanWeightingFunction,
-    GaussianWeightingFunction,
-    QuadraticWeightingFunction,
-    JincWeightingFunction,
-    SincWeightingFunction,
-    SincFastWeightingFunction,
-    KaiserWeightingFunction,
-    WelshWeightingFunction,
-    BohmanWeightingFunction,
-    LagrangeWeightingFunction,
-    CosineWeightingFunction,
-  } ResizeWeightingFunctionType;
-  )
-
   STRINGIFY(
   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
   {
@@ -2726,739 +3266,397 @@ const char* accelerateKernels =
           filteredPixel *= (float4)density;
           gamma *= density;
         }
-        gamma = PerceptibleReciprocal(gamma);
-
-        CLPixelType fp;
-        fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
-          , ClampToQuantum(gamma*filteredPixel.y)
-          , ClampToQuantum(gamma*filteredPixel.z)
-          , ClampToQuantum(filteredPixel.w));
-
-        filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
-
-      }
-    }
-
-    } // end of chunking loop
-  }
-  )
-
-
-
-  STRINGIFY(
- __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
- void ResizeVerticalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
-  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
-  , const int resizeFilterType, const int resizeWindowType
-  , const __global float* resizeFilterCubicCoefficients
-  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
-  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
-  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
-    ResizeVerticalFilter(inputImage,inputColumns,inputRows,matte
-      ,yFactor,filteredImage,filteredColumns,filteredRows
-      ,SincWeightingFunction, SincWeightingFunction
-      ,resizeFilterCubicCoefficients
-      ,resizeFilterScale,resizeFilterSupport,resizeFilterWindowSupport,resizeFilterBlur
-      ,inputImageCache,numCachedPixels,pixelPerWorkgroup,pixelChunkSize
-      ,outputPixelCache,densityCache,gammaCache);
-  }
-  )
-
-  STRINGIFY(
-
-  inline float GetPseudoRandomValue(uint4* seed, const float normalizeRand) {
-    uint4 s = *seed;
-    do {
-      unsigned int alpha = (unsigned int) (s.y ^ (s.y << 11));
-      s.y=s.z;
-      s.z=s.w;
-      s.w=s.x;
-      s.x = (s.x ^ (s.x >> 19)) ^ (alpha ^ (alpha >> 8));
-    } while (s.x == ~0UL);
-    *seed = s;
-    return (normalizeRand*s.x);
-  }
-
-  __kernel void randomNumberGeneratorKernel(__global uint* seeds, const float normalizeRand
-                                           , __global float* randomNumbers, const uint init
-                                           ,const uint numRandomNumbers) {
-
-    unsigned int id = get_global_id(0);
-    unsigned int seed[4];
-
-    if (init!=0) {
-      seed[0] = seeds[id*4];
-      seed[1] = 0x50a7f451;
-      seed[2] = 0x5365417e;
-      seed[3] = 0xc3a4171a;
-    }
-    else {
-      seed[0] = seeds[id*4];
-      seed[1] = seeds[id*4+1];
-      seed[2] = seeds[id*4+2];
-      seed[3] = seeds[id*4+3];
-    }
-
-    unsigned int numRandomNumbersPerItem = (numRandomNumbers+get_global_size(0)-1)/get_global_size(0);
-    for (unsigned int i = 0; i < numRandomNumbersPerItem; i++) {
-      do
-      {
-        unsigned int alpha=(unsigned int) (seed[1] ^ (seed[1] << 11));
-        seed[1]=seed[2];
-        seed[2]=seed[3];
-        seed[3]=seed[0];
-        seed[0]=(seed[0] ^ (seed[0] >> 19)) ^ (alpha ^ (alpha >> 8));
-      } while (seed[0] == ~0UL);
-      unsigned int pos = (get_group_id(0)*get_local_size(0)*numRandomNumbersPerItem) 
-                          + get_local_size(0) * i + get_local_id(0);
-
-      if (pos >= numRandomNumbers)
-        break;
-      randomNumbers[pos] = normalizeRand*seed[0];
-    }
-
-    /* save the seeds for the time*/
-    seeds[id*4]   = seed[0];
-    seeds[id*4+1] = seed[1];
-    seeds[id*4+2] = seed[2];
-    seeds[id*4+3] = seed[3];
-  }
-
-  )
-
-
-OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
-OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f))
-OPENCL_DEFINE(SigmaImpulse,  (attenuate*0.1f))
-OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
-OPENCL_DEFINE(SigmaMultiplicativeGaussian,  (attenuate*0.5f))
-OPENCL_DEFINE(SigmaPoisson,  (attenuate*12.5f))
-OPENCL_DEFINE(SigmaRandom,  (attenuate))
-OPENCL_DEFINE(TauGaussian,  (attenuate*0.078125f))
-
-STRINGIFY(
-
-/*
-Part of MWC64X by David Thomas, dt10@imperial.ac.uk
-This is provided under BSD, full license is with the main package.
-See http://www.doc.ic.ac.uk/~dt10/research
-*/
-
-// Pre: a<M, b<M
-// Post: r=(a+b) mod M
-ulong MWC_AddMod64(ulong a, ulong b, ulong M)
-{
-       ulong v=a+b;
-       //if( (v>=M) || (v<a) )
-       if( (v>=M) || (convert_float(v) < convert_float(a)) )   // workaround for what appears to be an optimizer bug.
-               v=v-M;
-       return v;
-}
-
-// Pre: a<M,b<M
-// Post: r=(a*b) mod M
-// This could be done more efficently, but it is portable, and should
-// be easy to understand. It can be replaced with any of the better
-// modular multiplication algorithms (for example if you know you have
-// double precision available or something).
-ulong MWC_MulMod64(ulong a, ulong b, ulong M)
-{      
-       ulong r=0;
-       while(a!=0){
-               if(a&1)
-                       r=MWC_AddMod64(r,b,M);
-               b=MWC_AddMod64(b,b,M);
-               a=a>>1;
-       }
-       return r;
-}
-
-
-// Pre: a<M, e>=0
-// Post: r=(a^b) mod M
-// This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
-// most architectures
-ulong MWC_PowMod64(ulong a, ulong e, ulong M)
-{
-       ulong sqr=a, acc=1;
-       while(e!=0){
-               if(e&1)
-                       acc=MWC_MulMod64(acc,sqr,M);
-               sqr=MWC_MulMod64(sqr,sqr,M);
-               e=e>>1;
-       }
-       return acc;
-}
-
-uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
-{
-       ulong m=MWC_PowMod64(A, distance, M);
-       ulong x=curr.x*(ulong)A+curr.y;
-       x=MWC_MulMod64(x, m, M);
-       return (uint2)((uint)(x/A), (uint)(x%A));
-}
-
-uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
-{
-       // This is an arbitrary constant for starting LCG jumping from. I didn't
-       // want to start from 1, as then you end up with the two or three first values
-       // being a bit poor in ones - once you've decided that, one constant is as
-       // good as any another. There is no deep mathematical reason for it, I just
-       // generated a random number.
-       enum{ MWC_BASEID = 4077358422479273989UL };
-       
-       ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
-       ulong m=MWC_PowMod64(A, dist, M);
-       
-       ulong x=MWC_MulMod64(MWC_BASEID, m, M);
-       return (uint2)((uint)(x/A), (uint)(x%A));
-}
-
-//! Represents the state of a particular generator
-typedef struct{ uint x; uint c; } mwc64x_state_t;
-
-enum{ MWC64X_A = 4294883355U };
-enum{ MWC64X_M = 18446383549859758079UL };
-
-void MWC64X_Step(mwc64x_state_t *s)
-{
-       uint X=s->x, C=s->c;
-       
-       uint Xn=MWC64X_A*X+C;
-       uint carry=(uint)(Xn<C);                                // The (Xn<C) will be zero or one for scalar
-       uint Cn=mad_hi(MWC64X_A,X,carry);  
-       
-       s->x=Xn;
-       s->c=Cn;
-}
-
-void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
-{
-       uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
-       s->x=tmp.x;
-       s->c=tmp.y;
-}
+        gamma = PerceptibleReciprocal(gamma);
 
-void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
-{
-       uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
-       s->x=tmp.x;
-       s->c=tmp.y;
-}
+        CLPixelType fp;
+        fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
+          , ClampToQuantum(gamma*filteredPixel.y)
+          , ClampToQuantum(gamma*filteredPixel.z)
+          , ClampToQuantum(filteredPixel.w));
 
-//! Return a 32-bit integer in the range [0..2^32)
-uint MWC64X_NextUint(mwc64x_state_t *s)
-{
-       uint res=s->x ^ s->c;
-       MWC64X_Step(s);
-       return res;
-}
+        filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
 
-//
-// End of MWC64X excerpt
-//
+      }
+    }
 
+    } // end of chunking loop
+  }
+  )
 
-  typedef enum
-  {
-    UndefinedNoise,
-    UniformNoise,
-    GaussianNoise,
-    MultiplicativeGaussianNoise,
-    ImpulseNoise,
-    LaplacianNoise,
-    PoissonNoise,
-    RandomNoise
-  } NoiseType;
 
 
-  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
-       return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff);     // normalized to 1.0
+  STRINGIFY(
+ __kernel __attribute__((reqd_work_group_size(1, 256, 1)))  // Sinc-specialised entry point; the attribute fixes the work-group size at 1x256x1
+ void ResizeVerticalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
+  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
+  , const int resizeFilterType, const int resizeWindowType  // accepted but not forwarded: the call below passes SincWeightingFunction for both
+  , const __global float* resizeFilterCubicCoefficients
+  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
+  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
+  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
+    ResizeVerticalFilter(inputImage,inputColumns,inputRows,matte  // thin wrapper: every other argument is passed through unchanged
+      ,yFactor,filteredImage,filteredColumns,filteredRows
+      ,SincWeightingFunction, SincWeightingFunction  // filter type and window type are both hard-coded to Sinc
+      ,resizeFilterCubicCoefficients
+      ,resizeFilterScale,resizeFilterSupport,resizeFilterWindowSupport,resizeFilterBlur
+      ,inputImageCache,numCachedPixels,pixelPerWorkgroup,pixelChunkSize
+      ,outputPixelCache,densityCache,gammaCache);
   }
+  )
 
-  
-  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
-    float 
-      alpha,
-      beta,
-      noise,
-      sigma;
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R o t a t i o n a l B l u r                                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-    noise = 0.0f;
-    alpha=mwcReadPseudoRandomValue(r);
-    switch(noise_type) {
-    case UniformNoise:
-    default:
-      {
-        noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
-        break;
-      }
-    case GaussianNoise:
+  STRINGIFY(
+    __kernel void RotationalBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,  // one work-item per output pixel: averages samples of im taken around blurCenter
+                                 const float4 bias,  // starting value of the accumulator
+                                 const unsigned int channel, const unsigned int matte,  // together they select the plain or the alpha-weighted path
+                                 const float2 blurCenter,
+                                 __constant float *cos_theta, __constant float *sin_theta,  // lookup tables, always read with the same index
+                                 const unsigned int cossin_theta_size)  // upper bound for the table index
       {
-        float
-          gamma,
-          tau;
-
-        if (alpha == 0.0f)
-          alpha=1.0f;
-        beta=mwcReadPseudoRandomValue(r);
-        gamma=sqrt(-2.0f*log(alpha));
-        sigma=gamma*cospi((2.0f*beta));
-        tau=gamma*sinpi((2.0f*beta));
-        noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
-                      QuantumRange*TauGaussian*tau);
-        break;
-      }
-
+        const int x = get_global_id(0);  // output pixel column
+        const int y = get_global_id(1);  // output pixel row
+        const int columns = get_global_size(0);  // image width is taken from the NDRange size
+        const int rows = get_global_size(1);  // image height is taken from the NDRange size
+        unsigned int step = 1;  // stride through the angle tables
+        float center_x = (float) x - blurCenter.x;  // offset of this pixel from the blur center
+        float center_y = (float) y - blurCenter.y;
+        float radius = hypot(center_x, center_y);  // distance of this pixel from the blur center
+        
+        //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
+        float blur_radius = hypot(blurCenter.x, blurCenter.y);  // distance from the blur center to pixel (0,0)
 
-    case ImpulseNoise:
-    {
-      if (alpha < (SigmaImpulse/2.0f))
-        noise=0.0f;
-      else
-        if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
-          noise=(float)QuantumRange;
-        else
-          noise=(float)pixel;
-      break;
-    }
-    case LaplacianNoise:
-    {
-      if (alpha <= 0.5f)
+        if (radius > MagickEpsilon)  // guards the division below
         {
-          if (alpha <= MagickEpsilon)
-            noise=(float) (pixel-QuantumRange);
-          else
-            noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
-              0.5f);
-          break;
+          step = (unsigned int) (blur_radius / radius);  // larger stride, hence fewer samples, the closer the pixel is to the center
+          if (step == 0)
+            step = 1;
+          if (step >= cossin_theta_size)
+            step = cossin_theta_size-1;  // NOTE(review): this is 0 when cossin_theta_size is 1 and the loops below advance by step - confirm the host never passes 1
         }
-      beta=1.0f-alpha;
-      if (beta <= (0.5f*MagickEpsilon))
-        noise=(float) (pixel+QuantumRange);
-      else
-        noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
-      break;
-    }
-    case MultiplicativeGaussianNoise:
-    {
-      sigma=1.0f;
-      if (alpha > MagickEpsilon)
-        sigma=sqrt(-2.0f*log(alpha));
-      beta=mwcReadPseudoRandomValue(r);
-      noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
-        cospi((float) (2.0f*beta))/2.0f);
-      break;
-    }
-    case PoissonNoise:
-    {
-      float 
-        poisson;
-      unsigned int i;
-      poisson=exp(-SigmaPoisson*QuantumScale*pixel);
-      for (i=0; alpha > poisson; i++)
-      {
-        beta=mwcReadPseudoRandomValue(r);
-        alpha*=beta;
-      }
-      noise=(float) (QuantumRange*i/SigmaPoisson);
-      break;
-    }
-    case RandomNoise:
-    {
-      noise=(float) (QuantumRange*SigmaRandom*alpha);
-      break;
-    }
-
-    };
-    return noise;
-  }
-
 
+        float4 result;
+        result.x = (float)bias.x;
+        result.y = (float)bias.y;
+        result.z = (float)bias.z;
+        result.w = (float)bias.w;
+        float normalize = 0.0f;  // counts the samples taken
 
+        if (((channel & AlphaChannel) == 0) || (matte == 0)) {  // plain average: alpha is not used as a weight
+          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+          {
+            result += convert_float4(im[
+              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+  // sample column: offset combined with table entry i, clamped by ClampToCanvas
+                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);  // sample row, turned into a linear index
+              normalize += 1.0f;
+          }
+          normalize = PerceptibleReciprocal(normalize);  // sample count becomes a scale factor
+          result = result * normalize;  // bias was the starting value, so it is scaled together with the sum
+        }
+        else {  // alpha-weighted average of the color channels
+          float gamma = 0.0f;  // sum of the alpha weights
+          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+          {
+            float4 p = convert_float4(im[
+              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+  // same sampling position as in the plain path
+                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
+            
+            float alpha = (float)(QuantumScale*p.w);  // weight derived from the w component of the sample
+            result.x += alpha * p.x;
+            result.y += alpha * p.y;
+            result.z += alpha * p.z;
+            result.w += p.w;  // w itself is accumulated unweighted
+            gamma+=alpha;
+            normalize += 1.0f;
+          }
+          gamma = PerceptibleReciprocal(gamma);
+          normalize = PerceptibleReciprocal(normalize);
+          result.x = gamma*result.x;  // x, y and z are scaled by the reciprocal of the summed weights
+          result.y = gamma*result.y;
+          result.z = gamma*result.z;
+          result.w = normalize*result.w;  // w is scaled by the reciprocal of the sample count
+        }
+        filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),  // each component goes through ClampToQuantum before the store
+          ClampToQuantum(result.z), ClampToQuantum(result.w)); 
+      }
+  )
 
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     U n s h a r p M a s k                                                   %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-  __kernel
-  void GenerateNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
-                    ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
-                    ,const ChannelType channel 
-                    ,const NoiseType noise_type, const float attenuate
-                    ,const unsigned int seed0, const unsigned int seed1
-                                       ,const unsigned int numRandomNumbersPerPixel) {
+    STRINGIFY(
+    __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,  // convolves one column of blurRowData with filter, then sharpens inputImage against the result
+          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+          const unsigned int imageColumns, const unsigned int imageRows, 
+          __local float4* cachedData, __local float* cachedFilter,
+          const ChannelType channel, const __global float *filter, const unsigned int width,  // NOTE(review): channel is not referenced anywhere in this kernel body
+          const float gain, const float threshold)
+    {
+      const unsigned int radius = (width-1)/2;  // extra rows cached before and after the rows of this work-group
 
-       mwc64x_state_t rng;
-       rng.x = seed0;
-       rng.c = seed1;
+      // cache the column segment shared by the work-group, extended by radius rows at both ends
+      const int groupX = get_group_id(0);  // used as the column index below; NOTE(review): presumes one work-group per column in dimension 0 - confirm against the host launch
+      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
+      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
 
-       uint span = pixelsPerWorkItem * numRandomNumbersPerPixel;       // length of RNG substream each workitem will use
-       uint offset = span * get_local_size(0) * get_group_id(0);       // offset of this workgroup's RNG substream (in master stream);
+      if (groupStartY >= 0  // fast path: every needed row lies inside the image
+          && groupStopY < imageRows) {
+        event_t e = async_work_group_strided_copy(cachedData
+                                                ,blurRowData+groupStartY*imageColumns+groupX
+                                                ,groupStopY-groupStartY,imageColumns,0);  // element count, then a source stride of one image row
+        wait_group_events(1,&e);
+      }
+      else {  // edge path: work-items copy the rows themselves, with the row index clamped by ClampToCanvas
+        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
+          cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);  // local-memory barrier before other work-items read cachedData
+      }
+      // cache the filter as well
+      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
+      wait_group_events(1,&e);
 
-       MWC64X_SeedStreams(&rng, offset, span);                                         // Seed the RNG streams
+      // only do the work if this work-item maps to a row inside the image
+      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
+      const int cy = get_global_id(1);
 
-       uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0);   // pixel to process
+      if (cy < imageRows) {
+        float4 blurredPixel = (float4) 0.0f;
 
-       uint count = pixelsPerWorkItem;
+        int i = 0;  // filter tap index, shared by the unrolled loop and the remainder loop
 
-       while (count > 0) {
-               if (pos < inputPixelCount) {
-                       CLPixelType p = inputImage[pos];
+        \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
 
-                       if ((channel&RedChannel)!=0) {
-                         setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
-                       }
-    
-                       if ((channel&GreenChannel)!=0) {
-                         setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
-                       }
+          for ( ; i+UFACTOR < width; )  // main loop: UFACTOR filter taps per outer iteration, default 8 unless UFACTOR is already defined
+          {
+            \n #pragma unroll UFACTOR \n
+              for (int j=0; j < UFACTOR; j++, i++)
+              {
+                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+              }
+          }
 
-                       if ((channel&BlueChannel)!=0) {
-                         setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
-                       }
+        for ( ; i < width; i++)  // remaining taps not covered by the unrolled loop
+        {
+          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+        }
 
-                       if ((channel & AlphaChannel) != 0) {
-                         setAlpha(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getAlpha(p),noise_type,attenuate)));
-                       }
+        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)  // each channel goes through ClampToQuantum, then floor
+                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
 
-                       filteredImage[pos] = p;
-                       //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
-               }
-               pos += get_local_size(0);
-               --count;
-       }
-  }
-  )
+        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
+        float4 outputPixel = inputImagePixel - blurredPixel;  // difference between the original and the blurred pixel
 
+        float quantumThreshold = QuantumRange*threshold;  // threshold scaled by QuantumRange
 
-  STRINGIFY(
-    __kernel 
-    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
-                    const unsigned int imageWidth, const unsigned int imageHeight,
-                    const __global float *filter, const unsigned int width, const __global int2* offset,
-                    const float4 bias,
-                    const ChannelType channel, const unsigned int matte) {
+        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);  // set per channel where twice the absolute difference is below the threshold
+        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);  // masked channels keep the input value, the others get input + difference*gain
 
-      int2 currentPixel;
-      currentPixel.x = get_global_id(0);
-      currentPixel.y = get_global_id(1);
+        //write back
+        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
+                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
 
-      if (currentPixel.x >= imageWidth
-          || currentPixel.y >= imageHeight)
-          return;
+      }
+    }
 
-      float4 pixel;
-      pixel.x = (float)bias.x;
-      pixel.y = (float)bias.y;
-      pixel.z = (float)bias.z;
-      pixel.w = (float)bias.w;
+    __kernel void UnsharpMaskBlurColumnSection(const __global CLPixelType* inputImage, 
+          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+          const unsigned int imageColumns, const unsigned int imageRows, 
+          __local float4* cachedData, __local float* cachedFilter,
+          const ChannelType channel, const __global float *filter, const unsigned int width, 
+          const float gain, const float threshold, 
+          const unsigned int offsetRows, const unsigned int section)
+    {
+      const unsigned int radius = (width-1)/2;
 
-      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
-        
-        for (int i = 0; i < width; i++) {
-          // only support EdgeVirtualPixelMethod through ClampToCanvas
-          // TODO: implement other virtual pixel method
-          int2 samplePixel = currentPixel + offset[i];
-          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
-          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
-          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
+      // cache the pixel shared by the workgroup
+      const int groupX = get_group_id(0);
+      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
+      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
 
-          pixel.x += (filter[i] * (float)samplePixelValue.x);
-          pixel.y += (filter[i] * (float)samplePixelValue.y);
-          pixel.z += (filter[i] * (float)samplePixelValue.z);
-          pixel.w += (filter[i] * (float)samplePixelValue.w);
-        }
+      // offset the input data
+      blurRowData += imageColumns * radius * section;
 
-        CLPixelType outputPixel;
-        outputPixel.x = ClampToQuantum(pixel.x);
-        outputPixel.y = ClampToQuantum(pixel.y);
-        outputPixel.z = ClampToQuantum(pixel.z);
-        outputPixel.w = ClampToQuantum(pixel.w);
-        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+      if (groupStartY >= 0
+          && groupStopY < imageRows) {
+        event_t e = async_work_group_strided_copy(cachedData
+                                                ,blurRowData+groupStartY*imageColumns+groupX
+                                                ,groupStopY-groupStartY,imageColumns,0);
+        wait_group_events(1,&e);
       }
       else {
-
-        float gamma = 0.0f;
-        for (int i = 0; i < width; i++) {
-          // only support EdgeVirtualPixelMethod through ClampToCanvas
-          // TODO: implement other virtual pixel method
-          int2 samplePixel = currentPixel + offset[i];
-          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
-          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
-
-          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
-
-          float alpha = QuantumScale*samplePixelValue.w;
-          float k = filter[i];
-          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
-          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
-          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
-
-          pixel.w += k * alpha * samplePixelValue.w;
-
-          gamma+=k*alpha;
+        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
+          int pos = ClampToCanvasWithHalo(groupStartY+i,imageRows, radius, section)*imageColumns+ groupX;
+          cachedData[i] = *(blurRowData + pos);
         }
-        gamma = PerceptibleReciprocal(gamma);
-        pixel.xyz = gamma*pixel.xyz;
-
-        CLPixelType outputPixel;
-        outputPixel.x = ClampToQuantum(pixel.x);
-        outputPixel.y = ClampToQuantum(pixel.y);
-        outputPixel.z = ClampToQuantum(pixel.z);
-        outputPixel.w = ClampToQuantum(pixel.w);
-        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
+        barrier(CLK_LOCAL_MEM_FENCE);
       }
-    }
-  )
-
-  STRINGIFY(
-    typedef enum
-    {
-      UndefinedCompositeOp,
-      NoCompositeOp,
-      ModulusAddCompositeOp,
-      AtopCompositeOp,
-      BlendCompositeOp,
-      BumpmapCompositeOp,
-      ChangeMaskCompositeOp,
-      ClearCompositeOp,
-      ColorBurnCompositeOp,
-      ColorDodgeCompositeOp,
-      ColorizeCompositeOp,
-      CopyBlackCompositeOp,
-      CopyBlueCompositeOp,
-      CopyCompositeOp,
-      CopyCyanCompositeOp,
-      CopyGreenCompositeOp,
-      CopyMagentaCompositeOp,
-      CopyOpacityCompositeOp,
-      CopyRedCompositeOp,
-      CopyYellowCompositeOp,
-      DarkenCompositeOp,
-      DstAtopCompositeOp,
-      DstCompositeOp,
-      DstInCompositeOp,
-      DstOutCompositeOp,
-      DstOverCompositeOp,
-      DifferenceCompositeOp,
-      DisplaceCompositeOp,
-      DissolveCompositeOp,
-      ExclusionCompositeOp,
-      HardLightCompositeOp,
-      HueCompositeOp,
-      InCompositeOp,
-      LightenCompositeOp,
-      LinearLightCompositeOp,
-      LuminizeCompositeOp,
-      MinusDstCompositeOp,
-      ModulateCompositeOp,
-      MultiplyCompositeOp,
-      OutCompositeOp,
-      OverCompositeOp,
-      OverlayCompositeOp,
-      PlusCompositeOp,
-      ReplaceCompositeOp,
-      SaturateCompositeOp,
-      ScreenCompositeOp,
-      SoftLightCompositeOp,
-      SrcAtopCompositeOp,
-      SrcCompositeOp,
-      SrcInCompositeOp,
-      SrcOutCompositeOp,
-      SrcOverCompositeOp,
-      ModulusSubtractCompositeOp,
-      ThresholdCompositeOp,
-      XorCompositeOp,
-      /* These are new operators, added after the above was last sorted.
-       * The list should be re-sorted only when a new library version is
-       * created.
-       */
-      DivideDstCompositeOp,
-      DistortCompositeOp,
-      BlurCompositeOp,
-      PegtopLightCompositeOp,
-      VividLightCompositeOp,
-      PinLightCompositeOp,
-      LinearDodgeCompositeOp,
-      LinearBurnCompositeOp,
-      MathematicsCompositeOp,
-      DivideSrcCompositeOp,
-      MinusSrcCompositeOp,
-      DarkenIntensityCompositeOp,
-      LightenIntensityCompositeOp
-    } CompositeOperator;
-  )
+      // cache the filter as well
+      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
+      wait_group_events(1,&e);
 
-  STRINGIFY(
-    inline float ColorDodge(const float Sca,
-      const float Sa,const float Dca,const float Da)
-    {
-      /*
-        Oct 2004 SVG specification.
-      */
-      if ((Sca*Da+Dca*Sa) >= Sa*Da)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
+      // only do the work if this is not a patched item
+      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
+      const int cy = get_global_id(1);
 
+      if (cy < imageRows) {
+        float4 blurredPixel = (float4) 0.0f;
 
-      /*
-        New specification, March 2009 SVG specification.  This specification was
-        also wrong of non-overlap cases.
-      */
-      /*
-      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
-        return(Sca*(1.0-Da));
-      if (fabs(Sca-Sa) < MagickEpsilon)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
-      */
+        int i = 0;
 
-      /*
-        Working from first principles using the original formula:
+        \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
 
-           f(Sc,Dc) = Dc/(1-Sc)
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+              for (int j=0; j < UFACTOR; j++, i++)
+              {
+                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+              }
+          }
 
-        This works correctly! Looks like the 2004 model was right but just
-        required a extra condition for correct handling.
-      */
+        for ( ; i < width; i++)
+        {
+          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+        }
 
-      /*
-      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
-        return(Sca*(1.0-Da)+Dca*(1.0-Sa));
-      if (fabs(Sca-Sa) < MagickEpsilon)
-        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
-      */
-    }
+        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
+                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
 
-    inline void CompositeColorDodge(const float4 *p,
-      const float4 *q,float4 *composite) {
+        // offset the output data
+        inputImage += imageColumns * offsetRows; 
+        filtered_im += imageColumns * offsetRows;
 
-      float 
-      Da,
-      gamma,
-      Sa;
+        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
+        float4 outputPixel = inputImagePixel - blurredPixel;
 
-      Sa=QuantumScale*getAlphaF4(*p);  /* simplify and speed up equations */
-      Da=QuantumScale*getAlphaF4(*q);
-      gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
-      setAlphaF4(composite,QuantumRange*gamma);
-      gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
-      setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
-        getRedF4(*q)*Da,Da));
-      setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
-        getGreenF4(*q)*Da,Da));
-      setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
-        getBlueF4(*q)*Da,Da));
-    }
-  )
+        float quantumThreshold = QuantumRange*threshold;
 
-  STRINGIFY(
-    inline void MagickPixelCompositePlus(const float4 *p,
-      const float alpha,const float4 *q,
-      const float beta,float4 *composite)
-    {
-      float 
-        gamma;
+        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
+        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
 
-      float
-        Da,
-        Sa;
-      /*
-        Add two pixels with the given opacities.
-      */
-      Sa=QuantumScale*alpha;
-      Da=QuantumScale*beta;
-      gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
-      setAlphaF4(composite,(float) QuantumRange*gamma);
-      gamma=PerceptibleReciprocal(gamma);
-      setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
-      setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
-      setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
-    }
-  )
+        //write back
+        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
+                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
 
-  STRINGIFY(
-    inline void MagickPixelCompositeBlend(const float4 *p,
-      const float alpha,const float4 *q,
-      const float beta,float4 *composite)
-    {
-      MagickPixelCompositePlus(p,(float) (alpha*
-      (getAlphaF4(*p))),q,(float) (beta*
-      (getAlphaF4(*q))),composite);
+      }
+     
     }
-  )
-  
-  STRINGIFY(
-    __kernel 
-    void Composite(__global CLPixelType *image,
-                   const unsigned int imageWidth, 
-                   const unsigned int imageHeight,
-                   const __global CLPixelType *compositeImage,
-                   const unsigned int compositeWidth, 
-                   const unsigned int compositeHeight,
-                   const unsigned int compose,
-                   const ChannelType channel, 
-                   const unsigned int matte,
-                   const float destination_dissolve,
-                   const float source_dissolve) {
+    )
 
-      uint2 index;
-      index.x = get_global_id(0);
-      index.y = get_global_id(1);
 
+    STRINGIFY(
+      // Separable unsharp mask: pass 1 filters along x into the local tile, pass 2 filters the tile along y.
+      __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
+                         __constant float *filter, const unsigned int width,
+                         const unsigned int imageColumns, const unsigned int imageRows,
+                         __local float4 *pixels, 
+                         const float gain, const float threshold, const unsigned int justBlur)
+      {
+        const int x = get_global_id(0);
+        const int y = get_global_id(1);
+
+        const unsigned int radius = (width - 1) / 2;
+               // pass 1: this work-item filters tile rows row, row + get_local_size(1), ... up to endRow
+               int row = y - radius;
+               int baseRow = get_group_id(1) * get_local_size(1) - radius;
+               int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
+                               
+               while (row < endRow) {
+                       int srcy =  (row < 0) ? -row : row;                     // mirror pad
+                       srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
+                                       
+                       float4 value = 0.0f;
+                                       
+                       int ix = x - radius;
+                       int i = 0;
 
-      if (index.x >= imageWidth
-        || index.y >= imageHeight) {
-          return;
-      }
-      const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
-      float4 destination;
-      setRedF4(&destination,getRed(inputPixel));
-      setGreenF4(&destination,getGreen(inputPixel));
-      setBlueF4(&destination,getBlue(inputPixel));
+                       while (i + 7 < width) {
+                               for (int j = 0; j < 8; ++j) {           // unrolled
+                                       int srcx = ix + j;
+                                       srcx = (srcx < 0) ? -srcx : srcx;
+                                       srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
+                                       value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
+                               }
+                               ix += 8;
+                               i += 8;
+                       }
 
-      
-      const CLPixelType compositePixel 
-        = compositeImage[index.y*imageWidth+index.x];
-      float4 source;
-      setRedF4(&source,getRed(compositePixel));
-      setGreenF4(&source,getGreen(compositePixel));
-      setBlueF4(&source,getBlue(compositePixel));
+                       while (i < width) {
+                               int srcx = (ix < 0) ? -ix : ix;                 // mirror pad
+                               srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
+                               value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
+                               ++i;
+                               ++ix;
+                       }       
+                       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
+                       row += get_local_size(1);
+               }
+                               
+               // every tile row must be stored before any work-item starts reading the tile
+               barrier(CLK_LOCAL_MEM_FENCE);
 
-      if (matte != 0) {
-        setAlphaF4(&destination,getAlpha(inputPixel));
-        setAlphaF4(&source,getAlpha(compositePixel));
-      }
-      else {
-        setAlphaF4(&destination,1.0f);
-        setAlphaF4(&source,1.0f);
-      }
+               // pass 2: vertical convolution; tap i pairs filter[i] with tile row (py + i)
+               const int px = get_local_id(0);
+               const int py = get_local_id(1);
+               const int prp = get_local_size(0);
+               float4 value = (float4)(0.0f);
+                       
+               int i = 0;
+               while (i + 7 < width) {                 // unrolled
+                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
+                       value += (float4)(filter[i + 1]) * pixels[px + (py + i + 1) * prp];
+                       value += (float4)(filter[i + 2]) * pixels[px + (py + i + 2) * prp];
+                       value += (float4)(filter[i + 3]) * pixels[px + (py + i + 3) * prp];
+                       value += (float4)(filter[i + 4]) * pixels[px + (py + i + 4) * prp];
+                       value += (float4)(filter[i + 5]) * pixels[px + (py + i + 5) * prp];
+                       value += (float4)(filter[i + 6]) * pixels[px + (py + i + 6) * prp];
+                       value += (float4)(filter[i + 7]) * pixels[px + (py + i + 7) * prp];
+                       i += 8;
+               }
+               while (i < width) {
+                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
+                       ++i;
+               }
 
-      float4 composite=destination;
+               if ((justBlur == 0) && (x < imageColumns) && (y < imageRows)) {         // apply sharpening; padding work-items must not read im
+                       float4 srcPixel = convert_float4(im[x + y * imageColumns]);
+                       float4 diff = srcPixel - value;
 
-      CompositeOperator op = (CompositeOperator)compose;
-      switch (op) {
-      case ColorDodgeCompositeOp:
-        CompositeColorDodge(&source,&destination,&composite);
-        break;
-      case BlendCompositeOp:
-        MagickPixelCompositeBlend(&source,source_dissolve,&destination,
-            destination_dissolve,&composite);
-        break;
-      default:
-        // unsupported operators
-        break;
-      };
+                       float quantumThreshold = QuantumRange*threshold;
+                       // keep the source pixel where |2*diff| is below the threshold, else add gain*diff
+                       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
+                       value = select(srcPixel + diff * gain, srcPixel, mask);
+               }
+       
+               if ((x < imageColumns) && (y < imageRows))
+                       filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
+               }       
+       )
 
-      CLPixelType outputPixel;
-      setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
-      setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
-      setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
-      setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite)));
-      image[index.y*imageWidth+index.x] = outputPixel;
-    }
-  )   
-    
   ;
 
 #endif // MAGICKCORE_OPENCL_SUPPORT
index cc7a5f633d2b9ebd49a9c0137bb818ff727d6190..aca16528e9823e7ee7cbb9cfb0662e75c9e6aca9 100644 (file)
@@ -88,47 +88,37 @@ Include declarations.
 
 #if defined(MAGICKCORE_OPENCL_SUPPORT)
 
+/*
+  Define declarations.
+*/
 #define ALIGNED(pointer,type) ((((size_t)(pointer)) & (sizeof(type)-1)) == 0)
 
-/* pad the global workgroup size to the next multiple of 
-   the local workgroup size */
-inline static unsigned int padGlobalWorkgroupSizeToLocalWorkgroupSize(
-  const unsigned int orgGlobalSize,const unsigned int localGroupSize) 
-{
-  return ((orgGlobalSize+(localGroupSize-1))/localGroupSize*localGroupSize);
-}
-
-static MagickBooleanType checkOpenCLEnvironment(ExceptionInfo* exception)
+/*
+  Static declarations.
+*/
+static const ResizeWeightingFunctionType supportedResizeWeighting[] =
 {
-  MagickBooleanType
-    flag;
-
-  MagickCLEnv
-    clEnv;
-
-  clEnv=GetDefaultOpenCLEnv();
-
-  GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED,
-    sizeof(MagickBooleanType),&flag,exception);
-  if (flag != MagickFalse)
-    return(MagickFalse);
-
-  GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED,
-    sizeof(MagickBooleanType),&flag,exception);
-  if (flag == MagickFalse)
-    {
-      if (InitOpenCLEnv(clEnv,exception) == MagickFalse)
-        return(MagickFalse);
-
-      GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED,
-        sizeof(MagickBooleanType),&flag,exception);
-      if (flag != MagickFalse)
-        return(MagickFalse);
-    }
+  BoxWeightingFunction,
+  TriangleWeightingFunction,
+  HannWeightingFunction,
+  HammingWeightingFunction,
+  BlackmanWeightingFunction,
+  CubicBCWeightingFunction,
+  SincWeightingFunction,
+  SincFastWeightingFunction,
+  LastWeightingFunction
+};
 
-  return(MagickTrue);
-}
+/*
+  Forward declarations.
+*/
+static Image *ComputeUnsharpMaskImageSingle(const Image *image,
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception);
 
+/*
+  Helper functions.
+*/
 static MagickBooleanType checkAccelerateCondition(const Image* image,
   const ChannelType channel)
 {
@@ -187,6 +177,45 @@ static MagickBooleanType checkHistogramCondition(Image *image,
   return MagickTrue;
 }
 
+static MagickBooleanType checkOpenCLEnvironment(ExceptionInfo* exception)
+{
+  /* MagickTrue only when OpenCL is not disabled, initializing it on demand. */
+  MagickBooleanType
+    flag;
+
+  MagickCLEnv
+    clEnv;
+
+  clEnv=GetDefaultOpenCLEnv();
+  flag=MagickTrue; /* counts as disabled should the query leave flag unset */
+  GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED,
+    sizeof(MagickBooleanType),&flag,exception);
+  if (flag != MagickFalse)
+    return(MagickFalse);
+  flag=MagickFalse; /* counts as uninitialized should the query leave it unset */
+  GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED,
+    sizeof(MagickBooleanType),&flag,exception);
+  if (flag == MagickFalse)
+    {
+      if (InitOpenCLEnv(clEnv,exception) == MagickFalse)
+        return(MagickFalse);
+      flag=MagickTrue; /* query the disabled flag again after initialization */
+      GetMagickOpenCLEnvParam(clEnv,MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED,
+        sizeof(MagickBooleanType),&flag,exception);
+      if (flag != MagickFalse)
+        return(MagickFalse);
+    }
+  return(MagickTrue);
+}
+
+/* pad the global workgroup size up to the next multiple of the local workgroup
+   size (must be non-zero); the remainder form cannot wrap in an intermediate */
+static inline unsigned int padGlobalWorkgroupSizeToLocalWorkgroupSize(
+  const unsigned int orgGlobalSize,const unsigned int localGroupSize)
+{
+  return(orgGlobalSize+(localGroupSize-orgGlobalSize%localGroupSize)%localGroupSize);
+}
+
 static MagickBooleanType splitImage(const Image* image)
 {
   MagickBooleanType
@@ -213,35 +242,16 @@ static MagickBooleanType splitImage(const Image* image)
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     C o n v o l v e I m a g e  w i t h  O p e n C L                         %
+%     A c c e l e r a t e A d d N o i s e I m a g e                           %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  ConvolveImage() applies a custom convolution kernel to the image.
-%
-%  The format of the ConvolveImage method is:
-%
-%      Image *ConvolveImage(const Image *image,const size_t order,
-%        const double *kernel,ExceptionInfo *exception)
-%      Image *ConvolveImageChannel(const Image *image,const ChannelType channel,
-%        const size_t order,const double *kernel,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel type.
-%
-%    o kernel: kernel info.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static Image *ComputeConvolveImage(const Image* image,
-  const ChannelType channel,const KernelInfo *kernel,ExceptionInfo *exception)
+static Image *ComputeAddNoiseImage(const Image *image,
+  const ChannelType channel,const NoiseType noise_type,
+  ExceptionInfo *exception)
 {
   CacheView
     *filteredImage_view,
@@ -253,31 +263,33 @@ static Image *ComputeConvolveImage(const Image* image,
   cl_context
     context;
 
-  cl_kernel
-    clkernel;
-
   cl_int
+    inputPixelCount,
+    pixelsPerWorkitem,
     clStatus;
 
-  cl_mem
-    convolutionKernel,
-    filteredImageBuffer,
-    imageBuffer;
+  cl_uint
+    seed0,
+    seed1;
+
+  cl_kernel
+    addNoiseKernel;
 
   cl_mem_flags
     mem_flags;
 
-  cl_ulong
-    deviceLocalMemorySize;
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer;
+
+  const char
+    *option;
 
   const void
     *inputPixels;
 
   float
-    *kernelBufferPtr;
-
-  Image
-    *filteredImage;
+    attenuate;
 
   MagickBooleanType
     outputReady;
@@ -288,54 +300,53 @@ static Image *ComputeConvolveImage(const Image* image,
   MagickSizeType
     length;
 
-  size_t
-    global_work_size[3],
-    localGroupSize[3],
-    localMemoryRequirement;
+  Image
+    *filteredImage;
 
-  unsigned
-    kernelSize;
+  RandomInfo
+    **restrict random_info;
+
+  size_t
+    global_work_size[1],
+    local_work_size[1];
 
   unsigned int
-    filterHeight,
-    filterWidth,
-    i,
-    imageHeight,
-    imageWidth,
-    matte;
+    k,
+    numRandomNumberPerPixel;
+
+#if defined(MAGICKCORE_OPENMP_SUPPORT)
+  unsigned long
+    key;
+#endif
 
   void
     *filteredPixels,
     *hostPtr;
 
-  /* intialize all CL objects to NULL */
+  outputReady = MagickFalse;
+  clEnv = NULL;
+  inputPixels = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  filteredPixels = NULL;
   context = NULL;
   imageBuffer = NULL;
   filteredImageBuffer = NULL;
-  convolutionKernel = NULL;
-  clkernel = NULL;
   queue = NULL;
+  addNoiseKernel = NULL;
 
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  outputReady = MagickFalse;
-  
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
-
+  queue = AcquireOpenCLCommandQueue(clEnv);
   image_view=AcquireVirtualCacheView(image,exception);
   inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (const void *) NULL)
+  if (inputPixels == (void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
 
-  /* Create and initialize OpenCL buffers. */
-
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
@@ -353,6 +364,7 @@ static Image *ComputeConvolveImage(const Image* image,
     goto cleanup;
   }
 
+
   filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
   assert(filteredImage != NULL);
   if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
@@ -387,142 +399,87 @@ static Image *ComputeConvolveImage(const Image* image,
     goto cleanup;
   }
 
-  kernelSize = (unsigned int) (kernel->width * kernel->height);
-  convolutionKernel = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, kernelSize * sizeof(float), NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
+  /* find out how many random numbers needed by pixel */
+  numRandomNumberPerPixel = 0;
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
+    unsigned int numRandPerChannel = 0;
+    switch (noise_type)
+    {
+    case UniformNoise:
+    case ImpulseNoise:
+    case LaplacianNoise:
+    case RandomNoise:
+    default:
+      numRandPerChannel = 1;
+      break;
+    case GaussianNoise:
+    case MultiplicativeGaussianNoise:
+    case PoissonNoise:
+      numRandPerChannel = 2;
+      break;
+    };
+
+    if ((channel & RedChannel) != 0)
+      numRandomNumberPerPixel+=numRandPerChannel;
+    if ((channel & GreenChannel) != 0)
+      numRandomNumberPerPixel+=numRandPerChannel;
+    if ((channel & BlueChannel) != 0)
+      numRandomNumberPerPixel+=numRandPerChannel;
+    if ((channel & OpacityChannel) != 0)
+      numRandomNumberPerPixel+=numRandPerChannel;
   }
 
-  queue = AcquireOpenCLCommandQueue(clEnv);
+  /* set up the random number generators */
+  attenuate=1.0;
+  option=GetImageArtifact(image,"attenuate");
+  if (option != (char *) NULL)
+    attenuate=StringToDouble(option,(char **) NULL);
+  random_info=AcquireRandomInfoThreadSet();
+#if defined(MAGICKCORE_OPENMP_SUPPORT)
+  key=GetRandomSecretKey(random_info[0]);
+  (void) key;
+#endif
+
+  addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoise");
 
-  kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, convolutionKernel, CL_TRUE, CL_MAP_WRITE, 0, kernelSize * sizeof(float)
-          , 0, NULL, NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-    goto cleanup;
-  }
-  for (i = 0; i < kernelSize; i++)
   {
-    kernelBufferPtr[i] = (float) kernel->values[i];
+    cl_uint computeUnitCount;
+    cl_uint workItemCount;
+    clEnv->library->clGetDeviceInfo(clEnv->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &computeUnitCount, NULL);
+    workItemCount = computeUnitCount * 2 * 256;                        // 256 work items per group, 2 groups per CU
+    inputPixelCount = (cl_int) (image->columns * image->rows);
+    pixelsPerWorkitem = (inputPixelCount + workItemCount - 1) / workItemCount;
+    pixelsPerWorkitem = ((pixelsPerWorkitem + 3) / 4) * 4;
+
+    local_work_size[0] = 256;
+    global_work_size[0] = workItemCount;
   }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, convolutionKernel, kernelBufferPtr, 0, NULL, NULL);
-  if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-    goto cleanup;
+    RandomInfo* randomInfo = AcquireRandomInfo();
+       const unsigned long* s = GetRandomInfoSeed(randomInfo);
+       seed0 = s[0];
+       GetPseudoRandomValue(randomInfo);
+       seed1 = s[0];
+       randomInfo = DestroyRandomInfo(randomInfo);
   }
-  clEnv->library->clFlush(queue);
 
-  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
-
-  /* Compute the local memory requirement for a 16x16 workgroup.
-     If it's larger than 16k, reduce the workgroup size to 8x8 */
-  localGroupSize[0] = 16;
-  localGroupSize[1] = 16;
-  localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
-    + kernel->width*kernel->height*sizeof(float);
-
-  if (localMemoryRequirement > deviceLocalMemorySize)
-  {
-    localGroupSize[0] = 8;
-    localGroupSize[1] = 8;
-    localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
-      + kernel->width*kernel->height*sizeof(float);
-  }
-  if (localMemoryRequirement <= deviceLocalMemorySize) 
-  {
-    /* get the OpenCL kernel */
-    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ConvolveOptimized");
-    if (clkernel == NULL)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }
-
-    /* set the kernel arguments */
-    i = 0;
-    clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-    imageWidth = (unsigned int) image->columns;
-    imageHeight = (unsigned int) image->rows;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageWidth);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageHeight);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
-    filterWidth = (unsigned int) kernel->width;
-    filterHeight = (unsigned int) kernel->height;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
-    matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++, (localGroupSize[0] + kernel->width-1)*(localGroupSize[1] + kernel->height-1)*sizeof(CLPixelPacket),NULL);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++, kernel->width*kernel->height*sizeof(float),NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-
-    /* pad the global size to a multiple of the local work size dimension */
-    global_work_size[0] = ((image->columns + localGroupSize[0]  - 1)/localGroupSize[0] ) * localGroupSize[0] ;
-    global_work_size[1] = ((image->rows + localGroupSize[1] - 1)/localGroupSize[1]) * localGroupSize[1];
-
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, localGroupSize, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }
-  }
-  else
-  {
-    /* get the OpenCL kernel */
-    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Convolve");
-    if (clkernel == NULL)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }
-
-    /* set the kernel arguments */
-    i = 0;
-    clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-    imageWidth = (unsigned int) image->columns;
-    imageHeight = (unsigned int) image->rows;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageWidth);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageHeight);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
-    filterWidth = (unsigned int) kernel->width;
-    filterHeight = (unsigned int) kernel->height;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
-    matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
-    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
+  k = 0;
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&inputPixelCount);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&pixelsPerWorkitem);  
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type);
+  attenuate=1.0f;
+  option=GetImageArtifact(image,"attenuate");
+  if (option != (char *) NULL)
+    attenuate=(float)StringToDouble(option,(char **) NULL);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed0);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed1);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel);
 
-    localGroupSize[0] = 8;
-    localGroupSize[1] = 8;
-    global_work_size[0] = (image->columns + (localGroupSize[0]-1))/localGroupSize[0] * localGroupSize[0];
-    global_work_size[1] = (image->rows    + (localGroupSize[1]-1))/localGroupSize[1] * localGroupSize[1];
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, localGroupSize, 0, NULL, NULL);
-    
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }
-  }
-  clEnv->library->clFlush(queue);
+  clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,1,NULL,global_work_size,NULL,0,NULL,NULL);
 
   if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
@@ -549,59 +506,57 @@ cleanup:
   if (filteredImage_view != NULL)
     filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (imageBuffer != NULL)
-    clEnv->library->clReleaseMemObject(imageBuffer);
-
-  if (filteredImageBuffer != NULL)
-    clEnv->library->clReleaseMemObject(filteredImageBuffer);
-
-  if (convolutionKernel != NULL)
-    clEnv->library->clReleaseMemObject(convolutionKernel);
-
-  if (clkernel != NULL)
-    RelinquishOpenCLKernel(clEnv, clkernel);
-
-  if (queue != NULL)
-    RelinquishOpenCLCommandQueue(clEnv, queue);
-
-  if (outputReady == MagickFalse)
-  {
-    if (filteredImage != NULL)
-    {
-      DestroyImage(filteredImage);
-      filteredImage = NULL;
-    }
-  }
+  if (queue!=NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (addNoiseKernel!=NULL)         RelinquishOpenCLKernel(clEnv, addNoiseKernel);
+  if (imageBuffer!=NULL)                   clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (outputReady == MagickFalse && filteredImage != NULL) 
+    filteredImage=DestroyImage(filteredImage);
 
   return(filteredImage);
 }
 
-MagickExport Image *AccelerateConvolveImageChannel(const Image *image,
-  const ChannelType channel,const KernelInfo *kernel,ExceptionInfo *exception)
+MagickExport Image *AccelerateAddNoiseImage(const Image *image,
+  const ChannelType channel,const NoiseType noise_type,
+  ExceptionInfo *exception) 
 {
   Image
     *filteredImage;
 
   assert(image != NULL);
-  assert(kernel != (KernelInfo *) NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
       (checkAccelerateCondition(image, channel) == MagickFalse))
     return NULL;
 
-  filteredImage=ComputeConvolveImage(image, channel, kernel, exception);
+  filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception);
+  
   return(filteredImage);
 }
 
-static MagickBooleanType ComputeFunctionImage(Image *image,
-  const ChannelType channel,const MagickFunction function,
-  const size_t number_parameters,const double *parameters,
-  ExceptionInfo *exception)
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e B l u r I m a g e                                   %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+static Image *ComputeBlurImage(const Image* image,const ChannelType channel,
+  const double radius,const double sigma,ExceptionInfo *exception)
 {
   CacheView
+    *filteredImage_view,
     *image_view;
 
+  char
+    geometry[MagickPathExtent];
+
   cl_command_queue
     queue;
 
@@ -612,20 +567,29 @@ static MagickBooleanType ComputeFunctionImage(Image *image,
     clStatus;
 
   cl_kernel
-    clkernel;
+    blurColumnKernel,
+    blurRowKernel;
 
   cl_mem
+    filteredImageBuffer,
     imageBuffer,
-    parametersBuffer;
-
+    imageKernelBuffer,
+    tempImageBuffer;
+  
   cl_mem_flags
     mem_flags;
 
+  const void
+    *inputPixels;
+
   float
-    *parametersBufferPtr;
+    *kernelBufferPtr;
+
+  Image
+    *filteredImage;
 
   MagickBooleanType
-    status;
+    outputReady;
 
   MagickCLEnv
     clEnv;
@@ -633,283 +597,38 @@ static MagickBooleanType ComputeFunctionImage(Image *image,
   MagickSizeType
     length;
 
-  size_t
-    globalWorkSize[2];
+  KernelInfo
+    *kernel;
 
   unsigned int
-    i;
+    i,
+    imageColumns,
+    imageRows,
+    kernelWidth;
 
   void
-    *pixels;
-
-  status = MagickFalse;
+    *filteredPixels,
+    *hostPtr;
 
   context = NULL;
-  clkernel = NULL;
-  queue = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
   imageBuffer = NULL;
-  parametersBuffer = NULL;
+  tempImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  blurColumnKernel = NULL;
+  queue = NULL;
+  kernel = NULL;
+
+  outputReady = MagickFalse;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
 
-  image_view=AcquireAuthenticCacheView(image,exception);
-  pixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (pixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning,
-      "GetPixelCachePixels failed.",
-      "'%s'", image->filename);
-    goto cleanup;
-  }
-
-
-  if (ALIGNED(pixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)pixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  parametersBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, number_parameters * sizeof(float), NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  queue = AcquireOpenCLCommandQueue(clEnv);
-
-  parametersBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, parametersBuffer, CL_TRUE, CL_MAP_WRITE, 0, number_parameters * sizeof(float)
-                , 0, NULL, NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-    goto cleanup;
-  }
-  for (i = 0; i < number_parameters; i++)
-  {
-    parametersBufferPtr[i] = (float)parameters[i];
-  }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, parametersBuffer, parametersBufferPtr, 0, NULL, NULL);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  clEnv->library->clFlush(queue);
-
-  clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "FunctionImage");
-  if (clkernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* set the kernel arguments */
-  i = 0;
-  clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
-  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(MagickFunction),(void *)&function);
-  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&number_parameters);
-  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&parametersBuffer);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  globalWorkSize[0] = image->columns;
-  globalWorkSize[1] = image->rows;
-  /* launch the kernel */
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  clEnv->library->clFlush(queue);
-
-
-  if (ALIGNED(pixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), pixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  status=SyncCacheViewAuthenticPixels(image_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
-
-  image_view=DestroyCacheView(image_view);
-  
-  if (clkernel != NULL) RelinquishOpenCLKernel(clEnv, clkernel);
-  if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (imageBuffer != NULL) clEnv->library->clReleaseMemObject(imageBuffer);
-  if (parametersBuffer != NULL) clEnv->library->clReleaseMemObject(parametersBuffer);
-
-  return(status);
-}
-
-MagickExport MagickBooleanType AccelerateFunctionImage(Image *image,
-  const ChannelType channel,const MagickFunction function,
-  const size_t number_parameters,const double *parameters,
-  ExceptionInfo *exception)
-{
-  MagickBooleanType
-    status;
-
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
-
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse))
-    return(MagickFalse);
-
-  status=ComputeFunctionImage(image, channel, function, number_parameters, parameters, exception);
-  return(status);
-}
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     B l u r I m a g e  w i t h  O p e n C L                                 %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  BlurImage() blurs an image.  We convolve the image with a Gaussian operator
-%  of the given radius and standard deviation (sigma).  For reasonable results,
-%  the radius should be larger than sigma.  Use a radius of 0 and BlurImage()
-%  selects a suitable radius for you.
-%
-%  The format of the BlurImage method is:
-%
-%      Image *BlurImage(const Image *image,const double radius,
-%        const double sigma,ExceptionInfo *exception)
-%      Image *BlurImageChannel(const Image *image,const ChannelType channel,
-%        const double radius,const double sigma,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel type.
-%
-%    o radius: the radius of the Gaussian, in pixels, not counting the center
-%      pixel.
-%
-%    o sigma: the standard deviation of the Gaussian, in pixels.
-%
-%    o exception: return any errors or warnings in this structure.
-%
-*/
-
-static Image *ComputeBlurImage(const Image* image,const ChannelType channel,
-  const double radius,const double sigma,ExceptionInfo *exception)
-{
-  CacheView
-    *filteredImage_view,
-    *image_view;
-
-  char
-    geometry[MagickPathExtent];
-
-  cl_command_queue
-    queue;
-
-  cl_context
-    context;
-
-  cl_int
-    clStatus;
-
-  cl_kernel
-    blurColumnKernel,
-    blurRowKernel;
-
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    imageKernelBuffer,
-    tempImageBuffer;
-  
-  cl_mem_flags
-    mem_flags;
-
-  const void
-    *inputPixels;
-
-  float
-    *kernelBufferPtr;
-
-  Image
-    *filteredImage;
-
-  MagickBooleanType
-    outputReady;
-
-  MagickCLEnv
-    clEnv;
-
-  MagickSizeType
-    length;
-
-  KernelInfo
-    *kernel;
-
-  unsigned int
-    i,
-    imageColumns,
-    imageRows,
-    kernelWidth;
-
-  void
-    *filteredPixels,
-    *hostPtr;
-
-  context = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  imageBuffer = NULL;
-  tempImageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  blurRowKernel = NULL;
-  blurColumnKernel = NULL;
-  queue = NULL;
-  kernel = NULL;
-
-  outputReady = MagickFalse;
-
-  clEnv = GetDefaultOpenCLEnv();
-  context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-
-  /* Create and initialize OpenCL buffers. */
+  /* Create and initialize OpenCL buffers. */
   {
     image_view=AcquireVirtualCacheView(image,exception);
     inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
@@ -1377,14 +1096,14 @@ static Image* ComputeBlurImageSection(const Image* image,
 
     /* get the OpenCL kernels */
     {
-      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRowSection");
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurSectionRow");
       if (blurRowKernel == NULL)
       {
         (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
         goto cleanup;
       };
 
-      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurColumnSection");
+      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurSectionColumn");
       if (blurColumnKernel == NULL)
       {
         (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
@@ -1550,10 +1269,6 @@ cleanup:
   return filteredImage;
 }
 
-static Image *ComputeUnsharpMaskImageSingle(const Image *image,
-  const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception);
-
 static Image* ComputeBlurImageSingle(const Image* image,
   const ChannelType channel,const double radius,const double sigma,
   ExceptionInfo *exception)
@@ -1590,80 +1305,105 @@ MagickExport Image* AccelerateBlurImage(const Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     LocalContrastImage  w i t h  O p e n C L                                %
+%     A c c e l e r a t e C o m p o s i t e I m a g e                         %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  ComputeLocalContrastImage() attempts to increase the appearance of
-%  large-scale light-dark transitions. Local contrast enhancement works
-%  similarly to sharpening with an unsharp mask, however the mask is instead
-%  created using an image with a greater blur distance.
-%
-%  The format of the ComputeLocalContrastImage method is:
-%
-%    Image *ComputeLocalContrastImage(const Image *image,
-%      const ChannelType channel,const double radius,const double strength,
-%      ExceptionInfo *exception)
-%    Image *AccelerateLocalContrastImage(const Image *image,
-%      const ChannelType channel,const double radius,const double strength,
-%      ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o radius: the radius of the Gaussian, in pixels, not counting
-%      the center pixel.
-%
-%    o strength: the strength of the blur mask in percentage.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static Image *ComputeLocalContrastImage(const Image *image,
-  const double radius,const double strength,ExceptionInfo *exception)
+static MagickBooleanType LaunchCompositeKernel(MagickCLEnv clEnv,
+  cl_command_queue queue,cl_mem imageBuffer,const unsigned int inputWidth,
+  const unsigned int inputHeight,const unsigned int matte,
+  const ChannelType channel,const CompositeOperator compose,
+  const cl_mem compositeImageBuffer,const unsigned int compositeWidth,
+  const unsigned int compositeHeight,const float destination_dissolve,
+  const float source_dissolve,ExceptionInfo *magick_unused(exception))
 { /* Sets the arguments of the Composite kernel and enqueues it on queue; returns MagickTrue only when every OpenCL call reported CL_SUCCESS. */
-  CacheView
-    *filteredImage_view,
-    *image_view;
-
-  cl_command_queue
-    queue;
-
-  cl_context
-    context;
-
   cl_int
-    clStatus,
-    iRadius;
+    clStatus;
 
   cl_kernel
-    blurRowKernel,
-    blurColumnKernel;
+    compositeKernel;
 
-  cl_event
-    event;
+  int
+    k;
 
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    imageKernelBuffer,
-    tempImageBuffer;
+  size_t
+    global_work_size[2],
+    local_work_size[2];
 
-  cl_mem_flags
-    mem_flags;
+  unsigned int
+    composeOp;
 
-  const void
-    *inputPixels;
+  magick_unreferenced(exception); /* exception is not used anywhere else in this helper */
 
-  Image
-    *filteredImage;
+  compositeKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE,
+    "Composite"); /* NOTE(review): result is not checked for NULL, unlike the blur kernels acquired elsewhere in this file */
 
-  MagickBooleanType
-    outputReady;
+  k = 0; /* running kernel argument index, incremented by each call below */
+  clStatus=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(cl_mem),(void*)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&inputWidth);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&inputHeight);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(cl_mem),(void*)&compositeImageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&compositeWidth);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&compositeHeight);
+  composeOp = (unsigned int)compose; /* the operator is passed as unsigned int, not as the enum type */
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&composeOp);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(ChannelType),(void*)&channel);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&matte);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(float),(void*)&destination_dissolve);
+  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(float),(void*)&source_dissolve);
+
+  if (clStatus!=CL_SUCCESS) /* statuses were OR-ed together, so any failing call lands here */
+    return MagickFalse; /* NOTE(review): compositeKernel is not relinquished on this path */
+
+  local_work_size[0] = 64; /* work-group size is 64 x 1 */
+  local_work_size[1] = 1;
+
+  global_work_size[0] = padGlobalWorkgroupSizeToLocalWorkgroupSize(inputWidth,
+    (unsigned int) local_work_size[0]); /* presumably rounds the width up to a multiple of 64 - TODO confirm against the helper */
+  global_work_size[1] = inputHeight;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, compositeKernel, 2, NULL, 
+    global_work_size, local_work_size, 0, NULL, NULL); /* no event is requested and no flush or finish is issued here */
+
+
+  RelinquishOpenCLKernel(clEnv, compositeKernel);
+
+  return((clStatus==CL_SUCCESS) ? MagickTrue : MagickFalse); /* reflects the enqueue status only */
+}
+
+static MagickBooleanType ComputeCompositeImage(Image *image,
+  const ChannelType channel,const CompositeOperator compose,
+  const Image *compositeImage,const ssize_t magick_unused(x_offset),
+  const ssize_t magick_unused(y_offset),const float destination_dissolve,
+  const float source_dissolve,ExceptionInfo *exception)
+{
+  CacheView
+    *image_view;
+
+  cl_command_queue
+    queue;
+
+  cl_context
+    context;
+
+  cl_int
+    clStatus;
+
+  cl_mem_flags
+    mem_flags;
+
+  cl_mem
+    compositeImageBuffer,
+    imageBuffer;
+
+  const void
+    *composePixels;
+
+  MagickBooleanType
+    outputReady,
+    status;
 
   MagickCLEnv
     clEnv;
@@ -1672,281 +1412,160 @@ static Image *ComputeLocalContrastImage(const Image *image,
     length;
 
   void
-    *filteredPixels,
-    *hostPtr;
+    *inputPixels;
 
-  unsigned int
-    i,
-    imageColumns,
-    imageRows,
-    passes;
+  magick_unreferenced(x_offset);
+  magick_unreferenced(y_offset);
 
-  clEnv = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  context = NULL;
-  imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  tempImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  blurRowKernel = NULL;
-  blurColumnKernel = NULL;
-  queue = NULL;
+  status = MagickFalse;
   outputReady = MagickFalse;
+  composePixels = NULL;
+  imageBuffer = NULL;
+  compositeImageBuffer = NULL;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
 
   /* Create and initialize OpenCL buffers. */
+  image_view=AcquireAuthenticCacheView(image,exception);
+  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (void *) NULL)
   {
-    image_view=AcquireVirtualCacheView(image,exception);
-    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-    if (inputPixels == (const void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-      goto cleanup;
-    }
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,
+      "UnableToReadPixelCache.","`%s'",image->filename);
+    goto cleanup;
+  }
 
-    /* If the host pointer is aligned to the size of CLPixelPacket, 
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
      then use the host buffer directly from the GPU; otherwise, 
      create a buffer on the GPU and copy the data over */
-    if (ALIGNED(inputPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
-    }
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
-
-  /* create output */
+  else 
   {
-    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-    assert(filteredImage != NULL);
-    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-    if (filteredPixels == (void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-      goto cleanup;
-    }
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
+    length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-    if (ALIGNED(filteredPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-      hostPtr = filteredPixels;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY;
-      hostPtr = NULL;
-    }
 
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  /* Create and initialize OpenCL buffers. */
+  composePixels = AcquirePixelCachePixels(compositeImage, &length, exception); 
+  if (composePixels == (void *) NULL)
+  {
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,
+      "UnableToReadPixelCache.","`%s'",compositeImage->filename);
+    goto cleanup;
   }
 
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(composePixels,CLPixelPacket)) 
   {
-    /* create temp buffer */
-    {
-      length = image->columns * image->rows;
-      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * sizeof(float), NULL, &clStatus);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-        goto cleanup;
-      }
-    }
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = compositeImage->columns * compositeImage->rows;
+  compositeImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
+    length * sizeof(CLPixelPacket), (void*)composePixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+  
+  status = LaunchCompositeKernel(clEnv,queue,imageBuffer,
+           (unsigned int) image->columns,
+           (unsigned int) image->rows,
+           (unsigned int) (image->alpha_trait == UndefinedPixelTrait) ? 1 : 0,
+           channel, compose, compositeImageBuffer,
+           (unsigned int) compositeImage->columns,
+           (unsigned int) compositeImage->rows,
+           destination_dissolve,source_dissolve,
+           exception);
 
-    /* get the opencl kernel */
-    {
-      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "LocalContrastBlurRow");
-      if (blurRowKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
+  if (status==MagickFalse)
+    goto cleanup;
 
-      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "LocalContrastBlurApplyColumn");
-      if (blurColumnKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
-    }
+  length = image->columns * image->rows;
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, 
+      CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, 
+      NULL, &clStatus);
+  }
+  else
+  {
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, 
+      length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+  }
+  if (clStatus==CL_SUCCESS)
+    outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
-    {
-      imageColumns = (unsigned int) image->columns;
-      imageRows = (unsigned int) image->rows;
-      iRadius = (cl_int) fabs(radius);
-
-      passes = ((1.0f * imageColumns) * imageColumns * iRadius) / 4000000000.0f;
-      passes = (passes < 1) ? 1: passes;
-
-      /* set the kernel arguments */
-      i = 0;
-      clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_int),(void *)&iRadius);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-      
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-        goto cleanup;
-      }
-    }
-
-    /* launch the kernel */
-    {
-      int x;
-      for (x = 0; x < passes; ++x) {
-        size_t gsize[2];
-        size_t wsize[2];
-        size_t goffset[2];
-
-        gsize[0] = 256;
-        gsize[1] = image->rows / passes;
-        wsize[0] = 256;
-        wsize[1] = 1;
-        goffset[0] = 0;
-        goffset[1] = x * gsize[1];
-
-        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, goffset, gsize, wsize, 0, NULL, &event);
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-          goto cleanup;
-        }
-      }
-    }
-
-    {
-      cl_float FStrength = strength;
-      i = 0;
-      clStatus=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&iRadius);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_float),(void *)&FStrength);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-        goto cleanup;
-      }
-    }
-
-    /* launch the kernel */
-    {
-      int x;
-      for (x = 0; x < passes; ++x) {
-        size_t gsize[2];
-        size_t wsize[2];
-        size_t goffset[2];
-
-        gsize[0] = ((image->columns + 3) / 4) * 4;
-        gsize[1] = ((((image->rows + 63) / 64) + (passes + 1)) / passes) * 64;
-        wsize[0] = 4;
-        wsize[1] = 64;
-        goffset[0] = 0;
-        goffset[1] = x * gsize[1];
-
-        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurColumnKernel, 2, goffset, gsize, wsize, 0, NULL, &event);
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-          goto cleanup;
-        }
-      }
-    }
-  }
-
-  /* get result */
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+cleanup:
 
   image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
+  if (imageBuffer!=NULL)      clEnv->library->clReleaseMemObject(imageBuffer);
+  if (compositeImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(compositeImageBuffer);
+  if (queue != NULL)               RelinquishOpenCLCommandQueue(clEnv, queue);
 
-  if (imageBuffer!=NULL)                      clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
-  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
-  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
-  if (blurColumnKernel!=NULL)                 RelinquishOpenCLKernel(clEnv, blurColumnKernel);
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse)
-  {
-    if (filteredImage != NULL)
-    {
-      DestroyImage(filteredImage);
-      filteredImage = NULL;
-    }
-  }
-  return(filteredImage);
+  return(outputReady);
 }
 
-MagickExport Image *AccelerateLocalContrastImage(const Image *image,
-  const double radius,const double strength,ExceptionInfo *exception)
+MagickExport MagickBooleanType AccelerateCompositeImage(Image *image,
+  const ChannelType channel,const CompositeOperator compose,
+  const Image *composite,const ssize_t x_offset,const ssize_t y_offset,
+  const float destination_dissolve,const float source_dissolve,
+  ExceptionInfo *exception)
 { /* Exported wrapper: filters out unsupported requests, then calls ComputeCompositeImage(). */
-  Image
-    *filteredImage;
+  MagickBooleanType
+    status;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-    (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return NULL;
+      (checkAccelerateCondition(image, channel) == MagickFalse))
+    return(MagickFalse); /* either check reported MagickFalse: nothing is computed */
 
-  filteredImage=ComputeLocalContrastImage(image,radius,strength,exception);
+  /* Only a zero offset and a composite image with the same
+     dimensions as the destination image are supported for now. */
+  if (x_offset!=0
+    || y_offset!=0
+    || image->columns!=composite->columns /* NOTE(review): composite is dereferenced without a NULL assert */
+    || image->rows!=composite->rows)
+    return MagickFalse;
 
-  return(filteredImage);
+  switch(compose) {
+  case ColorDodgeCompositeOp: 
+  case BlendCompositeOp:
+    break;
+  default:
+    // unsupported compose operator: only ColorDodge and Blend pass this filter
+    return MagickFalse;
+  };
+
+  status = ComputeCompositeImage(image,channel,compose,composite,
+    x_offset,y_offset,destination_dissolve,source_dissolve,exception);
+
+  return(status);
+}
 
 /*
@@ -1954,41 +1573,18 @@ MagickExport Image *AccelerateLocalContrastImage(const Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     R o t a t i o n a l B l u r I m a g e  w i t h  O p e n C L             %
+%     A c c e l e r a t e C o n t r a s t I m a g e                           %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  RotationalBlurImage() applies a rotational blur to the image.
-%
-%  Andrew Protano contributed this effect.
-%
-%  The format of the RotationalBlurImage method is:
-%
-%    Image *RotationalBlurImage(const Image *image,const double angle,
-%      ExceptionInfo *exception)
-%    Image *RotationalBlurImageChannel(const Image *image,const ChannelType channel,
-%      const double angle,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel type.
-%
-%    o angle: the angle of the rotational blur.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static Image* ComputeRotationalBlurImage(const Image *image,
-  const ChannelType channel,const double angle,ExceptionInfo *exception)
+static MagickBooleanType ComputeContrastImage(Image *image,
+  const MagickBooleanType sharpen,ExceptionInfo *exception)
 {
   CacheView
-    *image_view,
-    *filteredImage_view;
+    *image_view;
 
   cl_command_queue
     queue;
@@ -1996,49 +1592,24 @@ static Image* ComputeRotationalBlurImage(const Image *image,
   cl_context
     context;
 
-  cl_float2
-    blurCenter;
-
-  cl_float4
-    biasPixel;
-
   cl_int
     clStatus;
 
+  cl_kernel
+    filterKernel;
+
   cl_mem
-    cosThetaBuffer,
-    filteredImageBuffer,
-    imageBuffer,
-    sinThetaBuffer;
+    imageBuffer;
 
   cl_mem_flags
     mem_flags;
 
-  cl_kernel
-    rotationalBlurKernel;
-
-  const void
-    *inputPixels;
-
-  float
-    blurRadius,
-    *cosThetaPtr,
-    offset,
-    *sinThetaPtr,
-    theta;
-
-  Image
-    *filteredImage;
-
   MagickBooleanType
     outputReady;
 
   MagickCLEnv
     clEnv;
 
-  PixelInfo
-    bias;
-
   MagickSizeType
     length;
 
@@ -2046,35 +1617,27 @@ static Image* ComputeRotationalBlurImage(const Image *image,
     global_work_size[2];
 
   unsigned int
-    cossin_theta_size,
     i,
-    matte;
+    uSharpen;
 
   void
-    *filteredPixels,
-    *hostPtr;
+    *inputPixels;
 
   outputReady = MagickFalse;
+  clEnv = NULL;
+  inputPixels = NULL;
   context = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
   imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  sinThetaBuffer = NULL;
-  cosThetaBuffer = NULL;
+  filterKernel = NULL;
   queue = NULL;
-  rotationalBlurKernel = NULL;
-
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
 
-
   /* Create and initialize OpenCL buffers. */
-
-  image_view=AcquireVirtualCacheView(image,exception);
-  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (const void *) NULL)
+  image_view=AcquireAuthenticCacheView(image,exception);
+  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
@@ -2085,11 +1648,11 @@ static Image* ComputeRotationalBlurImage(const Image *image,
      create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
   else 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
   }
   /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
@@ -2099,134 +1662,30 @@ static Image* ComputeRotationalBlurImage(const Image *image,
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
-
-
-  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-  assert(filteredImage != NULL);
-  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+  
+  filterKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Contrast");
+  if (filterKernel == NULL)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
   }
-  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-  if (filteredPixels == (void *) NULL)
+
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(filterKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+
+  uSharpen = (sharpen == MagickFalse)?0:1;
+  clStatus|=clEnv->library->clSetKernelArg(filterKernel,i++,sizeof(cl_uint),&uSharpen);
+  if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = filteredPixels;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_WRITE_ONLY;
-    hostPtr = NULL;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  blurCenter.s[0] = (float) (image->columns-1)/2.0;
-  blurCenter.s[1] = (float) (image->rows-1)/2.0;
-  blurRadius=hypot(blurCenter.s[0],blurCenter.s[1]);
-  cossin_theta_size=(unsigned int) fabs(4.0*DegreesToRadians(angle)*sqrt((double)blurRadius)+2UL);
-
-  /* create a buffer for sin_theta and cos_theta */
-  sinThetaBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-  cosThetaBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  sinThetaPtr = (float*) clEnv->library->clEnqueueMapBuffer(queue, sinThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
-    goto cleanup;
-  }
-
-  cosThetaPtr = (float*) clEnv->library->clEnqueueMapBuffer(queue, cosThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
-    goto cleanup;
-  }
-
-  theta=DegreesToRadians(angle)/(MagickRealType) (cossin_theta_size-1);
-  offset=theta*(MagickRealType) (cossin_theta_size-1)/2.0;
-  for (i=0; i < (ssize_t) cossin_theta_size; i++)
-  {
-    cosThetaPtr[i]=(float)cos((double) (theta*i-offset));
-    sinThetaPtr[i]=(float)sin((double) (theta*i-offset));
-  }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, sinThetaBuffer, sinThetaPtr, 0, NULL, NULL);
-  clStatus |= clEnv->library->clEnqueueUnmapMemObject(queue, cosThetaBuffer, cosThetaPtr, 0, NULL, NULL);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* get the OpenCL kernel */
-  rotationalBlurKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "RotationalBlur");
-  if (rotationalBlurKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  
-  /* set the kernel arguments */
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-
-  GetPixelInfo(image,&bias);
-  biasPixel.s[0] = bias.red;
-  biasPixel.s[1] = bias.green;
-  biasPixel.s[2] = bias.blue;
-  biasPixel.s[3] = bias.alpha;
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_float4), &biasPixel);
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(ChannelType), &channel);
-
-  matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(unsigned int), &matte);
-
-  clStatus=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_float2), &blurCenter);
-
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&cosThetaBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&sinThetaBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(unsigned int), &cossin_theta_size);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-
   global_work_size[0] = image->columns;
   global_work_size[1] = image->rows;
   /* launch the kernel */
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, rotationalBlurKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, filterKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
@@ -2234,62 +1693,49 @@ static Image* ComputeRotationalBlurImage(const Image *image,
   }
   clEnv->library->clFlush(queue);
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (filteredImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (imageBuffer!=NULL)     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (sinThetaBuffer!=NULL)       clEnv->library->clReleaseMemObject(sinThetaBuffer);
-  if (cosThetaBuffer!=NULL)       clEnv->library->clReleaseMemObject(cosThetaBuffer);
-  if (rotationalBlurKernel!=NULL) RelinquishOpenCLKernel(clEnv, rotationalBlurKernel);
-  if (queue != NULL)              RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse)
-  {
-    if (filteredImage != NULL)
-    {
-      DestroyImage(filteredImage);
-      filteredImage = NULL;
-    }
-  }
-  return filteredImage;
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filterKernel!=NULL)                     RelinquishOpenCLKernel(clEnv, filterKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  return(outputReady);
 }
 
-MagickExport Image* AccelerateRotationalBlurImage(const Image *image,
-  const ChannelType channel,const double angle,ExceptionInfo *exception)
+MagickExport MagickBooleanType AccelerateContrastImage(Image *image,
+  const MagickBooleanType sharpen,ExceptionInfo *exception)  /* returns MagickFalse when the checks below fail, otherwise the ComputeContrastImage() status */
 {
-  Image
-    *filteredImage;
+  MagickBooleanType
+    status;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse))
-    return NULL;
+      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+    return(MagickFalse);  /* OpenCL environment or accelerate-condition check failed */
 
-  filteredImage=ComputeRotationalBlurImage(image, channel, angle, exception);
-  return filteredImage;
+  status = ComputeContrastImage(image,sharpen,exception);  /* works on the image's own (authentic) pixels, so there is no separate output image */
+  return(status);
 }
 
 /*
@@ -2297,57 +1743,95 @@ MagickExport Image* AccelerateRotationalBlurImage(const Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     U n s h a r p M a s k I m a g e  w i t h  O p e n C L                   %
+%     A c c e l e r a t e C o n t r a s t S t r e t c h I m a g e             %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  UnsharpMaskImage() sharpens one or more image channels.  We convolve the
-%  image with a Gaussian operator of the given radius and standard deviation
-%  (sigma).  For reasonable results, radius should be larger than sigma.  Use a
-%  radius of 0 and UnsharpMaskImage() selects a suitable radius for you.
-%
-%  The format of the UnsharpMaskImage method is:
-%
-%    Image *UnsharpMaskImage(const Image *image,const double radius,
-%      const double sigma,const double amount,const double threshold,
-%      ExceptionInfo *exception)
-%    Image *UnsharpMaskImageChannel(const Image *image,
-%      const ChannelType channel,const double radius,const double sigma,
-%      const double gain,const double threshold,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel type.
-%
-%    o radius: the radius of the Gaussian, in pixels, not counting the center
-%      pixel.
-%
-%    o sigma: the standard deviation of the Gaussian, in pixels.
-%
-%    o gain: the percentage of the difference between the original and the
-%      blur image that is added back into the original.
-%
-%    o threshold: the threshold in pixels needed to apply the diffence gain.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static Image *ComputeUnsharpMaskImage(const Image *image,
-  const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,ExceptionInfo *exception)
+static MagickBooleanType LaunchHistogramKernel(MagickCLEnv clEnv,
+  cl_command_queue queue,cl_mem imageBuffer,cl_mem histogramBuffer,
+  Image *image,const ChannelType channel,ExceptionInfo *exception)
+{
+  MagickBooleanType
+    outputReady;
+
+  cl_int
+    clStatus,
+    colorspace,
+    method;
+
+  cl_kernel
+    histogramKernel; 
+
+  register ssize_t
+    i;
+
+  size_t
+    global_work_size[2];
+  /* Acquire the "Histogram" kernel, bind imageBuffer/histogramBuffer plus the channel, intensity method and colorspace, and enqueue it on queue. */
+  histogramKernel = NULL;  /* lets cleanup tell whether a kernel was acquired */
+
+  outputReady = MagickFalse;
+  method = image->intensity;  /* passed to the kernel as a cl_int */
+  colorspace = image->colorspace;  /* passed to the kernel as a cl_int */
+
+  /* acquire the "Histogram" kernel; it is relinquished at cleanup */
+  histogramKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Histogram");
+  if (histogramKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* set the kernel arguments in order: image, channel, method, colorspace, histogram; results are OR-ed into clStatus and checked once */
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_int),&method);
+  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_int),&colorspace);
+  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&histogramBuffer);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* launch the kernel on a 2-D range of image->columns x image->rows work-items */
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
+
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, histogramKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clEnv->library->clFlush(queue);  /* per OpenCL clFlush: submits the queued commands but does not wait for them to complete */
+
+  outputReady = MagickTrue;  /* reached only when every step above succeeded */
+
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  if (histogramKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, histogramKernel);
+
+  return(outputReady);  /* MagickTrue only if the kernel was enqueued and the queue flushed */
+}
+
+static MagickBooleanType ComputeContrastStretchImage(Image *image,
+  const ChannelType channel,const double black_point,const double white_point, 
+  ExceptionInfo *exception) 
 {
+#define ContrastStretchImageTag  "ContrastStretch/Image"
+#define MaxRange(color)  ((MagickRealType) ScaleQuantumToMap((Quantum) (color)))
+
   CacheView
-    *filteredImage_view,
     *image_view;
 
-  char
-    geometry[MagickPathExtent];
-
   cl_command_queue
     queue;
 
@@ -2357,38 +1841,31 @@ static Image *ComputeUnsharpMaskImage(const Image *image,
   cl_int
     clStatus;
 
-  cl_kernel
-    blurRowKernel,
-    unsharpMaskBlurColumnKernel;
+  cl_mem_flags
+    mem_flags;
 
   cl_mem
-    filteredImageBuffer,
+    histogramBuffer,
     imageBuffer,
-    imageKernelBuffer,
-    tempImageBuffer;
-
-  cl_mem_flags
-    mem_flags;
+    stretchMapBuffer;
 
-  const void
-    *inputPixels;
+  cl_kernel
+    histogramKernel,
+    stretchKernel;
 
-  float
-    fGain,
-    fThreshold,
-    *kernelBufferPtr;
-
-  Image
-    *filteredImage;
+  cl_uint4
+    *histogram;
 
-  int
-    chunkSize;
+  double
+    intensity;
 
-  KernelInfo
-    *kernel;
+  FloatPixelPacket
+    black,
+    white;
 
   MagickBooleanType
-    outputReady;
+    outputReady,
+    status;
 
   MagickCLEnv
     clEnv;
@@ -2396,655 +1873,486 @@ static Image *ComputeUnsharpMaskImage(const Image *image,
   MagickSizeType
     length;
 
-  void
-    *filteredPixels,
-    *hostPtr;
+  PixelPacket
+    *stretch_map;
 
-  unsigned int
-    i,
-    imageColumns,
-    imageRows,
-    kernelWidth;
+  register ssize_t
+    i;
 
-  clEnv = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  kernel = NULL;
-  context = NULL;
+  size_t
+    global_work_size[2];
+
+  void
+    *hostPtr,
+    *inputPixels;
+
+  histogram=NULL;
+  stretch_map=NULL;
+  inputPixels = NULL;
   imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  tempImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  blurRowKernel = NULL;
-  unsharpMaskBlurColumnKernel = NULL;
+  histogramBuffer = NULL;
+  stretchMapBuffer = NULL;
+  histogramKernel = NULL; 
+  stretchKernel = NULL; 
+  context = NULL;
   queue = NULL;
   outputReady = MagickFalse;
 
+
+  assert(image != (Image *) NULL);
+  assert(image->signature == MagickCoreSignature);
+  if (image->debug != MagickFalse)
+    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",image->filename);
+
+  //exception=(&image->exception);
+
+  /*
+   * initialize opencl env
+   */
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
 
+  /*
+    Allocate and initialize histogram arrays.
+  */
+  histogram=(cl_uint4 *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*histogram));
+
+  if (histogram == (cl_uint4 *) NULL)
+    ThrowBinaryException(ResourceLimitError,"MemoryAllocationFailed", image->filename);
+  /* reset histogram */
+  (void) ResetMagickMemory(histogram,0,(MaxMap+1)*sizeof(*histogram));
+
+  /*
+  if (IsGrayImage(image,exception) != MagickFalse)
+    (void) SetImageColorspace(image,GRAYColorspace);
+  */
+
+  status=MagickTrue;
+
+
+  /*
+    Form histogram.
+  */
   /* Create and initialize OpenCL buffers. */
-  {
-    image_view=AcquireVirtualCacheView(image,exception);
-    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-    if (inputPixels == (const void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-      goto cleanup;
-    }
+  /* inputPixels = AcquirePixelCachePixels(image, &length, exception); */
+  /* assume this  will get a writable image */
+  image_view=AcquireAuthenticCacheView(image,exception);
+  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
 
-    /* If the host pointer is aligned to the size of CLPixelPacket, 
+  if (inputPixels == (void *) NULL)
+  {
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    goto cleanup;
+  }
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
      then use the host buffer directly from the GPU; otherwise, 
      create a buffer on the GPU and copy the data over */
-    if (ALIGNED(inputPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
-    }
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
   }
 
-  /* create output */
+  /* If the host pointer is aligned to the size of cl_uint4, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(histogram,cl_uint4)) 
   {
-    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-    assert(filteredImage != NULL);
-    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-    if (filteredPixels == (void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-      goto cleanup;
-    }
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+    hostPtr = histogram;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = histogram;
+  }
+  /* create a CL buffer for histogram  */
+  length = (MaxMap+1); 
+  histogramBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(cl_uint4), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-    if (ALIGNED(filteredPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-      hostPtr = filteredPixels;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY;
-      hostPtr = NULL;
-    }
+  status = LaunchHistogramKernel(clEnv, queue, imageBuffer, histogramBuffer, image, channel, exception);
+  if (status == MagickFalse)
+    goto cleanup;
 
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  /* read from the kernel output */
+  if (ALIGNED(histogram,cl_uint4)) 
+  {
+    length = (MaxMap+1); 
+    clEnv->library->clEnqueueMapBuffer(queue, histogramBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(cl_uint4), 0, NULL, NULL, &clStatus);
   }
-
-  /* create the blur kernel */
+  else 
   {
-    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
-    kernel=AcquireKernelInfo(geometry,exception);
-    if (kernel == (KernelInfo *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
-      goto cleanup;
-    }
+    length = (MaxMap+1); 
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, histogramBuffer, CL_TRUE, 0, length * sizeof(cl_uint4), histogram, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+  /* unmap, don't block gpu to use this buffer again.  */
+  if (ALIGNED(histogram,cl_uint4))
+  {
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, histogramBuffer, histogram, 0, NULL, NULL);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
       goto cleanup;
     }
+  }
 
+  /* recreate input buffer later, in case image updated */
+#ifdef RECREATEBUFFER 
+  if (imageBuffer!=NULL)                     
+    clEnv->library->clReleaseMemObject(imageBuffer);
+#endif
 
-    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
+  /* CPU stuff */
+  /*
+     Find the histogram boundaries by locating the black/white levels.
+  */
+  black.red=0.0;
+  white.red=MaxRange(QuantumRange);
+  if ((channel & RedChannel) != 0)
+  {
+    intensity=0.0;
+    for (i=0; i <= (ssize_t) MaxMap; i++)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-      goto cleanup;
+      intensity+=histogram[i].s[2];
+      if (intensity > black_point)
+        break;
     }
-    for (i = 0; i < kernel->width; i++)
+    black.red=(MagickRealType) i;
+    intensity=0.0;
+    for (i=(ssize_t) MaxMap; i != 0; i--)
     {
-      kernelBufferPtr[i] = (float) kernel->values[i];
-    }
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-      goto cleanup;
+      intensity+=histogram[i].s[2];
+      if (intensity > ((double) image->columns*image->rows-white_point))
+        break;
     }
+    white.red=(MagickRealType) i;
   }
-
+  black.green=0.0;
+  white.green=MaxRange(QuantumRange);
+  if ((channel & GreenChannel) != 0)
   {
-    /* create temp buffer */
+    intensity=0.0;
+    for (i=0; i <= (ssize_t) MaxMap; i++)
     {
-      length = image->columns * image->rows;
-      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-        goto cleanup;
-      }
+      intensity+=histogram[i].s[2];
+      if (intensity > black_point)
+        break;
     }
-
-    /* get the opencl kernel */
+    black.green=(MagickRealType) i;
+    intensity=0.0;
+    for (i=(ssize_t) MaxMap; i != 0; i--)
     {
-      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRow");
-      if (blurRowKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
-
-      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumn");
-      if (unsharpMaskBlurColumnKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
+      intensity+=histogram[i].s[2];
+      if (intensity > ((double) image->columns*image->rows-white_point))
+        break;
     }
-
+    white.green=(MagickRealType) i;
+  }
+  black.blue=0.0;
+  white.blue=MaxRange(QuantumRange);
+  if ((channel & BlueChannel) != 0)
+  {
+    intensity=0.0;
+    for (i=0; i <= (ssize_t) MaxMap; i++)
     {
-      chunkSize = 256;
-
-      imageColumns = (unsigned int) image->columns;
-      imageRows = (unsigned int) image->rows;
-
-      kernelWidth = (unsigned int) kernel->width;
-
-      /* set the kernel arguments */
-      i = 0;
-      clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *) NULL);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-        goto cleanup;
-      }
+      intensity+=histogram[i].s[2];
+      if (intensity > black_point)
+        break;
     }
-
-    /* launch the kernel */
+    black.blue=(MagickRealType) i;
+    intensity=0.0;
+    for (i=(ssize_t) MaxMap; i != 0; i--)
     {
-      size_t gsize[2];
-      size_t wsize[2];
-
-      gsize[0] = chunkSize*((image->columns+chunkSize-1)/chunkSize);
-      gsize[1] = image->rows;
-      wsize[0] = chunkSize;
-      wsize[1] = 1;
-
-      clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-        goto cleanup;
-      }
-      clEnv->library->clFlush(queue);
+      intensity+=histogram[i].s[2];
+      if (intensity > ((double) image->columns*image->rows-white_point))
+        break;
     }
-
-
+    white.blue=(MagickRealType) i;
+  }
+  black.alpha=0.0;
+  white.alpha=MaxRange(QuantumRange);
+  if ((channel & OpacityChannel) != 0)
+  {
+    intensity=0.0;
+    for (i=0; i <= (ssize_t) MaxMap; i++)
     {
-      chunkSize = 256;
-      imageColumns = (unsigned int) image->columns;
-      imageRows = (unsigned int) image->rows;
-      kernelWidth = (unsigned int) kernel->width;
-      fGain = (float) gain;
-      fThreshold = (float) threshold;
-
-      i = 0;
-      clStatus=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
-
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-        goto cleanup;
-      }
+      intensity+=histogram[i].s[2];
+      if (intensity > black_point)
+        break;
     }
-
-    /* launch the kernel */
+    black.alpha=(MagickRealType) i;
+    intensity=0.0;
+    for (i=(ssize_t) MaxMap; i != 0; i--)
     {
-      size_t gsize[2];
-      size_t wsize[2];
-
-      gsize[0] = image->columns;
-      gsize[1] = chunkSize*((image->rows+chunkSize-1)/chunkSize);
-      wsize[0] = 1;
-      wsize[1] = chunkSize;
-
-      clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-        goto cleanup;
-      }
-      clEnv->library->clFlush(queue);
+      intensity+=histogram[i].s[2];
+      if (intensity > ((double) image->columns*image->rows-white_point))
+        break;
     }
-
-  }
-
-  /* get result */
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    white.alpha=(MagickRealType) i;
   }
-  if (clStatus != CL_SUCCESS)
+  /*
+  black.index=0.0;
+  white.index=MaxRange(QuantumRange);
+  if (((channel & IndexChannel) != 0) && (image->colorspace == CMYKColorspace))
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
+    intensity=0.0;
+    for (i=0; i <= (ssize_t) MaxMap; i++)
+    {
+      intensity+=histogram[i].index;
+      if (intensity > black_point)
+        break;
+    }
+    black.index=(MagickRealType) i;
+    intensity=0.0;
+    for (i=(ssize_t) MaxMap; i != 0; i--)
+    {
+      intensity+=histogram[i].index;
+      if (intensity > ((double) image->columns*image->rows-white_point))
+        break;
+    }
+    white.index=(MagickRealType) i;
   }
+  */
 
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
-  image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
+  stretch_map=(PixelPacket *) AcquireQuantumMemory(MaxMap+1UL,
+    sizeof(*stretch_map));
 
-  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
-  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
-  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
-  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse)
+  if (stretch_map == (PixelPacket *) NULL)
+    ThrowBinaryException(ResourceLimitError,"MemoryAllocationFailed",
+      image->filename);
+  /*
+    Stretch the histogram to create the stretched image mapping.
+  */
+  (void) ResetMagickMemory(stretch_map,0,(MaxMap+1)*sizeof(*stretch_map));
+  for (i=0; i <= (ssize_t) MaxMap; i++)
   {
-    if (filteredImage != NULL)
+    if ((channel & RedChannel) != 0)
     {
-      DestroyImage(filteredImage);
-      filteredImage = NULL;
+      if (i < (ssize_t) black.red)
+        stretch_map[i].red=(Quantum) 0;
+      else
+        if (i > (ssize_t) white.red)
+          stretch_map[i].red=QuantumRange;
+        else
+          if (black.red != white.red)
+            stretch_map[i].red=ScaleMapToQuantum((MagickRealType) (MaxMap*
+                  (i-black.red)/(white.red-black.red)));
     }
-  }
-  return(filteredImage);
-}
-
-static Image *ComputeUnsharpMaskImageSection(const Image *image,
-  const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,ExceptionInfo *exception)
-{
-  CacheView
-    *filteredImage_view,
-    *image_view;
-
-  char
-    geometry[MagickPathExtent];
-
-  cl_command_queue
-    queue;
-
-  cl_context
-    context;
-
-  cl_int
-    clStatus;
-
-  cl_kernel
-    blurRowKernel,
-    unsharpMaskBlurColumnKernel;
-
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    imageKernelBuffer,
-    tempImageBuffer;
-
-  cl_mem_flags
-    mem_flags;
-
-  const void
-    *inputPixels;
-
-  float
-    fGain,
-    fThreshold,
-    *kernelBufferPtr;
-
-  Image
-    *filteredImage;
-
-  int
-    chunkSize;
-
-  KernelInfo
-    *kernel;
-
-  MagickBooleanType
-    outputReady;
-
-  MagickCLEnv
-    clEnv;
-
-  MagickSizeType
-    length;
-
-  void
-    *filteredPixels,
-    *hostPtr;
-
-  unsigned int
-    i,
-    imageColumns,
-    imageRows,
-    kernelWidth;
-
-  clEnv = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  kernel = NULL;
-  context = NULL;
-  imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  tempImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  blurRowKernel = NULL;
-  unsharpMaskBlurColumnKernel = NULL;
-  queue = NULL;
-  outputReady = MagickFalse;
-
-  clEnv = GetDefaultOpenCLEnv();
-  context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-
-  /* Create and initialize OpenCL buffers. */
-  {
-    image_view=AcquireVirtualCacheView(image,exception);
-    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-    if (inputPixels == (const void *) NULL)
+    if ((channel & GreenChannel) != 0)
     {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-      goto cleanup;
+      if (i < (ssize_t) black.green)
+        stretch_map[i].green=0;
+      else
+        if (i > (ssize_t) white.green)
+          stretch_map[i].green=QuantumRange;
+        else
+          if (black.green != white.green)
+            stretch_map[i].green=ScaleMapToQuantum((MagickRealType) (MaxMap*
+                  (i-black.green)/(white.green-black.green)));
     }
-
-    /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    if ((channel & BlueChannel) != 0)
     {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+      if (i < (ssize_t) black.blue)
+        stretch_map[i].blue=0;
+      else
+        if (i > (ssize_t) white.blue)
+          stretch_map[i].blue= QuantumRange;
+        else
+          if (black.blue != white.blue)
+            stretch_map[i].blue=ScaleMapToQuantum((MagickRealType) (MaxMap*
+                  (i-black.blue)/(white.blue-black.blue)));
     }
-    else 
+    if ((channel & OpacityChannel) != 0)
     {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+      if (i < (ssize_t) black.alpha)
+        stretch_map[i].alpha=0;
+      else
+        if (i > (ssize_t) white.alpha)
+          stretch_map[i].alpha=QuantumRange;
+        else
+          if (black.alpha != white.alpha)
+            stretch_map[i].alpha=ScaleMapToQuantum((MagickRealType) (MaxMap*
+                  (i-black.alpha)/(white.alpha-black.alpha)));
     }
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-    if (clStatus != CL_SUCCESS)
+    /*
+    if (((channel & IndexChannel) != 0) &&
+        (image->colorspace == CMYKColorspace))
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
+      if (i < (ssize_t) black.index)
+        stretch_map[i].index=0;
+      else
+        if (i > (ssize_t) white.index)
+          stretch_map[i].index=QuantumRange;
+        else
+          if (black.index != white.index)
+            stretch_map[i].index=ScaleMapToQuantum((MagickRealType) (MaxMap*
+                  (i-black.index)/(white.index-black.index)));
     }
+    */
   }
 
-  /* create output */
+  /*
+    Stretch the image.
+  */
+  if (((channel & OpacityChannel) != 0) || (((channel & IndexChannel) != 0) &&
+      (image->colorspace == CMYKColorspace)))
+    image->storage_class=DirectClass;
+  if (image->storage_class == PseudoClass)
   {
-    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-    assert(filteredImage != NULL);
-    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-    if (filteredPixels == (void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-      goto cleanup;
-    }
-
-    if (ALIGNED(filteredPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-      hostPtr = filteredPixels;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY;
-      hostPtr = NULL;
-    }
-
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-    if (clStatus != CL_SUCCESS)
+    /*
+       Stretch colormap.
+       */
+    for (i=0; i < (ssize_t) image->colors; i++)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
+      if ((channel & RedChannel) != 0)
+      {
+        if (black.red != white.red)
+          image->colormap[i].red=stretch_map[
+            ScaleQuantumToMap(image->colormap[i].red)].red;
+      }
+      if ((channel & GreenChannel) != 0)
+      {
+        if (black.green != white.green)
+          image->colormap[i].green=stretch_map[
+            ScaleQuantumToMap(image->colormap[i].green)].green;
+      }
+      if ((channel & BlueChannel) != 0)
+      {
+        if (black.blue != white.blue)
+          image->colormap[i].blue=stretch_map[
+            ScaleQuantumToMap(image->colormap[i].blue)].blue;
+      }
+      if ((channel & OpacityChannel) != 0)
+      {
+        if (black.alpha != white.alpha)
+          image->colormap[i].alpha=stretch_map[
+            ScaleQuantumToMap(image->colormap[i].alpha)].alpha;
+      }
     }
   }
 
-  /* create the blur kernel */
-  {
-    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
-    kernel=AcquireKernelInfo(geometry,exception);
-    if (kernel == (KernelInfo *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
-      goto cleanup;
-    }
-
-    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  /*
+    Stretch image.
+  */
 
 
-    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-      goto cleanup;
-    }
-    for (i = 0; i < kernel->width; i++)
-    {
-      kernelBufferPtr[i] = (float) kernel->values[i];
-    }
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-      goto cleanup;
-    }
-  }
+  /* GPU can work on this again, image and stretch map as input
+    image:        uchar4 (CLPixelPacket)
+    stretch_map:  uchar4 (PixelPacket)
+    black, white: float4 (FloatPixelPacket) */
 
+#ifdef RECREATEBUFFER 
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    unsigned int offsetRows;
-    unsigned int sec;
-
-    /* create temp buffer */
-    {
-      length = image->columns * (image->rows / 2 + 1 + (kernel->width-1) / 2);
-      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-        goto cleanup;
-      }
-    }
-
-    /* get the opencl kernel */
-    {
-      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRowSection");
-      if (blurRowKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
-
-      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumnSection");
-      if (unsharpMaskBlurColumnKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
-    }
-
-    for (sec = 0; sec < 2; sec++)
-    {
-      {
-        chunkSize = 256;
-
-        imageColumns = (unsigned int) image->columns;
-        if (sec == 0)
-          imageRows = (unsigned int) (image->rows / 2 + (kernel->width-1) / 2);
-        else
-          imageRows = (unsigned int) ((image->rows - image->rows / 2) + (kernel->width-1) / 2);
-
-        offsetRows = (unsigned int) (sec * image->rows / 2);
-
-        kernelWidth = (unsigned int) kernel->width;
-
-        /* set the kernel arguments */
-        i = 0;
-        clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *) NULL);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
-        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&sec);
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-          goto cleanup;
-        }
-      }
-      /* launch the kernel */
-      {
-        size_t gsize[2];
-        size_t wsize[2];
-
-        gsize[0] = chunkSize*((imageColumns+chunkSize-1)/chunkSize);
-        gsize[1] = imageRows;
-        wsize[0] = chunkSize;
-        wsize[1] = 1;
-
-        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-          goto cleanup;
-        }
-        clEnv->library->clFlush(queue);
-      }
-
-
-      {
-        chunkSize = 256;
-
-        imageColumns = (unsigned int) image->columns;
-        if (sec == 0)
-          imageRows = (unsigned int) (image->rows / 2);
-        else
-          imageRows = (unsigned int) (image->rows - image->rows / 2);
-
-        offsetRows = (unsigned int) (sec * image->rows / 2);
-
-        kernelWidth = (unsigned int) kernel->width;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+#endif
 
-        fGain = (float) gain;
-        fThreshold = (float) threshold;
+  /* Create and initialize OpenCL buffers. */
+  if (ALIGNED(stretch_map, PixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = stretch_map;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = stretch_map;
+  }
+  /* create a CL buffer for stretch_map  */
+  length = (MaxMap+1); 
+  stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-        i = 0;
-        clStatus=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
-        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&sec);
+  /* get the OpenCL kernel */
+  stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ContrastStretch");
+  if (stretchKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-          goto cleanup;
-        }
-      }
+  /* set the kernel arguments */
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-      /* launch the kernel */
-      {
-        size_t gsize[2];
-        size_t wsize[2];
+  /* launch the kernel */
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
 
-        gsize[0] = imageColumns;
-        gsize[1] = chunkSize*((imageRows+chunkSize-1)/chunkSize);
-        wsize[0] = 1;
-        wsize[1] = chunkSize;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
 
-        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
-        if (clStatus != CL_SUCCESS)
-        {
-          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-          goto cleanup;
-        }
-        clEnv->library->clFlush(queue);
-      }
-    }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
   }
+  clEnv->library->clFlush(queue);
 
-  /* get result */
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  /* read the data back */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
@@ -3052,80 +2360,109 @@ static Image *ComputeUnsharpMaskImageSection(const Image *image,
     goto cleanup;
   }
 
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
-
-  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
-  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
-  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
-  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse)
-  {
-    if (filteredImage != NULL)
-    {
-      DestroyImage(filteredImage);
-      filteredImage = NULL;
-    }
-  }
-  return filteredImage;
-}
 
-static Image *ComputeUnsharpMaskImageSingle(const Image *image,
-  const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception)
-{
-  CacheView
-    *filteredImage_view,
-    *image_view;
+  if (imageBuffer!=NULL)                     
+    clEnv->library->clReleaseMemObject(imageBuffer);
 
-  char
-    geometry[MagickPathExtent];
+  if (stretchMapBuffer!=NULL)
+    clEnv->library->clReleaseMemObject(stretchMapBuffer);
+  if (stretch_map!=NULL)
+    stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map);
 
-  cl_command_queue
-    queue;
 
-  cl_context
-    context;
+  if (histogramBuffer!=NULL)
+    clEnv->library->clReleaseMemObject(histogramBuffer);
+  if (histogram!=NULL)
+    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
 
-  cl_int
-    justBlur,
-    clStatus;
 
-  cl_kernel
-    unsharpMaskKernel;
+  if (histogramKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, histogramKernel);
+  if (stretchKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, stretchKernel);
+
+  if (queue != NULL)                          
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  return(outputReady);
+}
+
+MagickExport MagickBooleanType AccelerateContrastStretchImage(
+  Image *image,const ChannelType channel,const double black_point,
+  const double white_point,ExceptionInfo *exception)
+{ /* Accelerated entry point for contrast stretch.  Returns MagickFalse
+     without calling the compute routine when any of the three precondition
+     checks below fails; otherwise returns the status reported by
+     ComputeContrastStretchImage(). */
+  MagickBooleanType
+    status;
+
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+      (checkAccelerateCondition(image, channel) == MagickFalse) ||
+      (checkHistogramCondition(image, channel) == MagickFalse))
+    return(MagickFalse); /* a precondition failed; compute routine not run */
+
+  status=ComputeContrastStretchImage(image,channel, black_point, white_point, exception);
+  return(status); /* compute status is passed through unchanged */
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e C o n v o l v e I m a g e                           %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+static Image *ComputeConvolveImage(const Image* image,
+  const ChannelType channel,const KernelInfo *kernel,ExceptionInfo *exception)
+{
+  CacheView
+    *filteredImage_view,
+    *image_view;
+
+  cl_command_queue
+    queue;
+
+  cl_context
+    context;
+
+  cl_kernel
+    clkernel;
+
+  cl_int
+    clStatus;
 
   cl_mem
+    convolutionKernel,
     filteredImageBuffer,
-    imageBuffer,
-    imageKernelBuffer;
+    imageBuffer;
 
   cl_mem_flags
     mem_flags;
 
+  cl_ulong
+    deviceLocalMemorySize;
+
   const void
     *inputPixels;
 
   float
-    fGain,
-    fThreshold,
     *kernelBufferPtr;
 
   Image
     *filteredImage;
 
-  KernelInfo
-    *kernel;
-
   MagickBooleanType
     outputReady;
 
@@ -3135,196 +2472,242 @@ static Image *ComputeUnsharpMaskImageSingle(const Image *image,
   MagickSizeType
     length;
 
-  void
-    *filteredPixels,
-    *hostPtr;
+  size_t
+    global_work_size[3],
+    localGroupSize[3],
+    localMemoryRequirement;
+
+  unsigned
+    kernelSize;
 
   unsigned int
+    filterHeight,
+    filterWidth,
     i,
-    imageColumns,
-    imageRows,
-    kernelWidth;
+    imageHeight,
+    imageWidth,
+    matte;
 
-  clEnv = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  kernel = NULL;
+  void
+    *filteredPixels,
+    *hostPtr;
+
+  /* initialize all CL objects to NULL */
   context = NULL;
   imageBuffer = NULL;
   filteredImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  unsharpMaskKernel = NULL;
+  convolutionKernel = NULL;
+  clkernel = NULL;
   queue = NULL;
-  outputReady = MagickFalse;
 
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  outputReady = MagickFalse;
+  
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
 
-  /* Create and initialize OpenCL buffers. */
+  image_view=AcquireVirtualCacheView(image,exception);
+  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (const void *) NULL)
   {
-    image_view=AcquireVirtualCacheView(image,exception);
-    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-    if (inputPixels == (const void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-      goto cleanup;
-    }
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    goto cleanup;
+  }
 
-    /* If the host pointer is aligned to the size of CLPixelPacket, 
+  /* Create and initialize OpenCL buffers. */
+
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
      then use the host buffer directly from the GPU; otherwise, 
      create a buffer on the GPU and copy the data over */
-    if (ALIGNED(inputPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
-    }
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
   }
-
-  /* create output */
+  else 
   {
-    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-    assert(filteredImage != NULL);
-    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-    if (filteredPixels == (void *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-      goto cleanup;
-    }
-
-    if (ALIGNED(filteredPixels,CLPixelPacket)) 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-      hostPtr = filteredPixels;
-    }
-    else 
-    {
-      mem_flags = CL_MEM_WRITE_ONLY;
-      hostPtr = NULL;
-    }
-
-    /* create a CL buffer from image pixel buffer */
-    length = image->columns * image->rows;
-    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
   }
-
-  /* create the blur kernel */
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
-    kernel=AcquireKernelInfo(geometry,exception);
-    if (kernel == (KernelInfo *) NULL)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
-      goto cleanup;
-    }
-
-    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
+  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+  assert(filteredImage != NULL);
+  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
 
-    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-      goto cleanup;
-    }
-    for (i = 0; i < kernel->width; i++)
-    {
-      kernelBufferPtr[i] = (float) kernel->values[i];
-    }
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-      goto cleanup;
-    }
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
   }
 
+  kernelSize = (unsigned int) (kernel->width * kernel->height);
+  convolutionKernel = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, kernelSize * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    /* get the opencl kernel */
-    {
-      unsharpMaskKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMask");
-      if (unsharpMaskKernel == NULL)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-        goto cleanup;
-      };
-    }
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, convolutionKernel, CL_TRUE, CL_MAP_WRITE, 0, kernelSize * sizeof(float)
+          , 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  for (i = 0; i < kernelSize; i++)
+  {
+    kernelBufferPtr[i] = (float) kernel->values[i];
+  }
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, convolutionKernel, kernelBufferPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clEnv->library->clFlush(queue);
+
+  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+
+  /* Compute the local memory requirement for a 16x16 workgroup. If it exceeds
+     the device's local memory size, reduce the workgroup size to 8x8 */
+  localGroupSize[0] = 16;
+  localGroupSize[1] = 16;
+  localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
+    + kernel->width*kernel->height*sizeof(float);
 
+  if (localMemoryRequirement > deviceLocalMemorySize)
+  {
+    localGroupSize[0] = 8;
+    localGroupSize[1] = 8;
+    localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
+      + kernel->width*kernel->height*sizeof(float);
+  }
+  if (localMemoryRequirement <= deviceLocalMemorySize) 
+  {
+    /* get the OpenCL kernel */
+    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ConvolveOptimized");
+    if (clkernel == NULL)
     {
-      imageColumns = (unsigned int) image->columns;
-      imageRows = (unsigned int) image->rows;
-      kernelWidth = (unsigned int) kernel->width;
-      fGain = (float) gain;
-      fThreshold = (float) threshold;
-      justBlur = blurOnly;
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
 
-      /* set the kernel arguments */
-      i = 0;
-      clStatus=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageRows);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_float4)*(8 * (32 + kernel->width)),(void *) NULL);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fGain);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fThreshold);
-      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_uint),(void *)&justBlur);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-        goto cleanup;
-      }
+    /* set the kernel arguments */
+    i = 0;
+    clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+    imageWidth = (unsigned int) image->columns;
+    imageHeight = (unsigned int) image->rows;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageWidth);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageHeight);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
+    filterWidth = (unsigned int) kernel->width;
+    filterHeight = (unsigned int) kernel->height;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
+    matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++, (localGroupSize[0] + kernel->width-1)*(localGroupSize[1] + kernel->height-1)*sizeof(CLPixelPacket),NULL);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++, kernel->width*kernel->height*sizeof(float),NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
     }
 
+    /* pad the global size to a multiple of the local work size dimension */
+    global_work_size[0] = ((image->columns + localGroupSize[0]  - 1)/localGroupSize[0] ) * localGroupSize[0] ;
+    global_work_size[1] = ((image->rows + localGroupSize[1] - 1)/localGroupSize[1]) * localGroupSize[1];
+
     /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, localGroupSize, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
     {
-      size_t gsize[2];
-      size_t wsize[2];
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+  else
+  {
+    /* get the OpenCL kernel */
+    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Convolve");
+    if (clkernel == NULL)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
 
-      gsize[0] = ((image->columns + 7) / 8) * 8;
-      gsize[1] = ((image->rows + 31) / 32) * 32;
-      wsize[0] = 8;
-      wsize[1] = 32;
+    /* set the kernel arguments */
+    i = 0;
+    clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+    imageWidth = (unsigned int) image->columns;
+    imageHeight = (unsigned int) image->rows;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageWidth);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageHeight);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
+    filterWidth = (unsigned int) kernel->width;
+    filterHeight = (unsigned int) kernel->height;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
+    matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
+    clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
 
-      clStatus = clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
-      if (clStatus != CL_SUCCESS)
-      {
-        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-        goto cleanup;
-      }
-      clEnv->library->clFlush(queue);
+    localGroupSize[0] = 8;
+    localGroupSize[1] = 8;
+    global_work_size[0] = (image->columns + (localGroupSize[0]-1))/localGroupSize[0] * localGroupSize[0];
+    global_work_size[1] = (image->rows    + (localGroupSize[1]-1))/localGroupSize[1] * localGroupSize[1];
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, localGroupSize, 0, NULL, NULL);
+    
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
     }
   }
+  clEnv->library->clFlush(queue);
 
-  /* get result */
   if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
@@ -3350,12 +2733,21 @@ cleanup:
   if (filteredImage_view != NULL)
     filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
-  if (unsharpMaskKernel!=NULL)                RelinquishOpenCLKernel(clEnv, unsharpMaskKernel);
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (imageBuffer != NULL)
+    clEnv->library->clReleaseMemObject(imageBuffer);
+
+  if (filteredImageBuffer != NULL)
+    clEnv->library->clReleaseMemObject(filteredImageBuffer);
+
+  if (convolutionKernel != NULL)
+    clEnv->library->clReleaseMemObject(convolutionKernel);
+
+  if (clkernel != NULL)
+    RelinquishOpenCLKernel(clEnv, clkernel);
+
+  if (queue != NULL)
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
   if (outputReady == MagickFalse)
   {
     if (filteredImage != NULL)
@@ -3364,30 +2756,25 @@ cleanup:
       filteredImage = NULL;
     }
   }
+
   return(filteredImage);
 }
 
-
-MagickExport Image *AccelerateUnsharpMaskImage(const Image *image,
-  const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,ExceptionInfo *exception)
+MagickExport Image *AccelerateConvolveImage(const Image *image,
+  const ChannelType channel,const KernelInfo *kernel,ExceptionInfo *exception) /*
+     Accelerated entry point for convolution.  Returns NULL when either
+     precondition check in the body fails; otherwise returns whatever
+     ComputeConvolveImage() returns. */
 {
   Image
     *filteredImage;
 
   assert(image != NULL);
+  assert(kernel != (KernelInfo *) NULL); /* a kernel is mandatory here */
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
       (checkAccelerateCondition(image, channel) == MagickFalse))
     return NULL;
 
-  if (radius < 12.1)
-    filteredImage = ComputeUnsharpMaskImageSingle(image,channel,radius,sigma,gain,threshold, 0, exception);
-  else if (splitImage(image) && (image->rows / 2 > radius)) 
-    filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception);
-  else
-    filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception);
+  filteredImage=ComputeConvolveImage(image, channel, kernel, exception); /* result returned unchanged */
   return(filteredImage);
 }
 
@@ -3396,494 +2783,426 @@ MagickExport Image *AccelerateUnsharpMaskImage(const Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%   A c c e l e r a t e R e s i z e I m a g e                                 %
+%     A c c e l e r a t e D e s p e c k l e I m a g e                         %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  AccelerateResizeImage() is an OpenCL implementation of ResizeImage()
-%
-%  AccelerateResizeImage() scales an image to the desired dimensions, using the given
-%  filter (see AcquireFilterInfo()).
-%
-%  If an undefined filter is given the filter defaults to Mitchell for a
-%  colormapped image, a image with a matte channel, or if the image is
-%  enlarged.  Otherwise the filter defaults to a Lanczos.
-%
-%  AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program.
-%
-%  The format of the AccelerateResizeImage method is:
-%
-%      Image *ResizeImage(Image *image,const size_t columns,
-%        const size_t rows, const ResizeFilter* filter,
-%        ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o columns: the number of columns in the scaled image.
-%
-%    o rows: the number of rows in the scaled image.
-%
-%    o filter: Image filter to use.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static MagickBooleanType resizeHorizontalFilter(cl_mem image,
-  const unsigned int imageColumns,const unsigned int imageRows,
-  const unsigned int matte,cl_mem resizedImage,
-  const unsigned int resizedColumns,const unsigned int resizedRows,
-  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
-  const float xFactor,MagickCLEnv clEnv,cl_command_queue queue,
-  ExceptionInfo *exception)
+static Image *ComputeDespeckleImage(const Image *image,
+  ExceptionInfo*exception)
 {
-  cl_kernel
-    horizontalKernel;
+  static const int 
+    X[4] = {0, 1, 1,-1},
+    Y[4] = {1, 0, 1, 1};
 
-  cl_int clStatus;
+  CacheView
+    *filteredImage_view,
+    *image_view;
 
-  const unsigned int
-    workgroupSize = 256;
+  cl_command_queue
+    queue;
 
-  float
-    resizeFilterScale,
-    resizeFilterSupport,
-    resizeFilterWindowSupport,
-    resizeFilterBlur,
-    scale,
-    support;
+  cl_context
+    context;
 
-  int
-    cacheRangeStart,
-    cacheRangeEnd,
-    numCachedPixels,
-    resizeFilterType,
-    resizeWindowType;
+  cl_int
+    clStatus;
 
-  MagickBooleanType
-    status = MagickFalse;
+  cl_kernel
+    hullPass1,
+    hullPass2;
 
-  size_t
-    deviceLocalMemorySize,
-    gammaAccumulatorLocalMemorySize,
-    global_work_size[2],
-    imageCacheLocalMemorySize,
-    pixelAccumulatorLocalMemorySize,
-    local_work_size[2],
-    totalLocalMemorySize,
-    weightAccumulatorLocalMemorySize;
+  cl_mem_flags
+    mem_flags;
 
-  unsigned int
-    chunkSize,
-    i,
-    pixelPerWorkgroup;
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer,
+    tempImageBuffer[2];
 
-  horizontalKernel = NULL;
-  status = MagickFalse;
+  const void
+    *inputPixels;
 
-  /*
-  Apply filter to resize vertically from image to resize image.
-  */
-  scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0);
-  support=scale*GetResizeFilterSupport(resizeFilter);
-  if (support < 0.5)
+  Image
+    *filteredImage;
+
+  int
+    k,
+    matte;
+
+  MagickBooleanType
+    outputReady;
+
+  MagickCLEnv
+    clEnv;
+
+  MagickSizeType
+    length;
+
+  size_t
+    global_work_size[2];
+
+  unsigned int
+    imageHeight,
+    imageWidth;
+
+  void
+    *filteredPixels,
+    *hostPtr;
+
+  outputReady = MagickFalse;
+  clEnv = NULL;
+  inputPixels = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  filteredPixels = NULL;
+  context = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  hullPass1 = NULL;
+  hullPass2 = NULL;
+  queue = NULL;
+  tempImageBuffer[0] = tempImageBuffer[1] = NULL;
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  image_view=AcquireVirtualCacheView(image,exception);
+  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (void *) NULL)
   {
-    /*
-    Support too small even for nearest neighbour: Reduce to point
-    sampling.
-    */
-    support=(MagickRealType) 0.5;
-    scale=1.0;
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    goto cleanup;
   }
-  scale=PerceptibleReciprocal(scale);
 
-  if (resizedColumns < workgroupSize)
+  if (ALIGNED(inputPixels,CLPixelPacket))
   {
-    chunkSize = 32;
-    pixelPerWorkgroup = 32;
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
   }
-  else
+  else 
   {
-    chunkSize = workgroupSize;
-    pixelPerWorkgroup = workgroupSize;
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
   }
-
-  /* get the local memory size supported by the device */
-  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
-
-DisableMSCWarning(4127)
-  while(1)
-RestoreMSCWarning
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    /* calculate the local memory size needed per workgroup */
-    cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5);
-    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5);
-    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
-    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
-    totalLocalMemorySize = imageCacheLocalMemorySize;
-
-    /* local size for the pixel accumulator */
-    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
-    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
-
-    /* local memory size for the weight accumulator */
-    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
-
-    /* local memory size for the gamma accumulator */
-    if (matte == 0)
-      gammaAccumulatorLocalMemorySize = sizeof(float);
-    else
-      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-    if (totalLocalMemorySize <= deviceLocalMemorySize)
-      break;
-    else
+  mem_flags = CL_MEM_READ_WRITE;
+  length = image->columns * image->rows;
+  for (k = 0; k < 2; k++)
+  {
+    tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
     {
-      pixelPerWorkgroup = pixelPerWorkgroup/2;
-      chunkSize = chunkSize/2;
-      if (pixelPerWorkgroup == 0
-          || chunkSize == 0)
-      {
-        /* quit, fallback to CPU */
-        goto cleanup;
-      }
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
     }
   }
 
-  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
-  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
-
+  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+  assert(filteredImage != NULL);
+  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
 
-  if (resizeFilterType == SincFastWeightingFunction
-    && resizeWindowType == SincFastWeightingFunction)
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc");
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
   }
-  else
+  else 
   {
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter");
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
   }
-  if (horizontalKernel == NULL)
+  /* create a CL buffer for the filtered (output) image pixel buffer */
+  length = image->columns * image->rows;
+  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
-  i = 0;
-  clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
-
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
-
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
-
-  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
-
-  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
-
-  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
-
-  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
-
-
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize);
-  
-
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
+  hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1");
+  hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2");
 
+  clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  imageWidth = (unsigned int) image->columns;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth);
+  imageHeight = (unsigned int) image->rows;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight);
+  matte = (image->alpha_trait == UndefinedPixelTrait)?0:1;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
-  global_work_size[1] = resizedRows;
-
-  local_work_size[0] = workgroupSize;
-  local_work_size[1] = 1;
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-  (void) local_work_size;
+  clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer);
+  imageWidth = (unsigned int) image->columns;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth);
+  imageHeight = (unsigned int) image->rows;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight);
+  matte = (image->alpha_trait == UndefinedPixelTrait)?0:1;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
-  clEnv->library->clFlush(queue);
-  status = MagickTrue;
 
 
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
-
-  if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel);
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
 
-  return(status);
-}
+  
+  for (k = 0; k < 4; k++)
+  {
+    cl_int2 offset;
+    int polarity;
 
-static MagickBooleanType resizeVerticalFilter(cl_mem image,
-  const unsigned int imageColumns,const unsigned int imageRows,
-  const unsigned int matte,cl_mem resizedImage,
-  const unsigned int resizedColumns,const unsigned int resizedRows,
-  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
-  const float yFactor,MagickCLEnv clEnv,cl_command_queue queue,
-  ExceptionInfo *exception)
-{
-  cl_kernel
-    verticalKernel;
-
-  cl_int clStatus;
-
-  const unsigned int
-    workgroupSize = 256;
+    
+    offset.s[0] = X[k];
+    offset.s[1] = Y[k];
+    polarity = 1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
 
-  float
-    resizeFilterScale,
-    resizeFilterSupport,
-    resizeFilterWindowSupport,
-    resizeFilterBlur,
-    scale,
-    support;
 
-  int
-    cacheRangeStart,
-    cacheRangeEnd,
-    numCachedPixels,
-    resizeFilterType,
-    resizeWindowType;
+    if (k == 0)
+      clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer));
+    offset.s[0] = -X[k];
+    offset.s[1] = -Y[k];
+    polarity = 1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
 
-  MagickBooleanType
-    status = MagickFalse;
+    offset.s[0] = -X[k];
+    offset.s[1] = -Y[k];
+    polarity = -1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
 
-  size_t
-    deviceLocalMemorySize,
-    gammaAccumulatorLocalMemorySize,
-    global_work_size[2],
-    imageCacheLocalMemorySize,
-    pixelAccumulatorLocalMemorySize,
-    local_work_size[2],
-    totalLocalMemorySize,
-    weightAccumulatorLocalMemorySize;
+    offset.s[0] = X[k];
+    offset.s[1] = Y[k];
+    polarity = -1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
 
-  unsigned int
-    chunkSize,
-    i,
-    pixelPerWorkgroup;
+    if (k == 3)
+      clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer);
 
-  verticalKernel = NULL;
-  status = MagickFalse;
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+  }
 
-  /*
-  Apply filter to resize vertically from image to resize image.
-  */
-  scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0);
-  support=scale*GetResizeFilterSupport(resizeFilter);
-  if (support < 0.5)
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    /*
-    Support too small even for nearest neighbour: Reduce to point
-    sampling.
-    */
-    support=(MagickRealType) 0.5;
-    scale=1.0;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
-  scale=PerceptibleReciprocal(scale);
-
-  if (resizedRows < workgroupSize) 
+  else 
   {
-    chunkSize = 32;
-    pixelPerWorkgroup = 32;
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
-  else
+  if (clStatus != CL_SUCCESS)
   {
-    chunkSize = workgroupSize;
-    pixelPerWorkgroup = workgroupSize;
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
   }
 
-  /* get the local memory size supported by the device */
-  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
-DisableMSCWarning(4127)
-  while(1)
-RestoreMSCWarning
-  {
-    /* calculate the local memory size needed per workgroup */
-    cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5);
-    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5);
-    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
-    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
-    totalLocalMemorySize = imageCacheLocalMemorySize;
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
-    /* local size for the pixel accumulator */
-    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
-    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
+  image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-    /* local memory size for the weight accumulator */
-    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  for (k = 0; k < 2; k++)
+  {
+    if (tempImageBuffer[k]!=NULL)            clEnv->library->clReleaseMemObject(tempImageBuffer[k]);
+  }
+  if (filteredImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (hullPass1!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass1);
+  if (hullPass2!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass2);
+  if (outputReady == MagickFalse && filteredImage != NULL)
+    filteredImage=DestroyImage(filteredImage);
+  return(filteredImage);
+}
 
-    /* local memory size for the gamma accumulator */
-    if (matte == 0)
-      gammaAccumulatorLocalMemorySize = sizeof(float);
-    else
-      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
+MagickExport Image *AccelerateDespeckleImage(const Image* image,
+  ExceptionInfo* exception)
+{
+  Image
+    *filteredImage;
 
-    if (totalLocalMemorySize <= deviceLocalMemorySize)
-      break;
-    else
-    {
-      pixelPerWorkgroup = pixelPerWorkgroup/2;
-      chunkSize = chunkSize/2;
-      if (pixelPerWorkgroup == 0
-          || chunkSize == 0)
-      {
-        /* quit, fallback to CPU */
-        goto cleanup;
-      }
-    }
-  }
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
 
-  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
-  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+    return NULL;
 
-  if (resizeFilterType == SincFastWeightingFunction
-    && resizeWindowType == SincFastWeightingFunction)
-    verticalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc");
-  else 
-    verticalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter");
+  filteredImage=ComputeDespeckleImage(image,exception);
+  return(filteredImage);
+}
 
-  if (verticalKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  i = 0;
-  clStatus = clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&image);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&matte);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&yFactor);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
-
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
-
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), (void*)&resizeFilterType);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), (void*)&resizeWindowType);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
-
-  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
-
-  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
-
-  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
-
-  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
-
-
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, imageCacheLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), &numCachedPixels);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), &chunkSize);
-  
-
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
-
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  global_work_size[0] = resizedColumns;
-  global_work_size[1] = (resizedRows+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
-
-  local_work_size[0] = 1;
-  local_work_size[1] = workgroupSize;
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, verticalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  clEnv->library->clFlush(queue);
-  status = MagickTrue;
-
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
-
-  if (verticalKernel != NULL) RelinquishOpenCLKernel(clEnv, verticalKernel);
-
-  return(status);
-}
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e E q u a l i z e I m a g e                           %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-static Image *ComputeResizeImage(const Image* image,
-  const size_t resizedColumns,const size_t resizedRows,
-  const ResizeFilter *resizeFilter,ExceptionInfo *exception)
+static MagickBooleanType ComputeEqualizeImage(Image *image,
+  const ChannelType channel,ExceptionInfo *exception)
 {
+#define EqualizeImageTag  "Equalize/Image"
+
   CacheView
-    *filteredImage_view,
     *image_view;
 
   cl_command_queue
     queue;
 
-  cl_int
-    clStatus;
-
   cl_context
     context;
 
-  cl_mem
-    cubicCoefficientsBuffer,
-    filteredImageBuffer,
-    imageBuffer,
-    tempImageBuffer;
+  cl_int
+    clStatus;
 
   cl_mem_flags
     mem_flags;
 
-  const double
-    *resizeFilterCoefficient;
+  cl_mem
+    equalizeMapBuffer,
+    histogramBuffer,
+    imageBuffer;
 
-  const void
-    *inputPixels;
+  cl_kernel
+    equalizeKernel,
+    histogramKernel;
 
-  float
-    *mappedCoefficientBuffer,
-    xFactor,
-    yFactor;
+  cl_uint4
+    *histogram;
+
+  FloatPixelPacket
+    white,
+    black,
+    intensity,
+    *map;
 
   MagickBooleanType
     outputReady,
@@ -3895,40 +3214,65 @@ static Image *ComputeResizeImage(const Image* image,
   MagickSizeType
     length;
 
-  Image
-    *filteredImage;
+  PixelPacket
+    *equalize_map;
 
-  unsigned int
-    i,
-    matte;
+  register ssize_t
+    i;
+
+  size_t
+    global_work_size[2];
 
   void
-    *filteredPixels,
-    *hostPtr;
+    *hostPtr,
+    *inputPixels;
 
-  outputReady = MagickFalse;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  clEnv = NULL;
-  context = NULL;
+  map=NULL;
+  histogram=NULL;
+  equalize_map=NULL;
+  inputPixels = NULL;
   imageBuffer = NULL;
-  tempImageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  cubicCoefficientsBuffer = NULL;
+  histogramBuffer = NULL;
+  equalizeMapBuffer = NULL;
+  histogramKernel = NULL; 
+  equalizeKernel = NULL; 
+  context = NULL;
   queue = NULL;
+  outputReady = MagickFalse;
+
+  assert(image != (Image *) NULL);
+  assert(image->signature == MagickCoreSignature);
+  if (image->debug != MagickFalse)
+    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",image->filename);
 
+  /*
+   * initialize opencl env
+   */
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /*
+    Allocate and initialize histogram arrays.
+  */
+  histogram=(cl_uint4 *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*histogram));
+  if (histogram == (cl_uint4 *) NULL)
+      ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
+
+  /* reset histogram */
+  (void) ResetMagickMemory(histogram,0,(MaxMap+1)*sizeof(*histogram));
 
   /* Create and initialize OpenCL buffers. */
-  image_view=AcquireVirtualCacheView(image,exception);
-  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (const void *) NULL)
+  /* inputPixels = AcquirePixelCachePixels(image, &length, exception); */
+  /* assume this will get a writable image */
+  image_view=AcquireAuthenticCacheView(image,exception);
+  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
+
+  if (inputPixels == (void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
-
   /* If the host pointer is aligned to the size of CLPixelPacket, 
      then use the host buffer directly from the GPU; otherwise, 
      create a buffer on the GPU and copy the data over */
@@ -3949,203 +3293,327 @@ static Image *ComputeResizeImage(const Image* image,
     goto cleanup;
   }
 
-  cubicCoefficientsBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, 7 * sizeof(float), NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  mappedCoefficientBuffer = (float*)clEnv->library->clEnqueueMapBuffer(queue, cubicCoefficientsBuffer, CL_TRUE, CL_MAP_WRITE, 0, 7 * sizeof(float)
-          , 0, NULL, NULL, &clStatus);
-  if (clStatus != CL_SUCCESS)
+  /* If the host pointer is aligned to the size of cl_uint4,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(histogram,cl_uint4)) 
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-    goto cleanup;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+    hostPtr = histogram;
   }
-  resizeFilterCoefficient = GetResizeFilterCoefficient(resizeFilter);
-  for (i = 0; i < 7; i++)
+  else 
   {
-    mappedCoefficientBuffer[i] = (float) resizeFilterCoefficient[i];
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = histogram;
   }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, cubicCoefficientsBuffer, mappedCoefficientBuffer, 0, NULL, NULL);
+  /* create a CL buffer for histogram  */
+  length = (MaxMap+1); 
+  histogramBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(cl_uint4), hostPtr, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
-  filteredImage = CloneImage(image,resizedColumns,resizedRows,MagickTrue,exception);
-  if (filteredImage == NULL)
-    goto cleanup;
-
-  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-  if (filteredPixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+  status = LaunchHistogramKernel(clEnv, queue, imageBuffer, histogramBuffer, image, channel, exception);
+  if (status == MagickFalse)
     goto cleanup;
-  }
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  /* read from the kernel output */
+  if (ALIGNED(histogram,cl_uint4)) 
   {
-    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = filteredPixels;
+    length = (MaxMap+1); 
+    clEnv->library->clEnqueueMapBuffer(queue, histogramBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(cl_uint4), 0, NULL, NULL, &clStatus);
   }
   else 
   {
-    mem_flags = CL_MEM_WRITE_ONLY;
-    hostPtr = NULL;
+    length = (MaxMap+1); 
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, histogramBuffer, CL_TRUE, 0, length * sizeof(cl_uint4), histogram, 0, NULL, NULL);
   }
-
-  /* create a CL buffer from image pixel buffer */
-  length = filteredImage->columns * filteredImage->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  xFactor=(float) resizedColumns/(float) image->columns;
-  yFactor=(float) resizedRows/(float) image->rows;
-  matte=(image->alpha_trait != UndefinedPixelTrait)?1:0;
-  if (xFactor > yFactor)
+  /* unmap so the GPU is not blocked from using this buffer again */
+  if (ALIGNED(histogram,cl_uint4))
   {
-
-    length = resizedColumns*image->rows;
-    tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, histogramBuffer, histogram, 0, NULL, NULL);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
       goto cleanup;
     }
-    
-    status = resizeHorizontalFilter(imageBuffer, (unsigned int) image->columns, (unsigned int) image->rows, matte
-          , tempImageBuffer, (unsigned int) resizedColumns, (unsigned int) image->rows
-          , resizeFilter, cubicCoefficientsBuffer
-          , xFactor, clEnv, queue, exception);
-    if (status != MagickTrue)
-      goto cleanup;
-    
-    status = resizeVerticalFilter(tempImageBuffer, (unsigned int) resizedColumns, (unsigned int) image->rows, matte
-       , filteredImageBuffer, (unsigned int) resizedColumns, (unsigned int) resizedRows
-       , resizeFilter, cubicCoefficientsBuffer
-       , yFactor, clEnv, queue, exception);
-    if (status != MagickTrue)
-      goto cleanup;
   }
-  else
+
+  /* recreate input buffer later, in case image updated */
+#ifdef RECREATEBUFFER 
+  if (imageBuffer!=NULL)                     
+    clEnv->library->clReleaseMemObject(imageBuffer);
+#endif
+  /* CPU stuff */
+  equalize_map=(PixelPacket *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*equalize_map));
+  if (equalize_map == (PixelPacket *) NULL)
+    ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
+
+  map=(FloatPixelPacket *) AcquireQuantumMemory(MaxMap+1UL,sizeof(*map));
+  if (map == (FloatPixelPacket *) NULL)
+    ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
+
+  /*
+    Integrate the histogram to get the equalization map.
+  */
+  (void) ResetMagickMemory(&intensity,0,sizeof(intensity));
+  for (i=0; i <= (ssize_t) MaxMap; i++)
   {
-    length = image->columns*resizedRows;
-    tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
+    if ((channel & SyncChannels) != 0)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
+      intensity.red+=histogram[i].s[2];
+      map[i]=intensity;
+      continue;
     }
+    if ((channel & RedChannel) != 0)
+      intensity.red+=histogram[i].s[2];
+    if ((channel & GreenChannel) != 0)
+      intensity.green+=histogram[i].s[1];
+    if ((channel & BlueChannel) != 0)
+      intensity.blue+=histogram[i].s[0];
+    if ((channel & OpacityChannel) != 0)
+      intensity.alpha+=histogram[i].s[3];
+    /*
+    if (((channel & IndexChannel) != 0) &&
+        (image->colorspace == CMYKColorspace))
+    {
+      intensity.index+=histogram[i].index; 
+    }
+    */
+    map[i]=intensity;
+  }
+  black=map[0];
+  white=map[(int) MaxMap];
+  (void) ResetMagickMemory(equalize_map,0,(MaxMap+1)*sizeof(*equalize_map));
+  for (i=0; i <= (ssize_t) MaxMap; i++)
+  {
+    if ((channel & SyncChannels) != 0)
+    {
+      if (white.red != black.red)
+        equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+                (map[i].red-black.red))/(white.red-black.red)));
+      continue;
+    }
+    if (((channel & RedChannel) != 0) && (white.red != black.red))
+      equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+              (map[i].red-black.red))/(white.red-black.red)));
+    if (((channel & GreenChannel) != 0) && (white.green != black.green))
+      equalize_map[i].green=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+              (map[i].green-black.green))/(white.green-black.green)));
+    if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
+      equalize_map[i].blue=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+              (map[i].blue-black.blue))/(white.blue-black.blue)));
+    if (((channel & OpacityChannel) != 0) && (white.alpha != black.alpha))
+      equalize_map[i].alpha=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+              (map[i].alpha-black.alpha))/(white.alpha-black.alpha)));
+    /*
+    if ((((channel & IndexChannel) != 0) &&
+          (image->colorspace == CMYKColorspace)) &&
+        (white.index != black.index))
+      equalize_map[i].index=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+              (map[i].index-black.index))/(white.index-black.index)));
+    */
+  }
 
-    status = resizeVerticalFilter(imageBuffer, (unsigned int) image->columns, (unsigned int) image->rows, matte
-       , tempImageBuffer, (unsigned int) image->columns, (unsigned int) resizedRows
-       , resizeFilter, cubicCoefficientsBuffer
-       , yFactor, clEnv, queue, exception);
-    if (status != MagickTrue)
-      goto cleanup;
-
-    status = resizeHorizontalFilter(tempImageBuffer, (unsigned int) image->columns, (unsigned int) resizedRows, matte
-       , filteredImageBuffer, (unsigned int) resizedColumns, (unsigned int) resizedRows
-       , resizeFilter, cubicCoefficientsBuffer
-       , xFactor, clEnv, queue, exception);
-    if (status != MagickTrue)
-      goto cleanup;
+  if (image->storage_class == PseudoClass)
+  {
+    /*
+       Equalize colormap.
+       */
+    for (i=0; i < (ssize_t) image->colors; i++)
+    {
+      if ((channel & SyncChannels) != 0)
+      {
+        if (white.red != black.red)
+        {
+          image->colormap[i].red=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].red)].red;
+          image->colormap[i].green=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].green)].red;
+          image->colormap[i].blue=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].blue)].red;
+          image->colormap[i].alpha=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].alpha)].red;
+        }
+        continue;
+      }
+      if (((channel & RedChannel) != 0) && (white.red != black.red))
+        image->colormap[i].red=equalize_map[
+          ScaleQuantumToMap(image->colormap[i].red)].red;
+      if (((channel & GreenChannel) != 0) && (white.green != black.green))
+        image->colormap[i].green=equalize_map[
+          ScaleQuantumToMap(image->colormap[i].green)].green;
+      if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
+        image->colormap[i].blue=equalize_map[
+          ScaleQuantumToMap(image->colormap[i].blue)].blue;
+      if (((channel & OpacityChannel) != 0) &&
+          (white.alpha != black.alpha))
+        image->colormap[i].alpha=equalize_map[
+          ScaleQuantumToMap(image->colormap[i].alpha)].alpha;
+    }
   }
-  length = resizedColumns*resizedRows;
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+
+  /*
+    Equalize image.
+  */
+
+  /* GPU can work on this again, image and equalize map as input
+    image:        uchar4 (CLPixelPacket)
+    equalize_map: uchar4 (PixelPacket)
+    black, white: float4 (FloatPixelPacket) */
+
+#ifdef RECREATEBUFFER 
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
   else 
   {
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
   }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows;
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+#endif
 
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  /* Create and initialize OpenCL buffers. */
+  if (ALIGNED(equalize_map, PixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = equalize_map;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = equalize_map;
+  }
+  /* create a CL buffer for equalize_map */
+  length = (MaxMap+1); 
+  equalizeMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-  image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
+  /* get the OpenCL kernel */
+  equalizeKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Equalize");
+  if (equalizeKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  if (imageBuffer!=NULL)                 clEnv->library->clReleaseMemObject(imageBuffer);
-  if (tempImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(tempImageBuffer);
-  if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (cubicCoefficientsBuffer!=NULL)      clEnv->library->clReleaseMemObject(cubicCoefficientsBuffer);
-  if (queue != NULL)                     RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse && filteredImage != NULL)
-    filteredImage=DestroyImage(filteredImage);
-  return(filteredImage);
-}
+  /* set the kernel arguments */
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&equalizeMapBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&white);
+  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&black);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-const ResizeWeightingFunctionType supportedResizeWeighting[] = 
-{
-  BoxWeightingFunction,
-  TriangleWeightingFunction,
-  HannWeightingFunction,
-  HammingWeightingFunction,
-  BlackmanWeightingFunction,
-  CubicBCWeightingFunction,
-  SincWeightingFunction,
-  SincFastWeightingFunction,
-  LastWeightingFunction
-};
+  /* launch the kernel */
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
 
-static MagickBooleanType gpuSupportedResizeWeighting(
-  ResizeWeightingFunctionType f)
-{
-  unsigned int
-    i;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, equalizeKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
 
-  for (i = 0; ;i++)
+  if (clStatus != CL_SUCCESS)
   {
-    if (supportedResizeWeighting[i] == LastWeightingFunction)
-      break;
-    if (supportedResizeWeighting[i] == f)
-      return(MagickTrue);
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
   }
-  return(MagickFalse);
+  clEnv->library->clFlush(queue);
+
+  /* read the data back */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+
+  image_view=DestroyCacheView(image_view);
+
+  if (imageBuffer!=NULL)                     
+    clEnv->library->clReleaseMemObject(imageBuffer);
+
+  if (map!=NULL)
+    map=(FloatPixelPacket *) RelinquishMagickMemory(map);
+
+  if (equalizeMapBuffer!=NULL)
+    clEnv->library->clReleaseMemObject(equalizeMapBuffer);
+  if (equalize_map!=NULL)
+    equalize_map=(PixelPacket *) RelinquishMagickMemory(equalize_map);
+
+  if (histogramBuffer!=NULL)                 
+    clEnv->library->clReleaseMemObject(histogramBuffer);
+  if (histogram!=NULL)
+    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
+
+  if (histogramKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, histogramKernel);
+  if (equalizeKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, equalizeKernel);
+
+  if (queue != NULL)                          
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  return(outputReady);
 }
 
-MagickExport Image *AccelerateResizeImage(const Image *image,
-  const size_t resizedColumns,const size_t resizedRows,
-  const ResizeFilter *resizeFilter,ExceptionInfo *exception) 
+MagickExport MagickBooleanType AccelerateEqualizeImage(Image *image,
+  const ChannelType channel,ExceptionInfo *exception)
 {
-  Image
-    *filteredImage;
+  MagickBooleanType
+    status;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return NULL;
-
-  if (gpuSupportedResizeWeighting(GetResizeFilterWeightingType(resizeFilter)) == MagickFalse ||
-      gpuSupportedResizeWeighting(GetResizeFilterWindowWeightingType(resizeFilter)) == MagickFalse)
-    return NULL;
+      (checkAccelerateCondition(image, channel) == MagickFalse) ||
+      (checkHistogramCondition(image, channel) == MagickFalse))
+    return(MagickFalse);
 
-  filteredImage=ComputeResizeImage(image,resizedColumns,resizedRows,resizeFilter,exception);
-  return(filteredImage);
+  status=ComputeEqualizeImage(image,channel,exception);
+  return(status);
 }
 
 /*
@@ -4153,31 +3621,17 @@ MagickExport Image *AccelerateResizeImage(const Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     C o n t r a s t I m a g e  w i t h  O p e n C L                         %
+%     A c c e l e r a t e F u n c t i o n I m a g e                           %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  ContrastImage() enhances the intensity differences between the lighter and
-%  darker elements of the image.  Set sharpen to a MagickTrue to increase the
-%  image contrast otherwise the contrast is reduced.
-%
-%  The format of the ContrastImage method is:
-%
-%      MagickBooleanType ContrastImage(Image *image,
-%        const MagickBooleanType sharpen)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o sharpen: Increase or decrease image contrast.
-%
 */
 
-static MagickBooleanType ComputeContrastImage(Image *image,
-  const MagickBooleanType sharpen,ExceptionInfo *exception)
+static MagickBooleanType ComputeFunctionImage(Image *image,
+  const ChannelType channel,const MagickFunction function,
+  const size_t number_parameters,const double *parameters,
+  ExceptionInfo *exception)
 {
   CacheView
     *image_view;
@@ -4192,16 +3646,20 @@ static MagickBooleanType ComputeContrastImage(Image *image,
     clStatus;
 
   cl_kernel
-    filterKernel;
+    clkernel;
 
   cl_mem
-    imageBuffer;
+    imageBuffer,
+    parametersBuffer;
 
   cl_mem_flags
     mem_flags;
 
+  float
+    *parametersBufferPtr;
+
   MagickBooleanType
-    outputReady;
+    status;
 
   MagickCLEnv
     clEnv;
@@ -4210,39 +3668,37 @@ static MagickBooleanType ComputeContrastImage(Image *image,
     length;
 
   size_t
-    global_work_size[2];
+    globalWorkSize[2];
 
   unsigned int
-    i,
-    uSharpen;
+    i;
 
   void
-    *inputPixels;
+    *pixels;
+
+  status = MagickFalse;
 
-  outputReady = MagickFalse;
-  clEnv = NULL;
-  inputPixels = NULL;
   context = NULL;
-  imageBuffer = NULL;
-  filterKernel = NULL;
+  clkernel = NULL;
   queue = NULL;
+  imageBuffer = NULL;
+  parametersBuffer = NULL;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
 
-  /* Create and initialize OpenCL buffers. */
   image_view=AcquireAuthenticCacheView(image,exception);
-  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
+  pixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (pixels == (void *) NULL)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning,
+      "GetPixelCachePixels failed.",
+      "'%s'", image->filename);
     goto cleanup;
   }
 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
+
+  if (ALIGNED(pixels,CLPixelPacket)) 
   {
     mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
@@ -4252,36 +3708,65 @@ static MagickBooleanType ComputeContrastImage(Image *image,
   }
   /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)pixels, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
-  
-  filterKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Contrast");
-  if (filterKernel == NULL)
+
+  parametersBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, number_parameters * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  parametersBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, parametersBuffer, CL_TRUE, CL_MAP_WRITE, 0, number_parameters * sizeof(float)
+                , 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  for (i = 0; i < number_parameters; i++)
+  {
+    parametersBufferPtr[i] = (float)parameters[i];
+  }
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, parametersBuffer, parametersBufferPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clEnv->library->clFlush(queue);
+
+  clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "FunctionImage");
+  if (clkernel == NULL)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
   }
 
+  /* set the kernel arguments */
   i = 0;
-  clStatus=clEnv->library->clSetKernelArg(filterKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-
-  uSharpen = (sharpen == MagickFalse)?0:1;
-  clStatus|=clEnv->library->clSetKernelArg(filterKernel,i++,sizeof(cl_uint),&uSharpen);
+  clStatus =clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(MagickFunction),(void *)&function);
+  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&number_parameters);
+  clStatus|=clEnv->library->clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&parametersBuffer);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
+  globalWorkSize[0] = image->columns;
+  globalWorkSize[1] = image->rows;
   /* launch the kernel */
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, filterKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
@@ -4289,7 +3774,8 @@ static MagickBooleanType ComputeContrastImage(Image *image,
   }
   clEnv->library->clFlush(queue);
 
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
+
+  if (ALIGNED(pixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
     clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
@@ -4297,28 +3783,32 @@ static MagickBooleanType ComputeContrastImage(Image *image,
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), pixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+  status=SyncCacheViewAuthenticPixels(image_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
+  
+  if (clkernel != NULL) RelinquishOpenCLKernel(clEnv, clkernel);
+  if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (imageBuffer != NULL) clEnv->library->clReleaseMemObject(imageBuffer);
+  if (parametersBuffer != NULL) clEnv->library->clReleaseMemObject(parametersBuffer);
 
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filterKernel!=NULL)                     RelinquishOpenCLKernel(clEnv, filterKernel);
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  return(outputReady);
+  return(status);
 }
 
-MagickExport MagickBooleanType AccelerateContrastImage(Image *image,
-  const MagickBooleanType sharpen,ExceptionInfo *exception)
+MagickExport MagickBooleanType AccelerateFunctionImage(Image *image,
+  const ChannelType channel,const MagickFunction function,
+  const size_t number_parameters,const double *parameters,
+  ExceptionInfo *exception)
 {
   MagickBooleanType
     status;
@@ -4327,10 +3817,10 @@ MagickExport MagickBooleanType AccelerateContrastImage(Image *image,
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+      (checkAccelerateCondition(image, channel) == MagickFalse))
     return(MagickFalse);
 
-  status = ComputeContrastImage(image,sharpen,exception);
+  status=ComputeFunctionImage(image, channel, function, number_parameters, parameters, exception);
   return(status);
 }
 
@@ -4339,55 +3829,34 @@ MagickExport MagickBooleanType AccelerateContrastImage(Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     M o d u l a t e I m a g e  w i t h  O p e n C L                         %
+%     A c c e l e r a t e G r a y s c a l e I m a g e                         %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  ModulateImage() lets you control the brightness, saturation, and hue
-%  of an image.  Modulate represents the brightness, saturation, and hue
-%  as one parameter (e.g. 90,150,100).  If the image colorspace is HSL, the
-%  modulation is lightness, saturation, and hue.  For HWB, use blackness,
-%  whiteness, and hue. And for HCL, use chrome, luma, and hue.
-%
-%  The format of the ModulateImage method is:
-%
-%      MagickBooleanType ModulateImage(Image *image,const char *modulate)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o percent_*: Define the percent change in brightness, saturation, and
-%      hue.
-%
 */
 
-MagickBooleanType ComputeModulateImage(Image *image,
-  double percent_brightness,double percent_hue,double percent_saturation,
-  ColorspaceType colorspace,ExceptionInfo *exception)
+static MagickBooleanType ComputeGrayscaleImage(Image *image,
+  const PixelIntensityMethod method,ExceptionInfo *exception)
 {
   CacheView
     *image_view;
 
-  cl_float
-    bright,
-    hue,
-    saturation;
+  cl_command_queue
+    queue;
 
   cl_context
     context;
 
-  cl_command_queue
-    queue;
+  cl_int
+    clStatus,
+    intensityMethod;
 
   cl_int
-    color,
-    clStatus;
+    colorspace;
 
   cl_kernel
-    modulateKernel;
+    grayscaleKernel;
 
   cl_mem
     imageBuffer;
@@ -4412,7 +3881,7 @@ MagickBooleanType ComputeModulateImage(Image *image,
 
   inputPixels = NULL;
   imageBuffer = NULL;
-  modulateKernel = NULL; 
+  grayscaleKernel = NULL; 
 
   assert(image != (Image *) NULL);
   assert(image->signature == MagickCoreSignature);
@@ -4461,24 +3930,20 @@ MagickBooleanType ComputeModulateImage(Image *image,
     goto cleanup;
   }
 
-  modulateKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Modulate");
-  if (modulateKernel == NULL)
+  intensityMethod = method;
+  colorspace = image->colorspace;
+
+  grayscaleKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Grayscale");
+  if (grayscaleKernel == NULL)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  bright=percent_brightness;
-  hue=percent_hue;
-  saturation=percent_saturation;
-  color=colorspace;
-
   i = 0;
-  clStatus=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&bright);
-  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&hue);
-  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&saturation);
-  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&color);
+  clStatus=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_int),&intensityMethod);
+  clStatus|=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_int),&colorspace);
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
@@ -4491,7 +3956,7 @@ MagickBooleanType ComputeModulateImage(Image *image,
     global_work_size[0] = image->columns;
     global_work_size[1] = image->rows;
     /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, modulateKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, grayscaleKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
     if (clStatus != CL_SUCCESS)
     {
       (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
@@ -4525,19 +3990,17 @@ cleanup:
 
   if (imageBuffer!=NULL)                     
     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (modulateKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, modulateKernel);
+  if (grayscaleKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, grayscaleKernel);
   if (queue != NULL)                          
     RelinquishOpenCLCommandQueue(clEnv, queue);
 
-  return outputReady;
-
+  return( outputReady);
 }
 
-MagickExport MagickBooleanType AccelerateModulateImage(Image *image,
-  double percent_brightness,double percent_hue,double percent_saturation,
-  ColorspaceType colorspace,ExceptionInfo *exception)
-{
+MagickExport MagickBooleanType AccelerateGrayscaleImage(Image* image,
+  const PixelIntensityMethod method,ExceptionInfo *exception)
+{
   MagickBooleanType
     status;
 
@@ -4548,10 +4011,13 @@ MagickExport MagickBooleanType AccelerateModulateImage(Image *image,
       (checkAccelerateCondition(image, AllChannels) == MagickFalse))
     return(MagickFalse);
 
-  if ((colorspace != HSLColorspace && colorspace != UndefinedColorspace))
+  if (method == Rec601LuminancePixelIntensityMethod || method == Rec709LuminancePixelIntensityMethod)
     return(MagickFalse);
 
-  status = ComputeModulateImage(image,percent_brightness, percent_hue, percent_saturation, colorspace, exception);
+  if (image->colorspace != sRGBColorspace)
+    return(MagickFalse);
+
+  status=ComputeGrayscaleImage(image,method,exception);
   return(status);
 }
 
@@ -4560,31 +4026,18 @@ MagickExport MagickBooleanType AccelerateModulateImage(Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     G r a y s c a l e I m a g e  w i t h  O p e n C L                       %
+%     A c c e l e r a t e L o c a l C o n t r a s t I m a g e                 %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  GrayscaleImage() converts the colors in the reference image to gray.
-%
-%  The format of the GrayscaleImageChannel method is:
-%
-%      MagickBooleanType GrayscaleImage(Image *image,
-%        const PixelIntensityMethod method)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel.
-%
 */
 
-MagickBooleanType ComputeGrayscaleImage(Image *image,
-  const PixelIntensityMethod method,ExceptionInfo *exception)
+static Image *ComputeLocalContrastImage(const Image *image,
+  const double radius,const double strength,ExceptionInfo *exception)
 {
   CacheView
+    *filteredImage_view,
     *image_view;
 
   cl_command_queue
@@ -4595,20 +4048,30 @@ MagickBooleanType ComputeGrayscaleImage(Image *image,
 
   cl_int
     clStatus,
-    intensityMethod;
-
-  cl_int
-    colorspace;
+    iRadius;
 
   cl_kernel
-    grayscaleKernel;
+    blurRowKernel,
+    blurColumnKernel;
+
+  cl_event
+    event;
 
   cl_mem
-    imageBuffer;
+    filteredImageBuffer,
+    imageBuffer,
+    imageKernelBuffer,
+    tempImageBuffer;
 
   cl_mem_flags
     mem_flags;
 
+  const void
+    *inputPixels;
+
+  Image
+    *filteredImage;
+
   MagickBooleanType
     outputReady;
 
@@ -4618,294 +4081,329 @@ MagickBooleanType ComputeGrayscaleImage(Image *image,
   MagickSizeType
     length;
 
-  register ssize_t
-    i;
-
   void
-    *inputPixels;
+    *filteredPixels,
+    *hostPtr;
 
-  inputPixels = NULL;
-  imageBuffer = NULL;
-  grayscaleKernel = NULL; 
+  unsigned int
+    i,
+    imageColumns,
+    imageRows,
+    passes;
 
-  assert(image != (Image *) NULL);
-  assert(image->signature == MagickCoreSignature);
-  if (image->debug != MagickFalse)
-    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",image->filename);
+  clEnv = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  context = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  tempImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  blurColumnKernel = NULL;
+  queue = NULL;
+  outputReady = MagickFalse;
 
-  /*
-   * initialize opencl env
-   */
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
 
-  outputReady = MagickFalse;
-
-  /* Create and initialize OpenCL buffers.
-   inputPixels = AcquirePixelCachePixels(image, &length, exception);
-   assume this  will get a writable image
-   */
-  image_view=AcquireAuthenticCacheView(image,exception);
-  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
+  /* Create and initialize OpenCL buffers. */
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-    goto cleanup;
-  }
+    image_view=AcquireVirtualCacheView(image,exception);
+    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+      goto cleanup;
+    }
 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-   then use the host buffer directly from the GPU; otherwise, 
-   create a buffer on the GPU and copy the data over
-   */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
   }
 
-  intensityMethod = method;
-  colorspace = image->colorspace;
-
-  grayscaleKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Grayscale");
-  if (grayscaleKernel == NULL)
+  /* create output */
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
+    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+    assert(filteredImage != NULL);
+    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
 
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_int),&intensityMethod);
-  clStatus|=clEnv->library->clSetKernelArg(grayscaleKernel,i++,sizeof(cl_int),&colorspace);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    printf("no kernel\n");
-    goto cleanup;
-  }
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
 
-  {
-    size_t global_work_size[2];
-    global_work_size[0] = image->columns;
-    global_work_size[1] = image->rows;
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, grayscaleKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
       goto cleanup;
     }
-    clEnv->library->clFlush(queue);
   }
 
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
-  }
+    /* create temp buffer */
+    {
+      length = image->columns * image->rows;
+      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
 
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+    /* get the opencl kernel */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "LocalContrastBlurRow");
+      if (blurRowKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
 
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "LocalContrastBlurApplyColumn");
+      if (blurColumnKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
 
-  image_view=DestroyCacheView(image_view);
+    {
+      imageColumns = (unsigned int) image->columns;
+      imageRows = (unsigned int) image->rows;
+      iRadius = (cl_int) fabs(radius);
 
-  if (imageBuffer!=NULL)                     
-    clEnv->library->clReleaseMemObject(imageBuffer);
-  if (grayscaleKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, grayscaleKernel);
-  if (queue != NULL)                          
-    RelinquishOpenCLCommandQueue(clEnv, queue);
+      passes = ((1.0f * imageColumns) * imageColumns * iRadius) / 4000000000.0f;
+      passes = (passes < 1) ? 1: passes;
 
-  return( outputReady);
-}
+      /* set the kernel arguments */
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_int),(void *)&iRadius);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
 
-MagickExport MagickBooleanType AccelerateGrayscaleImage(Image* image,
-  const PixelIntensityMethod method,ExceptionInfo *exception)
-{
-  MagickBooleanType
-    status;
-
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
-
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return(MagickFalse);
-
-  if (method == Rec601LuminancePixelIntensityMethod || method == Rec709LuminancePixelIntensityMethod)
-    return(MagickFalse);
-
-  if (image->colorspace != sRGBColorspace)
-    return(MagickFalse);
-
-  status=ComputeGrayscaleImage(image,method,exception);
-  return(status);
-}
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     E q u a l i z e I m a g e  w i t h  O p e n C L                         %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  EqualizeImage() applies a histogram equalization to the image.
-%
-%  The format of the EqualizeImage method is:
-%
-%      MagickBooleanType EqualizeImage(Image *image)
-%      MagickBooleanType EqualizeImageChannel(Image *image,
-%        const ChannelType channel)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel.
-%
-*/
+    /* launch the kernel */
+    {
+      int x;
+      for (x = 0; x < passes; ++x) {
+        size_t gsize[2];
+        size_t wsize[2];
+        size_t goffset[2];
 
-static MagickBooleanType LaunchHistogramKernel(MagickCLEnv clEnv,
-  cl_command_queue queue,cl_mem imageBuffer,cl_mem histogramBuffer,
-  Image *image,const ChannelType channel,ExceptionInfo *exception)
-{
-  MagickBooleanType
-    outputReady;
+        gsize[0] = 256;
+        gsize[1] = image->rows / passes;
+        wsize[0] = 256;
+        wsize[1] = 1;
+        goffset[0] = 0;
+        goffset[1] = x * gsize[1];
 
-  cl_int
-    clStatus,
-    colorspace,
-    method;
+        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, goffset, gsize, wsize, 0, NULL, &event);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+    }
 
-  cl_kernel
-    histogramKernel; 
+    {
+      cl_float FStrength = strength;
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&iRadius);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(cl_float),(void *)&FStrength);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
 
-  register ssize_t
-    i;
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
 
-  size_t
-    global_work_size[2];
+    /* launch the kernel */
+    {
+      int x;
+      for (x = 0; x < passes; ++x) {
+        size_t gsize[2];
+        size_t wsize[2];
+        size_t goffset[2];
 
-  histogramKernel = NULL; 
+        gsize[0] = ((image->columns + 3) / 4) * 4;
+        gsize[1] = ((((image->rows + 63) / 64) + (passes + 1)) / passes) * 64;
+        wsize[0] = 4;
+        wsize[1] = 64;
+        goffset[0] = 0;
+        goffset[1] = x * gsize[1];
 
-  outputReady = MagickFalse;
-  method = image->intensity;
-  colorspace = image->colorspace;
+        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurColumnKernel, 2, goffset, gsize, wsize, 0, NULL, &event);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+    }
+  }
 
-  /* get the OpenCL kernel */
-  histogramKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Histogram");
-  if (histogramKernel == NULL)
+  /* get result */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
-
-  /* set the kernel arguments */
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(ChannelType),&channel);
-  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_int),&method);
-  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_int),&colorspace);
-  clStatus|=clEnv->library->clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&histogramBuffer);
-  if (clStatus != CL_SUCCESS)
+  else 
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
-
-  /* launch the kernel */
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
-
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, histogramKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-  clEnv->library->clFlush(queue);
 
-  outputReady = MagickTrue;
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
-  if (histogramKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, histogramKernel);
 
-  return(outputReady);
+  image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
+
+  if (imageBuffer!=NULL)                      clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
+  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (blurColumnKernel!=NULL)                 RelinquishOpenCLKernel(clEnv, blurColumnKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return(filteredImage);
 }
 
-MagickExport MagickBooleanType ComputeEqualizeImage(Image *image,
-  const ChannelType channel,ExceptionInfo *exception)
+MagickExport Image *AccelerateLocalContrastImage(const Image *image,
+  const double radius,const double strength,ExceptionInfo *exception)
 {
-#define EqualizeImageTag  "Equalize/Image"
+  Image
+    *filteredImage;
+
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+    (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+    return NULL;
+
+  filteredImage=ComputeLocalContrastImage(image,radius,strength,exception);
+
+  return(filteredImage);
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e M o d u l a t e I m a g e                           %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
+static MagickBooleanType ComputeModulateImage(Image *image,
+  double percent_brightness,double percent_hue,double percent_saturation,
+  ColorspaceType colorspace,ExceptionInfo *exception)
+{
   CacheView
     *image_view;
 
-  cl_command_queue
-    queue;
+  cl_float
+    bright,
+    hue,
+    saturation;
 
   cl_context
     context;
 
+  cl_command_queue
+    queue;
+
   cl_int
+    color,
     clStatus;
 
-  cl_mem_flags
-    mem_flags;
+  cl_kernel
+    modulateKernel;
 
   cl_mem
-    equalizeMapBuffer,
-    histogramBuffer,
     imageBuffer;
 
-  cl_kernel
-    equalizeKernel,
-    histogramKernel;
-
-  cl_uint4
-    *histogram;
-
-  FloatPixelPacket
-    white,
-    black,
-    intensity,
-    *map;
+  cl_mem_flags
+    mem_flags;
 
   MagickBooleanType
-    outputReady,
-    status;
+    outputReady;
 
   MagickCLEnv
     clEnv;
@@ -4913,31 +4411,15 @@ MagickExport MagickBooleanType ComputeEqualizeImage(Image *image,
   MagickSizeType
     length;
 
-  PixelPacket
-    *equalize_map;
-
   register ssize_t
     i;
 
-  size_t
-    global_work_size[2];
-
   void
-    *hostPtr,
     *inputPixels;
 
-  map=NULL;
-  histogram=NULL;
-  equalize_map=NULL;
   inputPixels = NULL;
   imageBuffer = NULL;
-  histogramBuffer = NULL;
-  equalizeMapBuffer = NULL;
-  histogramKernel = NULL; 
-  equalizeKernel = NULL; 
-  context = NULL;
-  queue = NULL;
-  outputReady = MagickFalse;
+  modulateKernel = NULL; 
 
   assert(image != (Image *) NULL);
   assert(image->signature == MagickCoreSignature);
@@ -4951,37 +4433,31 @@ MagickExport MagickBooleanType ComputeEqualizeImage(Image *image,
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
 
-  /*
-    Allocate and initialize histogram arrays.
-  */
-  histogram=(cl_uint4 *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*histogram));
-  if (histogram == (cl_uint4 *) NULL)
-      ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
-
-  /* reset histogram */
-  (void) ResetMagickMemory(histogram,0,(MaxMap+1)*sizeof(*histogram));
+  outputReady = MagickFalse;
 
-  /* Create and initialize OpenCL buffers. */
-  /* inputPixels = AcquirePixelCachePixels(image, &length, exception); */
-  /* assume this  will get a writable image */
+  /* Create and initialize OpenCL buffers.
+   inputPixels = AcquirePixelCachePixels(image, &length, exception);
+   assume this  will get a writable image
+   */
   image_view=AcquireAuthenticCacheView(image,exception);
   inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-
   if (inputPixels == (void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
+
   /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
+   then use the host buffer directly from the GPU; otherwise, 
+   create a buffer on the GPU and copy the data over
+   */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
   else 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
   }
   /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
@@ -4992,313 +4468,82 @@ MagickExport MagickBooleanType ComputeEqualizeImage(Image *image,
     goto cleanup;
   }
 
-  /* If the host pointer is aligned to the size of cl_uint, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-  if (ALIGNED(histogram,cl_uint4)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-    hostPtr = histogram;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-    hostPtr = histogram;
-  }
-  /* create a CL buffer for histogram  */
-  length = (MaxMap+1); 
-  histogramBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(cl_uint4), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
+  modulateKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Modulate");
+  if (modulateKernel == NULL)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  status = LaunchHistogramKernel(clEnv, queue, imageBuffer, histogramBuffer, image, channel, exception);
-  if (status == MagickFalse)
-    goto cleanup;
+  bright=percent_brightness;
+  hue=percent_hue;
+  saturation=percent_saturation;
+  color=colorspace;
 
-  /* read from the kenel output */
-  if (ALIGNED(histogram,cl_uint4)) 
-  {
-    length = (MaxMap+1); 
-    clEnv->library->clEnqueueMapBuffer(queue, histogramBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(cl_uint4), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = (MaxMap+1); 
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, histogramBuffer, CL_TRUE, 0, length * sizeof(cl_uint4), histogram, 0, NULL, NULL);
-  }
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&bright);
+  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&hue);
+  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&saturation);
+  clStatus|=clEnv->library->clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&color);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    printf("no kernel\n");
     goto cleanup;
   }
 
-  /* unmap, don't block gpu to use this buffer again.  */
-  if (ALIGNED(histogram,cl_uint4))
   {
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, histogramBuffer, histogram, 0, NULL, NULL);
+    size_t global_work_size[2];
+    global_work_size[0] = image->columns;
+    global_work_size[1] = image->rows;
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, modulateKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
       goto cleanup;
     }
+    clEnv->library->clFlush(queue);
   }
 
-  /* recreate input buffer later, in case image updated */
-#ifdef RECREATEBUFFER 
-  if (imageBuffer!=NULL)                     
-    clEnv->library->clReleaseMemObject(imageBuffer);
-#endif
-  /* CPU stuff */
-  equalize_map=(PixelPacket *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*equalize_map));
-  if (equalize_map == (PixelPacket *) NULL)
-    ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
-
-  map=(FloatPixelPacket *) AcquireQuantumMemory(MaxMap+1UL,sizeof(*map));
-  if (map == (FloatPixelPacket *) NULL)
-    ThrowBinaryException(ResourceLimitWarning,"MemoryAllocationFailed", image->filename);
-
-  /*
-    Integrate the histogram to get the equalization map.
-  */
-  (void) ResetMagickMemory(&intensity,0,sizeof(intensity));
-  for (i=0; i <= (ssize_t) MaxMap; i++)
-  {
-    if ((channel & SyncChannels) != 0)
-    {
-      intensity.red+=histogram[i].s[2];
-      map[i]=intensity;
-      continue;
-    }
-    if ((channel & RedChannel) != 0)
-      intensity.red+=histogram[i].s[2];
-    if ((channel & GreenChannel) != 0)
-      intensity.green+=histogram[i].s[1];
-    if ((channel & BlueChannel) != 0)
-      intensity.blue+=histogram[i].s[0];
-    if ((channel & OpacityChannel) != 0)
-      intensity.alpha+=histogram[i].s[3];
-    /*
-    if (((channel & IndexChannel) != 0) &&
-        (image->colorspace == CMYKColorspace))
-    {
-      intensity.index+=histogram[i].index; 
-    }
-    */
-    map[i]=intensity;
-  }
-  black=map[0];
-  white=map[(int) MaxMap];
-  (void) ResetMagickMemory(equalize_map,0,(MaxMap+1)*sizeof(*equalize_map));
-  for (i=0; i <= (ssize_t) MaxMap; i++)
-  {
-    if ((channel & SyncChannels) != 0)
-    {
-      if (white.red != black.red)
-        equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-                (map[i].red-black.red))/(white.red-black.red)));
-      continue;
-    }
-    if (((channel & RedChannel) != 0) && (white.red != black.red))
-      equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-              (map[i].red-black.red))/(white.red-black.red)));
-    if (((channel & GreenChannel) != 0) && (white.green != black.green))
-      equalize_map[i].green=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-              (map[i].green-black.green))/(white.green-black.green)));
-    if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
-      equalize_map[i].blue=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-              (map[i].blue-black.blue))/(white.blue-black.blue)));
-    if (((channel & OpacityChannel) != 0) && (white.alpha != black.alpha))
-      equalize_map[i].alpha=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-              (map[i].alpha-black.alpha))/(white.alpha-black.alpha)));
-    /*
-    if ((((channel & IndexChannel) != 0) &&
-          (image->colorspace == CMYKColorspace)) &&
-        (white.index != black.index))
-      equalize_map[i].index=ScaleMapToQuantum((MagickRealType) ((MaxMap*
-              (map[i].index-black.index))/(white.index-black.index)));
-    */
-  }
-
-  if (image->storage_class == PseudoClass)
-  {
-    /*
-       Equalize colormap.
-       */
-    for (i=0; i < (ssize_t) image->colors; i++)
-    {
-      if ((channel & SyncChannels) != 0)
-      {
-        if (white.red != black.red)
-        {
-          image->colormap[i].red=equalize_map[
-            ScaleQuantumToMap(image->colormap[i].red)].red;
-          image->colormap[i].green=equalize_map[
-            ScaleQuantumToMap(image->colormap[i].green)].red;
-          image->colormap[i].blue=equalize_map[
-            ScaleQuantumToMap(image->colormap[i].blue)].red;
-          image->colormap[i].alpha=equalize_map[
-            ScaleQuantumToMap(image->colormap[i].alpha)].red;
-        }
-        continue;
-      }
-      if (((channel & RedChannel) != 0) && (white.red != black.red))
-        image->colormap[i].red=equalize_map[
-          ScaleQuantumToMap(image->colormap[i].red)].red;
-      if (((channel & GreenChannel) != 0) && (white.green != black.green))
-        image->colormap[i].green=equalize_map[
-          ScaleQuantumToMap(image->colormap[i].green)].green;
-      if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
-        image->colormap[i].blue=equalize_map[
-          ScaleQuantumToMap(image->colormap[i].blue)].blue;
-      if (((channel & OpacityChannel) != 0) &&
-          (white.alpha != black.alpha))
-        image->colormap[i].alpha=equalize_map[
-          ScaleQuantumToMap(image->colormap[i].alpha)].alpha;
-    }
-  }
-
-  /*
-    Equalize image.
-  */
-
-  /* GPU can work on this again, image and equalize map as input
-    image:        uchar4 (CLPixelPacket)
-    equalize_map: uchar4 (PixelPacket)
-    black, white: float4 (FloatPixelPacket) */
-
-#ifdef RECREATEBUFFER 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
   }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-#endif
 
-  /* Create and initialize OpenCL buffers. */
-  if (ALIGNED(equalize_map, PixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = equalize_map;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-    hostPtr = equalize_map;
-  }
-  /* create a CL buffer for eqaulize_map  */
-  length = (MaxMap+1); 
-  equalizeMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
-  /* get the OpenCL kernel */
-  equalizeKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Equalize");
-  if (equalizeKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* set the kernel arguments */
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(ChannelType),&channel);
-  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&equalizeMapBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&white);
-  clStatus|=clEnv->library->clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&black);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* launch the kernel */
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
-
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, equalizeKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  clEnv->library->clFlush(queue);
-
-  /* read the data back */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
 
   if (imageBuffer!=NULL)                     
     clEnv->library->clReleaseMemObject(imageBuffer);
-
-  if (map!=NULL)
-    map=(FloatPixelPacket *) RelinquishMagickMemory(map);
-
-  if (equalizeMapBuffer!=NULL)
-    clEnv->library->clReleaseMemObject(equalizeMapBuffer);
-  if (equalize_map!=NULL)
-    equalize_map=(PixelPacket *) RelinquishMagickMemory(equalize_map);
-
-  if (histogramBuffer!=NULL)                 
-    clEnv->library->clReleaseMemObject(histogramBuffer);
-  if (histogram!=NULL)
-    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
-
-  if (histogramKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, histogramKernel);
-  if (equalizeKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, equalizeKernel);
-
+  if (modulateKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, modulateKernel);
   if (queue != NULL)                          
     RelinquishOpenCLCommandQueue(clEnv, queue);
 
-  return(outputReady);
+  return outputReady;
+
 }
 
-MagickExport MagickBooleanType AccelerateEqualizeImage(Image *image,
-  const ChannelType channel,ExceptionInfo *exception)
+MagickExport MagickBooleanType AccelerateModulateImage(Image *image,
+  double percent_brightness,double percent_hue,double percent_saturation,
+  ColorspaceType colorspace,ExceptionInfo *exception)
 {
   MagickBooleanType
     status;
@@ -5307,11 +4552,13 @@ MagickExport MagickBooleanType AccelerateEqualizeImage(Image *image,
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse) ||
-      (checkHistogramCondition(image, channel) == MagickFalse))
+      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
     return(MagickFalse);
 
-  status=ComputeEqualizeImage(image,channel,exception);
+  if ((colorspace != HSLColorspace && colorspace != UndefinedColorspace))
+    return(MagickFalse);
+
+  status = ComputeModulateImage(image,percent_brightness, percent_hue, percent_saturation, colorspace, exception);
   return(status);
 }
 
@@ -5320,50 +4567,19 @@ MagickExport MagickBooleanType AccelerateEqualizeImage(Image *image,
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     C o n t r a s t S t r e t c h I m a g e  w i t h  O p e n C L           %
+%     A c c e l e r a t e M o t i o n B l u r I m a g e                       %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  ContrastStretchImage() is a simple image enhancement technique that attempts
-%  to improve the contrast in an image by `stretching' the range of intensity
-%  values it contains to span a desired range of values. It differs from the
-%  more sophisticated histogram equalization in that it can only apply a
-%  linear scaling function to the image pixel values.  As a result the
-%  `enhancement' is less harsh.
-%
-%  The format of the ContrastStretchImage method is:
-%
-%      MagickBooleanType ContrastStretchImage(Image *image,
-%        const char *levels)
-%      MagickBooleanType ContrastStretchImageChannel(Image *image,
-%        const size_t channel,const double black_point,
-%        const double white_point)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o channel: the channel.
-%
-%    o black_point: the black point.
-%
-%    o white_point: the white point.
-%
-%    o levels: Specify the levels where the black and white points have the
-%      range of 0 to number-of-pixels (e.g. 1%, 10x90%, etc.).
-%
 */
 
-MagickExport MagickBooleanType ComputeContrastStretchImageChannel(Image *image,
-  const ChannelType channel,const double black_point,const double white_point,
-  ExceptionInfo *exception) 
+static Image* ComputeMotionBlurImage(const Image *image,
+  const ChannelType channel,const double *kernel,const size_t width,
+  const OffsetInfo *offset,ExceptionInfo *exception)
 {
-#define ContrastStretchImageTag  "ContrastStretch/Image"
-#define MaxRange(color)  ((MagickRealType) ScaleQuantumToMap((Quantum) (color)))
-
   CacheView
+    *filteredImage_view,
     *image_view;
 
   cl_command_queue
@@ -5372,117 +4588,89 @@ MagickExport MagickBooleanType ComputeContrastStretchImageChannel(Image *image,
   cl_context
     context;
 
+  cl_float4
+    biasPixel;
+
   cl_int
     clStatus;
 
-  cl_mem_flags
-    mem_flags;
+  cl_kernel
+    motionBlurKernel;
 
   cl_mem
-    histogramBuffer,
+    filteredImageBuffer,
     imageBuffer,
-    stretchMapBuffer;
+    imageKernelBuffer, 
+    offsetBuffer;
 
-  cl_kernel
-    histogramKernel,
-    stretchKernel;
+  cl_mem_flags
+    mem_flags;
 
-  cl_uint4
-    *histogram;
+  const void
+    *inputPixels;
 
-  double
-    intensity;
+  float
+    *kernelBufferPtr;
 
-  FloatPixelPacket
-    black,
-    white;
+  Image
+    *filteredImage;
+
+  int
+    *offsetBufferPtr;
 
   MagickBooleanType
-    outputReady,
-    status;
+    outputReady;
 
   MagickCLEnv
-    clEnv;
+   clEnv;
+
+  PixelInfo
+    bias;
 
   MagickSizeType
     length;
 
-  PixelPacket
-    *stretch_map;
-
-  register ssize_t
-    i;
-
   size_t
-    global_work_size[2];
+    global_work_size[2],
+    local_work_size[2];
+
+  unsigned int
+    i,
+    imageHeight,
+    imageWidth,
+    matte;
 
   void
-    *hostPtr,
-    *inputPixels;
+    *filteredPixels,
+    *hostPtr;
 
-  histogram=NULL;
-  stretch_map=NULL;
-  inputPixels = NULL;
-  imageBuffer = NULL;
-  histogramBuffer = NULL;
-  stretchMapBuffer = NULL;
-  histogramKernel = NULL; 
-  stretchKernel = NULL; 
+  outputReady = MagickFalse;
   context = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  motionBlurKernel = NULL;
   queue = NULL;
-  outputReady = MagickFalse;
-
-
-  assert(image != (Image *) NULL);
-  assert(image->signature == MagickCoreSignature);
-  if (image->debug != MagickFalse)
-    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",image->filename);
 
-  //exception=(&image->exception);
-
-  /*
-   * initialize opencl env
-   */
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-
-  /*
-    Allocate and initialize histogram arrays.
-  */
-  histogram=(cl_uint4 *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*histogram));
-
-  if (histogram == (cl_uint4 *) NULL)
-    ThrowBinaryException(ResourceLimitError,"MemoryAllocationFailed", image->filename);
-  /* reset histogram */
-  (void) ResetMagickMemory(histogram,0,(MaxMap+1)*sizeof(*histogram));
-
-  /*
-  if (IsGrayImage(image,exception) != MagickFalse)
-    (void) SetImageColorspace(image,GRAYColorspace);
-  */
-
-  status=MagickTrue;
 
-
-  /*
-    Form histogram.
-  */
   /* Create and initialize OpenCL buffers. */
-  /* inputPixels = AcquirePixelCachePixels(image, &length, exception); */
-  /* assume this  will get a writable image */
-  image_view=AcquireAuthenticCacheView(image,exception);
-  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
 
-  if (inputPixels == (void *) NULL)
+  image_view=AcquireVirtualCacheView(image,exception);
+  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (const void *) NULL)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheError,
+      "UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
+
+  // If the host pointer is aligned to the size of CLPixelPacket, 
+  // then use the host buffer directly from the GPU; otherwise, 
+  // create a buffer on the GPU and copy the data over
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
@@ -5491,459 +4679,244 @@ MagickExport MagickBooleanType ComputeContrastStretchImageChannel(Image *image,
   {
     mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
   }
-  /* create a CL buffer from image pixel buffer */
+  // create a CL buffer from image pixel buffer
   length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
+    length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) ThrowMagickException(exception, GetMagickModule(),
+      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
-  /* If the host pointer is aligned to the size of cl_uint, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-  if (ALIGNED(histogram,cl_uint4)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-    hostPtr = histogram;
-  }
-  else 
+
+  filteredImage = CloneImage(image,image->columns,image->rows,
+    MagickTrue,exception);
+  assert(filteredImage != NULL);
+  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-    hostPtr = histogram;
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
   }
-  /* create a CL buffer for histogram  */
-  length = (MaxMap+1); 
-  histogramBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(cl_uint4), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
+  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+  if (filteredPixels == (void *) NULL)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheError, 
+      "UnableToReadPixelCache.","`%s'",filteredImage->filename);
     goto cleanup;
   }
 
-  status = LaunchHistogramKernel(clEnv, queue, imageBuffer, histogramBuffer, image, channel, exception);
-  if (status == MagickFalse)
-    goto cleanup;
-
-  /* read from the kenel output */
-  if (ALIGNED(histogram,cl_uint4)) 
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    length = (MaxMap+1); 
-    clEnv->library->clEnqueueMapBuffer(queue, histogramBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(cl_uint4), 0, NULL, NULL, &clStatus);
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
   }
   else 
   {
-    length = (MaxMap+1); 
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, histogramBuffer, CL_TRUE, 0, length * sizeof(cl_uint4), histogram, 0, NULL, NULL);
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
   }
+  // create a CL buffer from image pixel buffer
+  length = image->columns * image->rows;
+  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
+    length * sizeof(CLPixelPacket), hostPtr, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
-  /* unmap, don't block gpu to use this buffer again.  */
-  if (ALIGNED(histogram,cl_uint4))
+
+  imageKernelBuffer = clEnv->library->clCreateBuffer(context, 
+    CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, width * sizeof(float), NULL,
+    &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, histogramBuffer, histogram, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
-      goto cleanup;
-    }
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
   }
 
-  /* recreate input buffer later, in case image updated */
-#ifdef RECREATEBUFFER 
-  if (imageBuffer!=NULL)                     
-    clEnv->library->clReleaseMemObject(imageBuffer);
-#endif
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, 
+    CL_TRUE, CL_MAP_WRITE, 0, width * sizeof(float), 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  for (i = 0; i < width; i++)
+  {
+    kernelBufferPtr[i] = (float) kernel[i];
+  }
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr,
+    0, NULL, NULL);
+ if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, 
+      "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  /* CPU stuff */
-  /*
-     Find the histogram boundaries by locating the black/white levels.
-  */
-  black.red=0.0;
-  white.red=MaxRange(QuantumRange);
-  if ((channel & RedChannel) != 0)
+  offsetBuffer = clEnv->library->clCreateBuffer(context, 
+    CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, width * sizeof(cl_int2), NULL,
+    &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    intensity=0.0;
-    for (i=0; i <= (ssize_t) MaxMap; i++)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > black_point)
-        break;
-    }
-    black.red=(MagickRealType) i;
-    intensity=0.0;
-    for (i=(ssize_t) MaxMap; i != 0; i--)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > ((double) image->columns*image->rows-white_point))
-        break;
-    }
-    white.red=(MagickRealType) i;
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
   }
-  black.green=0.0;
-  white.green=MaxRange(QuantumRange);
-  if ((channel & GreenChannel) != 0)
+
+  offsetBufferPtr = (int*)clEnv->library->clEnqueueMapBuffer(queue, offsetBuffer, CL_TRUE, 
+    CL_MAP_WRITE, 0, width * sizeof(cl_int2), 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    intensity=0.0;
-    for (i=0; i <= (ssize_t) MaxMap; i++)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > black_point)
-        break;
-    }
-    black.green=(MagickRealType) i;
-    intensity=0.0;
-    for (i=(ssize_t) MaxMap; i != 0; i--)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > ((double) image->columns*image->rows-white_point))
-        break;
-    }
-    white.green=(MagickRealType) i;
+    (void) ThrowMagickException(exception, GetMagickModule(), 
+      ResourceLimitError, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    goto cleanup;
   }
-  black.blue=0.0;
-  white.blue=MaxRange(QuantumRange);
-  if ((channel & BlueChannel) != 0)
+  for (i = 0; i < width; i++)
   {
-    intensity=0.0;
-    for (i=0; i <= (ssize_t) MaxMap; i++)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > black_point)
-        break;
-    }
-    black.blue=(MagickRealType) i;
-    intensity=0.0;
-    for (i=(ssize_t) MaxMap; i != 0; i--)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > ((double) image->columns*image->rows-white_point))
-        break;
-    }
-    white.blue=(MagickRealType) i;
+    offsetBufferPtr[2*i] = (int)offset[i].x;
+    offsetBufferPtr[2*i+1] = (int)offset[i].y;
   }
-  black.alpha=0.0;
-  white.alpha=MaxRange(QuantumRange);
-  if ((channel & OpacityChannel) != 0)
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, offsetBuffer, offsetBufferPtr, 0, 
+    NULL, NULL);
+ if (clStatus != CL_SUCCESS)
   {
-    intensity=0.0;
-    for (i=0; i <= (ssize_t) MaxMap; i++)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > black_point)
-        break;
-    }
-    black.alpha=(MagickRealType) i;
-    intensity=0.0;
-    for (i=(ssize_t) MaxMap; i != 0; i--)
-    {
-      intensity+=histogram[i].s[2];
-      if (intensity > ((double) image->columns*image->rows-white_point))
-        break;
-    }
-    white.alpha=(MagickRealType) i;
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
+      "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
   }
-  /*
-  black.index=0.0;
-  white.index=MaxRange(QuantumRange);
-  if (((channel & IndexChannel) != 0) && (image->colorspace == CMYKColorspace))
+
+
+ // get the OpenCL kernel
+  motionBlurKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, 
+    "MotionBlur");
+  if (motionBlurKernel == NULL)
   {
-    intensity=0.0;
-    for (i=0; i <= (ssize_t) MaxMap; i++)
-    {
-      intensity+=histogram[i].index;
-      if (intensity > black_point)
-        break;
-    }
-    black.index=(MagickRealType) i;
-    intensity=0.0;
-    for (i=(ssize_t) MaxMap; i != 0; i--)
-    {
-      intensity+=histogram[i].index;
-      if (intensity > ((double) image->columns*image->rows-white_point))
-        break;
-    }
-    white.index=(MagickRealType) i;
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
+      "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
   }
-  */
+  
+  // set the kernel arguments
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
+    (void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
+    (void *)&filteredImageBuffer);
+  imageWidth = (unsigned int) image->columns;
+  imageHeight = (unsigned int) image->rows;
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
+    &imageWidth);
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
+    &imageHeight);
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
+    (void *)&imageKernelBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
+    &width);
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
+    (void *)&offsetBuffer);
 
+  GetPixelInfo(image,&bias);
+  biasPixel.s[0] = bias.red;
+  biasPixel.s[1] = bias.green;
+  biasPixel.s[2] = bias.blue;
+  biasPixel.s[3] = bias.alpha;
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_float4), &biasPixel);
 
-  stretch_map=(PixelPacket *) AcquireQuantumMemory(MaxMap+1UL,
-    sizeof(*stretch_map));
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(ChannelType), &channel);
+  matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
+  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int), &matte);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
+      "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  if (stretch_map == (PixelPacket *) NULL)
-    ThrowBinaryException(ResourceLimitError,"MemoryAllocationFailed",
-      image->filename);
-  /*
-    Stretch the histogram to create the stretched image mapping.
-  */
-  (void) ResetMagickMemory(stretch_map,0,(MaxMap+1)*sizeof(*stretch_map));
-  for (i=0; i <= (ssize_t) MaxMap; i++)
+  // launch the kernel
+  local_work_size[0] = 16;
+  local_work_size[1] = 16;
+  global_work_size[0] = (size_t)padGlobalWorkgroupSizeToLocalWorkgroupSize(
+                                (unsigned int) image->columns,(unsigned int) local_work_size[0]);
+  global_work_size[1] = (size_t)padGlobalWorkgroupSizeToLocalWorkgroupSize(
+                                (unsigned int) image->rows,(unsigned int) local_work_size[1]);
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, motionBlurKernel, 2, NULL, 
+    global_work_size, local_work_size, 0, NULL, NULL);
+
+  if (clStatus != CL_SUCCESS)
   {
-    if ((channel & RedChannel) != 0)
-    {
-      if (i < (ssize_t) black.red)
-        stretch_map[i].red=(Quantum) 0;
-      else
-        if (i > (ssize_t) white.red)
-          stretch_map[i].red=QuantumRange;
-        else
-          if (black.red != white.red)
-            stretch_map[i].red=ScaleMapToQuantum((MagickRealType) (MaxMap*
-                  (i-black.red)/(white.red-black.red)));
-    }
-    if ((channel & GreenChannel) != 0)
-    {
-      if (i < (ssize_t) black.green)
-        stretch_map[i].green=0;
-      else
-        if (i > (ssize_t) white.green)
-          stretch_map[i].green=QuantumRange;
-        else
-          if (black.green != white.green)
-            stretch_map[i].green=ScaleMapToQuantum((MagickRealType) (MaxMap*
-                  (i-black.green)/(white.green-black.green)));
-    }
-    if ((channel & BlueChannel) != 0)
-    {
-      if (i < (ssize_t) black.blue)
-        stretch_map[i].blue=0;
-      else
-        if (i > (ssize_t) white.blue)
-          stretch_map[i].blue= QuantumRange;
-        else
-          if (black.blue != white.blue)
-            stretch_map[i].blue=ScaleMapToQuantum((MagickRealType) (MaxMap*
-                  (i-black.blue)/(white.blue-black.blue)));
-    }
-    if ((channel & OpacityChannel) != 0)
-    {
-      if (i < (ssize_t) black.alpha)
-        stretch_map[i].alpha=0;
-      else
-        if (i > (ssize_t) white.alpha)
-          stretch_map[i].alpha=QuantumRange;
-        else
-          if (black.alpha != white.alpha)
-            stretch_map[i].alpha=ScaleMapToQuantum((MagickRealType) (MaxMap*
-                  (i-black.alpha)/(white.alpha-black.alpha)));
-    }
-    /*
-    if (((channel & IndexChannel) != 0) &&
-        (image->colorspace == CMYKColorspace))
-    {
-      if (i < (ssize_t) black.index)
-        stretch_map[i].index=0;
-      else
-        if (i > (ssize_t) white.index)
-          stretch_map[i].index=QuantumRange;
-        else
-          if (black.index != white.index)
-            stretch_map[i].index=ScaleMapToQuantum((MagickRealType) (MaxMap*
-                  (i-black.index)/(white.index-black.index)));
-    }
-    */
-  }
-
-  /*
-    Stretch the image.
-  */
-  if (((channel & OpacityChannel) != 0) || (((channel & IndexChannel) != 0) &&
-      (image->colorspace == CMYKColorspace)))
-    image->storage_class=DirectClass;
-  if (image->storage_class == PseudoClass)
-  {
-    /*
-       Stretch colormap.
-       */
-    for (i=0; i < (ssize_t) image->colors; i++)
-    {
-      if ((channel & RedChannel) != 0)
-      {
-        if (black.red != white.red)
-          image->colormap[i].red=stretch_map[
-            ScaleQuantumToMap(image->colormap[i].red)].red;
-      }
-      if ((channel & GreenChannel) != 0)
-      {
-        if (black.green != white.green)
-          image->colormap[i].green=stretch_map[
-            ScaleQuantumToMap(image->colormap[i].green)].green;
-      }
-      if ((channel & BlueChannel) != 0)
-      {
-        if (black.blue != white.blue)
-          image->colormap[i].blue=stretch_map[
-            ScaleQuantumToMap(image->colormap[i].blue)].blue;
-      }
-      if ((channel & OpacityChannel) != 0)
-      {
-        if (black.alpha != white.alpha)
-          image->colormap[i].alpha=stretch_map[
-            ScaleQuantumToMap(image->colormap[i].alpha)].alpha;
-      }
-    }
-  }
-
-  /*
-    Stretch image.
-  */
-
-
-  /* GPU can work on this again, image and equalize map as input
-    image:        uchar4 (CLPixelPacket)
-    stretch_map:  uchar4 (PixelPacket)
-    black, white: float4 (FloatPixelPacket) */
-
-#ifdef RECREATEBUFFER 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-#endif
-
-  /* Create and initialize OpenCL buffers. */
-  if (ALIGNED(stretch_map, PixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = stretch_map;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-    hostPtr = stretch_map;
-  }
-  /* create a CL buffer for stretch_map  */
-  length = (MaxMap+1); 
-  stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  /* get the OpenCL kernel */
-  stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Stretch");
-  if (stretchKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* set the kernel arguments */
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* launch the kernel */
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
-
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
+      "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
     goto cleanup;
   }
   clEnv->library->clFlush(queue);
 
-  /* read the data back */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, 
+      CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, 
+      NULL, &clStatus);
   }
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, 
+      length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
+      "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
 cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (imageBuffer!=NULL)                     
-    clEnv->library->clReleaseMemObject(imageBuffer);
-
-  if (stretchMapBuffer!=NULL)
-    clEnv->library->clReleaseMemObject(stretchMapBuffer);
-  if (stretch_map!=NULL)
-    stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map);
-
-
-  if (histogramBuffer!=NULL)
-    clEnv->library->clReleaseMemObject(histogramBuffer);
-  if (histogram!=NULL)
-    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
-
-
-  if (histogramKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, histogramKernel);
-  if (stretchKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, stretchKernel);
-
-  if (queue != NULL)                          
-    RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (filteredImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (imageBuffer!=NULL)     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (imageKernelBuffer!=NULL)    clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (motionBlurKernel!=NULL)  RelinquishOpenCLKernel(clEnv, motionBlurKernel);
+  if (queue != NULL)           RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse && filteredImage != NULL)
+    filteredImage=DestroyImage(filteredImage);
 
-  return(outputReady);
+  return(filteredImage);
 }
 
-MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
-  Image *image,const ChannelType channel,const double black_point,
-  const double white_point,ExceptionInfo *exception)
+MagickExport Image *AccelerateMotionBlurImage(const Image *image,
+  const ChannelType channel,const double* kernel,const size_t width,
+  const OffsetInfo *offset,ExceptionInfo *exception) /* OpenCL front end: validates, then defers to ComputeMotionBlurImage */
 {
-  MagickBooleanType
-    status;
+  Image
+    *filteredImage;
 
   assert(image != NULL);
+  assert(kernel != (double *) NULL);
+  assert(offset != (OffsetInfo *) NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse) ||
-      (checkHistogramCondition(image, channel) == MagickFalse))
-    return(MagickFalse);
+      (checkAccelerateCondition(image, channel) == MagickFalse))
+    return NULL; /* OpenCL environment or image/channel check failed: no accelerated result */
 
-  status=ComputeContrastStretchImageChannel(image,channel, black_point, white_point, exception);
-  return(status);
+  filteredImage=ComputeMotionBlurImage(image, channel, kernel, width,
+    offset, exception);
+  return(filteredImage); /* ComputeMotionBlurImage's result, passed through unchanged */
 }
 
 /*
@@ -5951,40 +4924,75 @@ MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     D e s p e c k l e I m a g e  w i t h  O p e n C L                       %
+%     A c c e l e r a t e R a n d o m I m a g e                               %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  DespeckleImage() reduces the speckle noise in an image while perserving the
-%  edges of the original image.  A speckle removing filter uses a complementary 
-%  hulling technique (raising pixels that are darker than their surrounding
-%  neighbors, then complementarily lowering pixels that are brighter than their
-%  surrounding neighbors) to reduce the speckle index of that image (reference
-%  Crimmins speckle removal).
-%
-%  The format of the DespeckleImage method is:
-%
-%      Image *DespeckleImage(const Image *image,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o exception: return any errors or warnings in this structure.
-%
 */
 
-static Image *ComputeDespeckleImage(const Image *image,
-  ExceptionInfo*exception)
+static MagickBooleanType LaunchRandomImageKernel(MagickCLEnv clEnv,
+  cl_command_queue queue,cl_mem imageBuffer,const unsigned int imageColumns,
+  const unsigned int imageRows,cl_mem seedBuffer,
+  const unsigned int numGenerators,ExceptionInfo *exception)
 {
-  static const int 
-    X[4] = {0, 1, 1,-1},
-    Y[4] = {1, 0, 1, 1};
+  int
+    k;
+
+  cl_int
+    clStatus;
+
+  cl_kernel
+    randomImageKernel;
+
+  MagickBooleanType
+    status;
+
+  size_t
+    global_work_size,
+    local_work_size;
+
+  /* Binds the six "RandomNumberGenerator" kernel arguments and enqueues a 1-D
+     launch of numGenerators work-items; MagickTrue only if it was enqueued. */
+  status = MagickFalse;
+  randomImageKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "RandomNumberGenerator");
+  if (randomImageKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  k = 0;
+  clStatus=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_mem),(void*)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_uint),(void*)&imageColumns);
+  clStatus|=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_uint),(void*)&imageRows);
+  clStatus|=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_mem),(void*)&seedBuffer);
+  {
+    const float randNormNumerator = 1.0f;
+    const unsigned int randNormDenominator = (unsigned int)(~0UL);
+    clStatus|=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(float),(void*)&randNormNumerator);
+    clStatus|=clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_uint),(void*)&randNormDenominator);
+  }
+  /* Launch only when every argument was accepted; one report covers both steps. */
+  global_work_size = numGenerators;
+  local_work_size = 64;
+  if (clStatus == CL_SUCCESS)
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue,randomImageKernel,1,NULL,&global_work_size,&local_work_size,0,NULL,NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg or clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  status = MagickTrue;
+
+cleanup:
+  if (randomImageKernel!=NULL) RelinquishOpenCLKernel(clEnv, randomImageKernel);
+  return(status);
+}
 
+static MagickBooleanType ComputeRandomImage(Image* image,
+  ExceptionInfo* exception)
+{
   CacheView
-    *filteredImage_view,
     *image_view;
 
   cl_command_queue
@@ -5996,30 +5004,19 @@ static Image *ComputeDespeckleImage(const Image *image,
   cl_int
     clStatus;
 
-  cl_kernel
-    hullPass1,
-    hullPass2;
+  /* Don't release this buffer in this function !!! */
+  cl_mem
+    randomNumberSeedsBuffer;
 
   cl_mem_flags
     mem_flags;
 
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    tempImageBuffer[2];
-
-  const void
-    *inputPixels;
-
-  Image
-    *filteredImage;
-
-  int
-    k,
-    matte;
+  cl_mem 
+   imageBuffer;
 
-  MagickBooleanType
-    outputReady;
+  MagickBooleanType 
+    outputReady,
+    status;
 
   MagickCLEnv
     clEnv;
@@ -6027,49 +5024,38 @@ static Image *ComputeDespeckleImage(const Image *image,
   MagickSizeType
     length;
 
-  size_t
-    global_work_size[2];
-
-  unsigned int
-    imageHeight,
-    imageWidth;
-
   void
-    *filteredPixels,
-    *hostPtr;
+    *inputPixels;
 
+  status = MagickFalse;
   outputReady = MagickFalse;
-  clEnv = NULL;
   inputPixels = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  filteredPixels = NULL;
   context = NULL;
   imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  hullPass1 = NULL;
-  hullPass2 = NULL;
   queue = NULL;
-  tempImageBuffer[0] = tempImageBuffer[1] = NULL;
+
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  image_view=AcquireVirtualCacheView(image,exception);
-  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+
+  /* Create and initialize OpenCL buffers. */
+  image_view=AcquireAuthenticCacheView(image,exception);
+  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
   if (inputPixels == (void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
 
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
   }
   else 
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
   }
   /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
@@ -6079,664 +5065,578 @@ static Image *ComputeDespeckleImage(const Image *image,
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
+  queue = AcquireOpenCLCommandQueue(clEnv);
 
-  mem_flags = CL_MEM_READ_WRITE;
-  length = image->columns * image->rows;
-  for (k = 0; k < 2; k++)
-  {
-    tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
-  }
-
-  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-  assert(filteredImage != NULL);
-  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+  randomNumberSeedsBuffer = GetAndLockRandSeedBuffer(clEnv);
+  if (randomNumberSeedsBuffer==NULL)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
+           ResourceLimitWarning, "Failed to get GPU random number generators.",
+           "'%s'", ".");
     goto cleanup;
   }
-  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-  if (filteredPixels == (void *) NULL)
+
+  status = LaunchRandomImageKernel(clEnv,queue,
+                                   imageBuffer,
+                                   (unsigned int) image->columns,
+                                   (unsigned int) image->rows,
+                                   randomNumberSeedsBuffer,
+                                   GetNumRandGenerators(clEnv),
+                                   exception);
+  if (status==MagickFalse)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
     goto cleanup;
   }
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = filteredPixels;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
-    mem_flags = CL_MEM_WRITE_ONLY;
-    hostPtr = NULL;
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
   }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
-  hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1");
-  hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2");
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
-  clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1));
-  imageWidth = (unsigned int) image->columns;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth);
-  imageHeight = (unsigned int) image->rows;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight);
-  matte = (image->alpha_trait == UndefinedPixelTrait)?0:1;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
+  image_view=DestroyCacheView(image_view);
 
-  clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1));
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer);
-  imageWidth = (unsigned int) image->columns;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth);
-  imageHeight = (unsigned int) image->rows;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight);
-  matte = (image->alpha_trait == UndefinedPixelTrait)?0:1;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
+  UnlockRandSeedBuffer(clEnv);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (queue != NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
+  return outputReady;
+}
 
+MagickExport MagickBooleanType AccelerateRandomImage(Image *image,
+  ExceptionInfo* exception) /* OpenCL front end: validates, then defers to ComputeRandomImage */
+{
+  MagickBooleanType
+    status;
 
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
 
-  
-  for (k = 0; k < 4; k++)
-  {
-    cl_int2 offset;
-    int polarity;
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+    return(MagickFalse); /* OpenCL environment or image check failed; ComputeRandomImage is not called */
 
-    
-    offset.s[0] = X[k];
-    offset.s[1] = Y[k];
-    polarity = 1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
+  status=ComputeRandomImage(image,exception);
+  return(status); /* ComputeRandomImage's result, passed through unchanged */
+}
 
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e R e s i z e I m a g e                               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
 
-    if (k == 0)
-      clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer));
-    offset.s[0] = -X[k];
-    offset.s[1] = -Y[k];
-    polarity = 1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
+static MagickBooleanType resizeHorizontalFilter(cl_mem image,
+  const unsigned int imageColumns,const unsigned int imageRows,
+  const unsigned int matte,cl_mem resizedImage,
+  const unsigned int resizedColumns,const unsigned int resizedRows,
+  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
+  const float xFactor,MagickCLEnv clEnv,cl_command_queue queue,
+  ExceptionInfo *exception)
+{
+  cl_kernel
+    horizontalKernel;
 
-    offset.s[0] = -X[k];
-    offset.s[1] = -Y[k];
-    polarity = -1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
+  cl_int clStatus;
 
-    offset.s[0] = X[k];
-    offset.s[1] = Y[k];
-    polarity = -1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+  const unsigned int
+    workgroupSize = 256;
 
-    if (k == 3)
-      clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer);
+  float
+    resizeFilterScale,
+    resizeFilterSupport,
+    resizeFilterWindowSupport,
+    resizeFilterBlur,
+    scale,
+    support;
 
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-  }
+  int
+    cacheRangeStart,
+    cacheRangeEnd,
+    numCachedPixels,
+    resizeFilterType,
+    resizeWindowType;
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  MagickBooleanType
+    status = MagickFalse;
+
+  size_t
+    deviceLocalMemorySize,
+    gammaAccumulatorLocalMemorySize,
+    global_work_size[2],
+    imageCacheLocalMemorySize,
+    pixelAccumulatorLocalMemorySize,
+    local_work_size[2],
+    totalLocalMemorySize,
+    weightAccumulatorLocalMemorySize;
+
+  unsigned int
+    chunkSize,
+    i,
+    pixelPerWorkgroup;
+
+  horizontalKernel = NULL;
+  status = MagickFalse;
+
+  /*
+  Apply filter to resize horizontally from image to resize image.
+  */
+  scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0);
+  support=scale*GetResizeFilterSupport(resizeFilter);
+  if (support < 0.5)
   {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    /*
+    Support too small even for nearest neighbour: Reduce to point
+    sampling.
+    */
+    support=(MagickRealType) 0.5;
+    scale=1.0;
   }
-  else 
+  scale=PerceptibleReciprocal(scale);
+
+  if (resizedColumns < workgroupSize) 
   {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    chunkSize = 32;
+    pixelPerWorkgroup = 32;
   }
-  if (clStatus != CL_SUCCESS)
+  else
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
+    chunkSize = workgroupSize;
+    pixelPerWorkgroup = workgroupSize;
   }
 
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
-
-  image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
+  /* get the local memory size supported by the device */
+  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
 
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  for (k = 0; k < 2; k++)
+DisableMSCWarning(4127)
+  while(1)
+RestoreMSCWarning
   {
-    if (tempImageBuffer[k]!=NULL)            clEnv->library->clReleaseMemObject(tempImageBuffer[k]);
-  }
-  if (filteredImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (hullPass1!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass1);
-  if (hullPass2!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass2);
-  if (outputReady == MagickFalse && filteredImage != NULL)
-    filteredImage=DestroyImage(filteredImage);
-  return(filteredImage);
-}
-
-MagickExport Image *AccelerateDespeckleImage(const Image* image,
-  ExceptionInfo* exception)
-{
-  Image
-    *filteredImage;
+    /* calculate the local memory size needed per workgroup */
+    cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5);
+    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5);
+    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
+    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
+    totalLocalMemorySize = imageCacheLocalMemorySize;
 
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
+    /* local size for the pixel accumulator */
+    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
+    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
 
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return NULL;
+    /* local memory size for the weight accumulator */
+    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
 
-  filteredImage=ComputeDespeckleImage(image,exception);
-  return(filteredImage);
-}
+    /* local memory size for the gamma accumulator */
+    if (matte == 0)
+      gammaAccumulatorLocalMemorySize = sizeof(float);
+    else
+      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
 
-static Image *ComputeAddNoiseImage(const Image *image,
-  const ChannelType channel,const NoiseType noise_type,
-  ExceptionInfo *exception)
-{
-  CacheView
-    *filteredImage_view,
-    *image_view;
+    if (totalLocalMemorySize <= deviceLocalMemorySize)
+      break;
+    else
+    {
+      pixelPerWorkgroup = pixelPerWorkgroup/2;
+      chunkSize = chunkSize/2;
+      if (pixelPerWorkgroup == 0
+          || chunkSize == 0)
+      {
+        /* quit, fall back to CPU */
+        goto cleanup;
+      }
+    }
+  }
 
-  cl_command_queue
-    queue;
+  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
+  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
 
-  cl_context
-    context;
 
-  cl_int
-    inputPixelCount,
-    pixelsPerWorkitem,
-    clStatus;
-
-  cl_uint
-    seed0,
-    seed1;
-
-  cl_kernel
-    addNoiseKernel;
-
-  cl_mem_flags
-    mem_flags;
-
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer;
-
-  const char
-    *option;
-
-  const void
-    *inputPixels;
-
-  float
-    attenuate;
+  if (resizeFilterType == SincFastWeightingFunction
+    && resizeWindowType == SincFastWeightingFunction)
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc");
+  }
+  else
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter");
+  }
+  if (horizontalKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  MagickBooleanType
-    outputReady;
+  i = 0;
+  clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
 
-  MagickCLEnv
-    clEnv;
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
 
-  MagickSizeType
-    length;
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
 
-  Image
-    *filteredImage;
+  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
 
-  RandomInfo
-    **restrict random_info;
+  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
 
-  size_t
-    global_work_size[1],
-    local_work_size[1];
+  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
 
-  unsigned int
-    k,
-    numRandomNumberPerPixel;
+  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
 
-#if defined(MAGICKCORE_OPENMP_SUPPORT)
-  unsigned long
-    key;
-#endif
 
-  void
-    *filteredPixels,
-    *hostPtr;
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize);
+  
 
-  outputReady = MagickFalse;
-  clEnv = NULL;
-  inputPixels = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  filteredPixels = NULL;
-  context = NULL;
-  imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  queue = NULL;
-  addNoiseKernel = NULL;
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
 
-  clEnv = GetDefaultOpenCLEnv();
-  context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  image_view=AcquireVirtualCacheView(image,exception);
-  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
+  if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
+  global_work_size[1] = resizedRows;
+
+  local_work_size[0] = workgroupSize;
+  local_work_size[1] = 1;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+  (void) local_work_size;
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
     goto cleanup;
   }
+  clEnv->library->clFlush(queue);
+  status = MagickTrue;
 
 
-  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-  assert(filteredImage != NULL);
-  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-  if (filteredPixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-    goto cleanup;
-  }
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel);
+
+  return(status);
+}
+
+static MagickBooleanType resizeVerticalFilter(cl_mem image,
+  const unsigned int imageColumns,const unsigned int imageRows,
+  const unsigned int matte,cl_mem resizedImage,
+  const unsigned int resizedColumns,const unsigned int resizedRows,
+  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
+  const float yFactor,MagickCLEnv clEnv,cl_command_queue queue,
+  ExceptionInfo *exception)
+{
+  cl_kernel
+    verticalKernel;
+
+  cl_int clStatus;
+
+  const unsigned int
+    workgroupSize = 256;
+
+  float
+    resizeFilterScale,
+    resizeFilterSupport,
+    resizeFilterWindowSupport,
+    resizeFilterBlur,
+    scale,
+    support;
+
+  int
+    cacheRangeStart,
+    cacheRangeEnd,
+    numCachedPixels,
+    resizeFilterType,
+    resizeWindowType;
+
+  MagickBooleanType
+    status = MagickFalse;
+
+  size_t
+    deviceLocalMemorySize,
+    gammaAccumulatorLocalMemorySize,
+    global_work_size[2],
+    imageCacheLocalMemorySize,
+    pixelAccumulatorLocalMemorySize,
+    local_work_size[2],
+    totalLocalMemorySize,
+    weightAccumulatorLocalMemorySize;
+
+  unsigned int
+    chunkSize,
+    i,
+    pixelPerWorkgroup;
+
+  verticalKernel = NULL;
+  status = MagickFalse;
+
+  /*
+  Apply filter to resize vertically from image to resize image.
+  */
+  scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0);
+  support=scale*GetResizeFilterSupport(resizeFilter);
+  if (support < 0.5)
   {
-    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = filteredPixels;
+    /*
+    Support too small even for nearest neighbour: Reduce to point
+    sampling.
+    */
+    support=(MagickRealType) 0.5;
+    scale=1.0;
   }
-  else 
+  scale=PerceptibleReciprocal(scale);
+
+  if (resizedRows < workgroupSize) 
   {
-    mem_flags = CL_MEM_WRITE_ONLY;
-    hostPtr = NULL;
+    chunkSize = 32;
+    pixelPerWorkgroup = 32;
   }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
+  else
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
+    chunkSize = workgroupSize;
+    pixelPerWorkgroup = workgroupSize;
   }
 
-  /* find out how many random numbers needed by pixel */
-  numRandomNumberPerPixel = 0;
+  /* get the local memory size supported by the device */
+  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+
+DisableMSCWarning(4127)
+  while(1)
+RestoreMSCWarning
   {
-    unsigned int numRandPerChannel = 0;
-    switch (noise_type)
-    {
-    case UniformNoise:
-    case ImpulseNoise:
-    case LaplacianNoise:
-    case RandomNoise:
-    default:
-      numRandPerChannel = 1;
-      break;
-    case GaussianNoise:
-    case MultiplicativeGaussianNoise:
-    case PoissonNoise:
-      numRandPerChannel = 2;
-      break;
-    };
-
-    if ((channel & RedChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & GreenChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & BlueChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & OpacityChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-  }
+    /* calculate the local memory size needed per workgroup */
+    cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5);
+    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5);
+    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
+    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
+    totalLocalMemorySize = imageCacheLocalMemorySize;
 
-  /* set up the random number generators */
-  attenuate=1.0;
-  option=GetImageArtifact(image,"attenuate");
-  if (option != (char *) NULL)
-    attenuate=StringToDouble(option,(char **) NULL);
-  random_info=AcquireRandomInfoThreadSet();
-#if defined(MAGICKCORE_OPENMP_SUPPORT)
-  key=GetRandomSecretKey(random_info[0]);
-  (void) key;
-#endif
+    /* local size for the pixel accumulator */
+    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
+    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
 
-  addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"GenerateNoiseImage");
+    /* local memory size for the weight accumulator */
+    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
 
-  {
-    cl_uint computeUnitCount;
-    cl_uint workItemCount;
-    clEnv->library->clGetDeviceInfo(clEnv->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &computeUnitCount, NULL);
-    workItemCount = computeUnitCount * 2 * 256;                        // 256 work items per group, 2 groups per CU
-    inputPixelCount = (cl_int) (image->columns * image->rows);
-    pixelsPerWorkitem = (inputPixelCount + workItemCount - 1) / workItemCount;
-    pixelsPerWorkitem = ((pixelsPerWorkitem + 3) / 4) * 4;
+    /* local memory size for the gamma accumulator */
+    if (matte == 0)
+      gammaAccumulatorLocalMemorySize = sizeof(float);
+    else
+      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
 
-    local_work_size[0] = 256;
-    global_work_size[0] = workItemCount;
-  }
-  {
-    RandomInfo* randomInfo = AcquireRandomInfo();
-       const unsigned long* s = GetRandomInfoSeed(randomInfo);
-       seed0 = s[0];
-       GetPseudoRandomValue(randomInfo);
-       seed1 = s[0];
-       randomInfo = DestroyRandomInfo(randomInfo);
+    if (totalLocalMemorySize <= deviceLocalMemorySize)
+      break;
+    else
+    {
+      pixelPerWorkgroup = pixelPerWorkgroup/2;
+      chunkSize = chunkSize/2;
+      if (pixelPerWorkgroup == 0
+          || chunkSize == 0)
+      {
+        /* quit, fall back to CPU */
+        goto cleanup;
+      }
+    }
   }
 
-  k = 0;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&inputPixelCount);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&pixelsPerWorkitem);  
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type);
-  attenuate=1.0f;
-  option=GetImageArtifact(image,"attenuate");
-  if (option != (char *) NULL)
-    attenuate=(float)StringToDouble(option,(char **) NULL);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed0);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed1);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel);
-
-  clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,1,NULL,global_work_size,NULL,0,NULL,NULL);
+  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
+  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
+  if (resizeFilterType == SincFastWeightingFunction
+    && resizeWindowType == SincFastWeightingFunction)
+    verticalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc");
   else 
+    verticalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter");
+
+  if (verticalKernel == NULL)
   {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  i = 0;
+  clStatus = clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&image);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&matte);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&yFactor);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
 
-  image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
 
-  if (queue!=NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (addNoiseKernel!=NULL)         RelinquishOpenCLKernel(clEnv, addNoiseKernel);
-  if (imageBuffer!=NULL)                   clEnv->library->clReleaseMemObject(imageBuffer);
-  if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (outputReady == MagickFalse && filteredImage != NULL) 
-    filteredImage=DestroyImage(filteredImage);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), (void*)&resizeFilterType);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), (void*)&resizeWindowType);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
 
-  return(filteredImage);
-}
+  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
 
+  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
 
-MagickExport Image *AccelerateAddNoiseImage(const Image *image,
-  const ChannelType channel,const NoiseType noise_type,
-  ExceptionInfo *exception) 
-{
-  Image
-    *filteredImage;
+  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
 
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
+  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
 
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse))
-    return NULL;
 
-  filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, imageCacheLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(int), &numCachedPixels);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, sizeof(unsigned int), &chunkSize);
   
-  return(filteredImage);
-}
-
-static MagickBooleanType LaunchRandomImageKernel(MagickCLEnv clEnv,
-  cl_command_queue queue,cl_mem imageBuffer,const unsigned int imageColumns,
-  const unsigned int imageRows,cl_mem seedBuffer,
-  const unsigned int numGenerators,ExceptionInfo *exception)
-{
-  int
-    k;
-
-  cl_int
-    clStatus;
-
-  cl_kernel
-    randomImageKernel;
-
-  MagickBooleanType
-    status;
 
-  size_t
-    global_work_size,
-    local_work_size;
-
-  status = MagickFalse;
-  randomImageKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "RandomImage");
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(verticalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
 
-  k = 0;
-  clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_mem),(void*)&imageBuffer);
-  clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_uint),(void*)&imageColumns);
-  clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_uint),(void*)&imageRows);
-  clEnv->library->clSetKernelArg(randomImageKernel,k++,sizeof(cl_mem),(void*)&seedBuffer);
+  if (clStatus != CL_SUCCESS)
   {
-    const float randNormNumerator = 1.0f;
-    const unsigned int randNormDenominator = (unsigned int)(~0UL);
-    clEnv->library->clSetKernelArg(randomImageKernel,k++,
-          sizeof(float),(void*)&randNormNumerator);
-    clEnv->library->clSetKernelArg(randomImageKernel,k++,
-          sizeof(cl_uint),(void*)&randNormDenominator);
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
   }
 
+  global_work_size[0] = resizedColumns;
+  global_work_size[1] = (resizedRows+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
 
-  global_work_size = numGenerators;
-  local_work_size = 64;
-
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue,randomImageKernel,1,NULL,&global_work_size,
-                                    &local_work_size,0,NULL,NULL);
-
+  local_work_size[0] = 1;
+  local_work_size[1] = workgroupSize;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, verticalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, 
-                                      "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
     goto cleanup;
   }
+  clEnv->library->clFlush(queue);
   status = MagickTrue;
 
+
 cleanup:
-  if (randomImageKernel!=NULL) RelinquishOpenCLKernel(clEnv, randomImageKernel);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+
+  if (verticalKernel != NULL) RelinquishOpenCLKernel(clEnv, verticalKernel);
+
   return(status);
 }
 
-static MagickBooleanType ComputeRandomImage(Image* image,
-  ExceptionInfo* exception)
+static Image *ComputeResizeImage(const Image* image,
+  const size_t resizedColumns,const size_t resizedRows,
+  const ResizeFilter *resizeFilter,ExceptionInfo *exception)
 {
   CacheView
+    *filteredImage_view,
     *image_view;
 
   cl_command_queue
     queue;
 
-  cl_context
-    context;
-
   cl_int
     clStatus;
 
-  /* Don't release this buffer in this function !!! */
+  cl_context
+    context;
+
   cl_mem
-    randomNumberSeedsBuffer;
+    cubicCoefficientsBuffer,
+    filteredImageBuffer,
+    imageBuffer,
+    tempImageBuffer;
 
   cl_mem_flags
     mem_flags;
 
-  cl_mem 
-   imageBuffer;
+  const double
+    *resizeFilterCoefficient;
 
-  MagickBooleanType 
-    outputReady,
-    status;
+  const void
+    *inputPixels;
 
-  MagickCLEnv
-    clEnv;
+  float
+    *mappedCoefficientBuffer,
+    xFactor,
+    yFactor;
+
+  MagickBooleanType
+    outputReady,
+    status;
+
+  MagickCLEnv
+    clEnv;
 
   MagickSizeType
     length;
 
+  Image
+    *filteredImage;
+
+  unsigned int
+    i,
+    matte;
+
   void
-    *inputPixels;
+    *filteredPixels,
+    *hostPtr;
 
-  status = MagickFalse;
   outputReady = MagickFalse;
-  inputPixels = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  clEnv = NULL;
   context = NULL;
   imageBuffer = NULL;
+  tempImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  cubicCoefficientsBuffer = NULL;
   queue = NULL;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
 
   /* Create and initialize OpenCL buffers. */
-  image_view=AcquireAuthenticCacheView(image,exception);
-  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
+  image_view=AcquireVirtualCacheView(image,exception);
+  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+  if (inputPixels == (const void *) NULL)
   {
     (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
@@ -6747,11 +5647,11 @@ static MagickBooleanType ComputeRandomImage(Image* image,
      create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
   }
   else 
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
   }
   /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
@@ -6761,82 +5661,211 @@ static MagickBooleanType ComputeRandomImage(Image* image,
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
+
+  cubicCoefficientsBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, 7 * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
   queue = AcquireOpenCLCommandQueue(clEnv);
+  mappedCoefficientBuffer = (float*)clEnv->library->clEnqueueMapBuffer(queue, cubicCoefficientsBuffer, CL_TRUE, CL_MAP_WRITE, 0, 7 * sizeof(float)
+          , 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  resizeFilterCoefficient = GetResizeFilterCoefficient(resizeFilter);
+  for (i = 0; i < 7; i++)
+  {
+    mappedCoefficientBuffer[i] = (float) resizeFilterCoefficient[i];
+  }
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, cubicCoefficientsBuffer, mappedCoefficientBuffer, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  randomNumberSeedsBuffer = GetAndLockRandSeedBuffer(clEnv);
-  if (randomNumberSeedsBuffer==NULL)
+  filteredImage = CloneImage(image,resizedColumns,resizedRows,MagickTrue,exception);
+  if (filteredImage == NULL)
+    goto cleanup;
+
+  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
-           ResourceLimitWarning, "Failed to get GPU random number generators.",
-           "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
     goto cleanup;
   }
 
-  status = LaunchRandomImageKernel(clEnv,queue,
-                                   imageBuffer,
-                                   (unsigned int) image->columns,
-                                   (unsigned int) image->rows,
-                                   randomNumberSeedsBuffer,
-                                   GetNumRandGenerators(clEnv),
-                                   exception);
-  if (status==MagickFalse)
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+
+  /* create a CL buffer from image pixel buffer */
+  length = filteredImage->columns * filteredImage->rows;
+  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  xFactor=(float) resizedColumns/(float) image->columns;
+  yFactor=(float) resizedRows/(float) image->rows;
+  matte=(image->alpha_trait != UndefinedPixelTrait)?1:0;
+  if (xFactor > yFactor)
   {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+
+    length = resizedColumns*image->rows;
+    tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+    
+    status = resizeHorizontalFilter(imageBuffer, (unsigned int) image->columns, (unsigned int) image->rows, matte
+          , tempImageBuffer, (unsigned int) resizedColumns, (unsigned int) image->rows
+          , resizeFilter, cubicCoefficientsBuffer
+          , xFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+    
+    status = resizeVerticalFilter(tempImageBuffer, (unsigned int) resizedColumns, (unsigned int) image->rows, matte
+       , filteredImageBuffer, (unsigned int) resizedColumns, (unsigned int) resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , yFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+  }
+  else
+  {
+    length = image->columns*resizedRows;
+    tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+    status = resizeVerticalFilter(imageBuffer, (unsigned int) image->columns, (unsigned int) image->rows, matte
+       , tempImageBuffer, (unsigned int) image->columns, (unsigned int) resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , yFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+
+    status = resizeHorizontalFilter(tempImageBuffer, (unsigned int) image->columns, (unsigned int) resizedRows, matte
+       , filteredImageBuffer, (unsigned int) resizedColumns, (unsigned int) resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , xFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+  }
+  length = resizedColumns*resizedRows;
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  UnlockRandSeedBuffer(clEnv);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (queue != NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
-  return outputReady;
+  if (imageBuffer!=NULL)                 clEnv->library->clReleaseMemObject(imageBuffer);
+  if (tempImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(tempImageBuffer);
+  if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (cubicCoefficientsBuffer!=NULL)      clEnv->library->clReleaseMemObject(cubicCoefficientsBuffer);
+  if (queue != NULL)                     RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse && filteredImage != NULL)
+    filteredImage=DestroyImage(filteredImage);
+  return(filteredImage);
 }
 
-MagickExport MagickBooleanType AccelerateRandomImage(Image *image,
-  ExceptionInfo* exception)
+static MagickBooleanType gpuSupportedResizeWeighting(
+  ResizeWeightingFunctionType f)
 {
-  MagickBooleanType
-    status;
+  unsigned int
+    index;
+
+  /* Scan the table for f; the LastWeightingFunction entry marks where the
+     scan stops, so that value itself is never reported as supported. */
+  for (index = 0; supportedResizeWeighting[index] != LastWeightingFunction; index++)
+  {
+    if (supportedResizeWeighting[index] == f)
+      return(MagickTrue);
+  }
+  return(MagickFalse);
+}
+
+MagickExport Image *AccelerateResizeImage(const Image *image,
+  const size_t resizedColumns,const size_t resizedRows,
+  const ResizeFilter *resizeFilter,ExceptionInfo *exception) 
+{
+  Image
+    *filteredImage;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
       (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return(MagickFalse);
+    return NULL;
 
-  status=ComputeRandomImage(image,exception);
-  return(status);
+  if (gpuSupportedResizeWeighting(GetResizeFilterWeightingType(resizeFilter)) == MagickFalse ||
+      gpuSupportedResizeWeighting(GetResizeFilterWindowWeightingType(resizeFilter)) == MagickFalse)
+    return NULL;
+  /* Both weighting types passed gpuSupportedResizeWeighting(); hand off to ComputeResizeImage(). */
+  filteredImage=ComputeResizeImage(image,resizedColumns,resizedRows,resizeFilter,exception);
+  return(filteredImage);
 }
 
-static Image* ComputeMotionBlurImage(const Image *image,
-  const ChannelType channel,const double *kernel,const size_t width, 
-  const OffsetInfo *offset,ExceptionInfo *exception)
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e R o t a t i o n a l B l u r I m a g e               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+static Image* ComputeRotationalBlurImage(const Image *image,
+  const ChannelType channel,const double angle,ExceptionInfo *exception)
 {
   CacheView
-    *filteredImage_view,
-    *image_view;
+    *image_view,
+    *filteredImage_view;
 
   cl_command_queue
     queue;
@@ -6844,41 +5873,45 @@ static Image* ComputeMotionBlurImage(const Image *image,
   cl_context
     context;
 
+  cl_float2
+    blurCenter;
+
   cl_float4
     biasPixel;
 
   cl_int
     clStatus;
 
-  cl_kernel
-    motionBlurKernel;
-
   cl_mem
+    cosThetaBuffer,
     filteredImageBuffer,
     imageBuffer,
-    imageKernelBuffer, 
-    offsetBuffer;
+    sinThetaBuffer;
 
   cl_mem_flags
     mem_flags;
 
+  cl_kernel
+    rotationalBlurKernel;
+
   const void
     *inputPixels;
 
   float
-    *kernelBufferPtr;
-
-  Image
-    *filteredImage;
+    blurRadius,
+    *cosThetaPtr,
+    offset,
+    *sinThetaPtr,
+    theta;
 
-  int
-    *offsetBufferPtr;
+  Image
+    *filteredImage;
 
   MagickBooleanType
     outputReady;
 
   MagickCLEnv
-   clEnv;
+    clEnv;
 
   PixelInfo
     bias;
@@ -6887,13 +5920,11 @@ static Image* ComputeMotionBlurImage(const Image *image,
     length;
 
   size_t
-    global_work_size[2],
-    local_work_size[2];
+    global_work_size[2];
 
   unsigned int
+    cossin_theta_size,
     i,
-    imageHeight,
-    imageWidth,
     matte;
 
   void
@@ -6906,27 +5937,29 @@ static Image* ComputeMotionBlurImage(const Image *image,
   filteredImage_view = NULL;
   imageBuffer = NULL;
   filteredImageBuffer = NULL;
-  imageKernelBuffer = NULL;
-  motionBlurKernel = NULL;
+  sinThetaBuffer = NULL;
+  cosThetaBuffer = NULL;
   queue = NULL;
+  rotationalBlurKernel = NULL;
+
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
 
+
   /* Create and initialize OpenCL buffers. */
 
   image_view=AcquireVirtualCacheView(image,exception);
   inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
   if (inputPixels == (const void *) NULL)
   {
-    (void) ThrowMagickException(exception,GetMagickModule(),CacheError,
-      "UnableToReadPixelCache.","`%s'",image->filename);
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
     goto cleanup;
   }
 
-  // If the host pointer is aligned to the size of CLPixelPacket, 
-  // then use the host buffer directly from the GPU; otherwise, 
-  // create a buffer on the GPU and copy the data over
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
   if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
@@ -6935,33 +5968,28 @@ static Image* ComputeMotionBlurImage(const Image *image,
   {
     mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
   }
-  // create a CL buffer from image pixel buffer
+  /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
-    length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(),
-      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
 
-  filteredImage = CloneImage(image,image->columns,image->rows,
-    MagickTrue,exception);
+  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
   assert(filteredImage != NULL);
   if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "CloneImage failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
     goto cleanup;
   }
   filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
   filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
   if (filteredPixels == (void *) NULL)
   {
-    (void) ThrowMagickException(exception,GetMagickModule(),CacheError, 
-      "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
     goto cleanup;
   }
 
@@ -6975,276 +6003,938 @@ static Image* ComputeMotionBlurImage(const Image *image,
     mem_flags = CL_MEM_WRITE_ONLY;
     hostPtr = NULL;
   }
-  // create a CL buffer from image pixel buffer
+  /* create a CL buffer from image pixel buffer */
   length = image->columns * image->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
-    length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
+  blurCenter.s[0] = (float) (image->columns-1)/2.0;
+  blurCenter.s[1] = (float) (image->rows-1)/2.0;
+  blurRadius=hypot(blurCenter.s[0],blurCenter.s[1]);
+  cossin_theta_size=(unsigned int) fabs(4.0*DegreesToRadians(angle)*sqrt((double)blurRadius)+2UL);
 
-  imageKernelBuffer = clEnv->library->clCreateBuffer(context, 
-    CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, width * sizeof(float), NULL,
-    &clStatus);
+  /* create a buffer for sin_theta and cos_theta */
+  sinThetaBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+  cosThetaBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
     goto cleanup;
   }
 
+
   queue = AcquireOpenCLCommandQueue(clEnv);
-  kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, 
-    CL_TRUE, CL_MAP_WRITE, 0, width * sizeof(float), 0, NULL, NULL, &clStatus);
+  sinThetaPtr = (float*) clEnv->library->clEnqueueMapBuffer(queue, sinThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
     goto cleanup;
   }
-  for (i = 0; i < width; i++)
+
+  cosThetaPtr = (float*) clEnv->library->clEnqueueMapBuffer(queue, cosThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
   {
-    kernelBufferPtr[i] = (float) kernel[i];
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
+    goto cleanup;
   }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr,
-    0, NULL, NULL);
- if (clStatus != CL_SUCCESS)
+
+  theta=DegreesToRadians(angle)/(MagickRealType) (cossin_theta_size-1);
+  offset=theta*(MagickRealType) (cossin_theta_size-1)/2.0;
+  for (i=0; i < (ssize_t) cossin_theta_size; i++)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, 
-      "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    cosThetaPtr[i]=(float)cos((double) (theta*i-offset));
+    sinThetaPtr[i]=(float)sin((double) (theta*i-offset));
+  }
+  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, sinThetaBuffer, sinThetaPtr, 0, NULL, NULL);
+  clStatus |= clEnv->library->clEnqueueUnmapMemObject(queue, cosThetaBuffer, cosThetaPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  offsetBuffer = clEnv->library->clCreateBuffer(context, 
-    CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, width * sizeof(cl_int2), NULL,
-    &clStatus);
+  /* get the OpenCL kernel */
+  rotationalBlurKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "RotationalBlur");
+  if (rotationalBlurKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  
+  /* set the kernel arguments; every result is OR-ed into clStatus so the
+     single check after the last argument catches a failure of any call */
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+
+  GetPixelInfo(image,&bias);
+  biasPixel.s[0] = bias.red;
+  biasPixel.s[1] = bias.green;
+  biasPixel.s[2] = bias.blue;
+  biasPixel.s[3] = bias.alpha;
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_float4), &biasPixel);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(ChannelType), &channel);
+
+  /* matte flags an image that has an alpha trait, the same test ComputeResizeImage uses */
+  matte = (image->alpha_trait != UndefinedPixelTrait)?1:0;
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(unsigned int), &matte);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_float2), &blurCenter);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&cosThetaBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(cl_mem),(void *)&sinThetaBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(rotationalBlurKernel,i++,sizeof(unsigned int), &cossin_theta_size);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "clEnv->library->clCreateBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
     goto cleanup;
   }
 
-  offsetBufferPtr = (int*)clEnv->library->clEnqueueMapBuffer(queue, offsetBuffer, CL_TRUE, 
-    CL_MAP_WRITE, 0, width * sizeof(cl_int2), 0, NULL, NULL, &clStatus);
+
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
+  /* launch the kernel */
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, rotationalBlurKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitError, "clEnv->library->clEnqueueMapBuffer failed.",".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
     goto cleanup;
   }
-  for (i = 0; i < width; i++)
+  clEnv->library->clFlush(queue);
+
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    offsetBufferPtr[2*i] = (int)offset[i].x;
-    offsetBufferPtr[2*i+1] = (int)offset[i].y;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
-  clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, offsetBuffer, offsetBufferPtr, 0, 
-    NULL, NULL);
- if (clStatus != CL_SUCCESS)
+  else 
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
-      "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+
+  image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
+
+  if (filteredImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (imageBuffer!=NULL)     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (sinThetaBuffer!=NULL)       clEnv->library->clReleaseMemObject(sinThetaBuffer);
+  if (cosThetaBuffer!=NULL)       clEnv->library->clReleaseMemObject(cosThetaBuffer);
+  if (rotationalBlurKernel!=NULL) RelinquishOpenCLKernel(clEnv, rotationalBlurKernel);
+  if (queue != NULL)              RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+MagickExport Image* AccelerateRotationalBlurImage(const Image *image,
+  const ChannelType channel,const double angle,ExceptionInfo *exception)
+{
+  /*
+    Returns the image produced by ComputeRotationalBlurImage(), or NULL
+    when either precondition check below reports MagickFalse.
+  */
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+  if (checkAccelerateCondition(image, channel) == MagickFalse)
+    return NULL;
+  return ComputeRotationalBlurImage(image, channel, angle, exception);
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     A c c e l e r a t e U n s h a r p M a s k I m a g e                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+*/
+
+static Image *ComputeUnsharpMaskImage(const Image *image,
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,ExceptionInfo *exception)
+{
+  CacheView
+    *filteredImage_view,
+    *image_view;
+
+  char
+    geometry[MagickPathExtent];
+
+  cl_command_queue
+    queue;
+
+  cl_context
+    context;
+
+  cl_int
+    clStatus;
+
+  cl_kernel
+    blurRowKernel,
+    unsharpMaskBlurColumnKernel;
+
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer,
+    imageKernelBuffer,
+    tempImageBuffer;
+
+  cl_mem_flags
+    mem_flags;
+
+  const void
+    *inputPixels;
+
+  float
+    fGain,
+    fThreshold,
+    *kernelBufferPtr;
+
+  Image
+    *filteredImage;
+
+  int
+    chunkSize;
+
+  KernelInfo
+    *kernel;
+
+  MagickBooleanType
+    outputReady;
+
+  MagickCLEnv
+    clEnv;
+
+  MagickSizeType
+    length;
+
+  void
+    *filteredPixels,
+    *hostPtr;
+
+  unsigned int
+    i,
+    imageColumns,
+    imageRows,
+    kernelWidth;
+
+  clEnv = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  kernel = NULL;
+  context = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  tempImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  unsharpMaskBlurColumnKernel = NULL;
+  queue = NULL;
+  outputReady = MagickFalse;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    image_view=AcquireVirtualCacheView(image,exception);
+    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+      goto cleanup;
+    }
+
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+    assert(filteredImage != NULL);
+    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create the blur kernel */
+  {
+    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry,exception);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
+
+    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+
+    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  {
+    /* create temp buffer */
+    {
+      length = image->columns * image->rows;
+      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
+
+    /* get the opencl kernel */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRow");
+      if (blurRowKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+
+      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumn");
+      if (unsharpMaskBlurColumnKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
+
+    {
+      chunkSize = 256;
+
+      imageColumns = (unsigned int) image->columns;
+      imageRows = (unsigned int) image->rows;
+
+      kernelWidth = (unsigned int) kernel->width;
+
+      /* set the kernel arguments */
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *) NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* launch the kernel */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
+
+      gsize[0] = chunkSize*((image->columns+chunkSize-1)/chunkSize);
+      gsize[1] = image->rows;
+      wsize[0] = chunkSize;
+      wsize[1] = 1;
+
+      clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clEnv->library->clFlush(queue);
+    }
+
+
+    {
+      chunkSize = 256;
+      imageColumns = (unsigned int) image->columns;
+      imageRows = (unsigned int) image->rows;
+      kernelWidth = (unsigned int) kernel->width;
+      fGain = (float) gain;
+      fThreshold = (float) threshold;
+
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
+
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* launch the kernel */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
+
+      gsize[0] = image->columns;
+      gsize[1] = chunkSize*((image->rows+chunkSize-1)/chunkSize);
+      wsize[0] = 1;
+      wsize[1] = chunkSize;
+
+      clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clEnv->library->clFlush(queue);
+    }
+
+  }
+
+  /* get result */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+
+  image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
+
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
+  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return(filteredImage);
+}
+
+/*
+  ComputeUnsharpMaskImageSection() applies the OpenCL unsharp mask in two
+  passes ("sections" 0 and 1), each covering roughly half of the image rows,
+  so the intermediate row-blur buffer only needs room for about half of the
+  rows plus a kernel-sized margin.
+
+    o image: the source image; all pixels are read through a virtual cache
+      view.
+    o channel: forwarded unchanged to both OpenCL kernels.
+    o radius, sigma: used to build the "blur:" kernel geometry string.
+    o gain, threshold: converted to float and passed to the column kernel.
+    o exception: receives the warning raised for any failing step.
+
+  Returns a freshly cloned image holding the result, or NULL when any step
+  fails (a partially built clone is destroyed before returning).
+*/
+static Image *ComputeUnsharpMaskImageSection(const Image *image,
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,ExceptionInfo *exception)
+{
+  CacheView
+    *filteredImage_view,
+    *image_view;
+
+  char
+    geometry[MagickPathExtent];
+
+  cl_command_queue
+    queue;
+
+  cl_context
+    context;
+
+  cl_int
+    clStatus;
+
+  cl_kernel
+    blurRowKernel,
+    unsharpMaskBlurColumnKernel;
+
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer,
+    imageKernelBuffer,
+    tempImageBuffer;
+
+  cl_mem_flags
+    mem_flags;
+
+  const void
+    *inputPixels;
+
+  float
+    fGain,
+    fThreshold,
+    *kernelBufferPtr;
+
+  Image
+    *filteredImage;
+
+  int
+    chunkSize;
+
+  KernelInfo
+    *kernel;
+
+  MagickBooleanType
+    outputReady;
+
+  MagickCLEnv
+    clEnv;
+
+  MagickSizeType
+    length;
+
+  void
+    *filteredPixels,
+    *hostPtr;
+
+  unsigned int
+    i,
+    imageColumns,
+    imageRows,
+    kernelWidth;
+
+  /* Everything released in the cleanup section starts out as NULL so that an
+     early `goto cleanup` only frees what was actually acquired. */
+  clEnv = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  kernel = NULL;
+  context = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  tempImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  unsharpMaskBlurColumnKernel = NULL;
+  queue = NULL;
+  outputReady = MagickFalse;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    image_view=AcquireVirtualCacheView(image,exception);
+    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+      goto cleanup;
+    }
+
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+    /* Bail out on a failed clone instead of asserting: assert() is compiled
+       out under NDEBUG, which would let a NULL image reach
+       SetImageStorageClass().  The cleanup path copes with a NULL image. */
+    if (filteredImage == (Image *) NULL)
+      goto cleanup;
+    /* NOTE(review): this branch fires when SetImageStorageClass() fails,
+       although the message text names CloneImage. */
+    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    /* Aligned output pixels back the device buffer directly; otherwise the
+       buffer has no host pointer and is read back explicitly further down. */
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create the blur kernel */
+  {
+    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry,exception);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
+
+    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+
+    /* Upload the first kernel->width coefficients as floats by mapping the
+       device buffer, filling it, and unmapping it again. */
+    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  {
+    unsigned int offsetRows;
+    unsigned int sec;
+
+    /* create temp buffer */
+    {
+      /* Sized for the larger section: half of the rows (rounded up via +1)
+         plus (kernel->width-1)/2 extra rows, four floats per pixel. */
+      length = image->columns * (image->rows / 2 + 1 + (kernel->width-1) / 2);
+      tempImageBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
+
+    /* get the opencl kernel */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRowSection");
+      if (blurRowKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+
+      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumnSection");
+      if (unsharpMaskBlurColumnKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
+
+    /* Section 0 starts at row 0, section 1 at row image->rows/2; each one
+       runs the row kernel followed by the column kernel. */
+    for (sec = 0; sec < 2; sec++)
+    {
+      {
+        chunkSize = 256;
+
+        /* The row pass covers the section's rows plus (kernel->width-1)/2
+           extra rows -- presumably the margin the column pass reads. */
+        imageColumns = (unsigned int) image->columns;
+        if (sec == 0)
+          imageRows = (unsigned int) (image->rows / 2 + (kernel->width-1) / 2);
+        else
+          imageRows = (unsigned int) ((image->rows - image->rows / 2) + (kernel->width-1) / 2);
+
+        offsetRows = (unsigned int) (sec * image->rows / 2);
+
+        kernelWidth = (unsigned int) kernel->width;
+
+        /* set the kernel arguments */
+        i = 0;
+        clStatus=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *) NULL);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+        clStatus|=clEnv->library->clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&sec);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+      /* launch the kernel */
+      {
+        size_t gsize[2];
+        size_t wsize[2];
+
+        /* Round the column count up to a whole number of chunkSize-wide
+           work-groups; each work-group spans a single row. */
+        gsize[0] = chunkSize*((imageColumns+chunkSize-1)/chunkSize);
+        gsize[1] = imageRows;
+        wsize[0] = chunkSize;
+        wsize[1] = 1;
+
+        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clEnv->library->clFlush(queue);
+      }
 
 
- // get the OpenCL kernel
-  motionBlurKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, 
-    "MotionBlur");
-  if (motionBlurKernel == NULL)
-  {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
-      "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  
-  // set the kernel arguments
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
-    (void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
-    (void *)&filteredImageBuffer);
-  imageWidth = (unsigned int) image->columns;
-  imageHeight = (unsigned int) image->rows;
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
-    &imageWidth);
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
-    &imageHeight);
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
-    (void *)&imageKernelBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int),
-    &width);
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_mem),
-    (void *)&offsetBuffer);
+      {
+        chunkSize = 256;
 
-  GetPixelInfo(image,&bias);
-  biasPixel.s[0] = bias.red;
-  biasPixel.s[1] = bias.green;
-  biasPixel.s[2] = bias.blue;
-  biasPixel.s[3] = bias.alpha;
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(cl_float4), &biasPixel);
+        imageColumns = (unsigned int) image->columns;
+        if (sec == 0)
+          imageRows = (unsigned int) (image->rows / 2);
+        else
+          imageRows = (unsigned int) (image->rows - image->rows / 2);
 
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(ChannelType), &channel);
-  matte = (image->alpha_trait == UndefinedPixelTrait)?1:0;
-  clStatus|=clEnv->library->clSetKernelArg(motionBlurKernel,i++,sizeof(unsigned int), &matte);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
-      "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
+        offsetRows = (unsigned int) (sec * image->rows / 2);
 
-  // launch the kernel
-  local_work_size[0] = 16;
-  local_work_size[1] = 16;
-  global_work_size[0] = (size_t)padGlobalWorkgroupSizeToLocalWorkgroupSize(
-                                (unsigned int) image->columns,(unsigned int) local_work_size[0]);
-  global_work_size[1] = (size_t)padGlobalWorkgroupSizeToLocalWorkgroupSize(
-                                (unsigned int) image->rows,(unsigned int) local_work_size[1]);
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, motionBlurKernel, 2, NULL, 
-    global_work_size, local_work_size, 0, NULL, NULL);
+        kernelWidth = (unsigned int) kernel->width;
 
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
-      "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-    goto cleanup;
+        fGain = (float) gain;
+        fThreshold = (float) threshold;
+
+        /* Arguments 5 and 6 are sized with a NULL pointer, i.e. they only
+           reserve local memory for the kernel. */
+        i = 0;
+        clStatus=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+        clStatus|=clEnv->library->clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&sec);
+
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+
+      /* launch the kernel */
+      {
+        size_t gsize[2];
+        size_t wsize[2];
+
+        /* Transposed layout compared with the row pass: work-groups are one
+           column wide and chunkSize rows tall. */
+        gsize[0] = imageColumns;
+        gsize[1] = chunkSize*((imageRows+chunkSize-1)/chunkSize);
+        wsize[0] = 1;
+        wsize[1] = chunkSize;
+
+        clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clEnv->library->clFlush(queue);
+      }
+    }
   }
-  clEnv->library->clFlush(queue);
 
+  /* get result */
   if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, 
-      CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, 
-      NULL, &clStatus);
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, 
-      length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
-    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError,
-      "Reading output image from CL buffer failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
+
   outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
 cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
   if (filteredImage_view != NULL)
     filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (filteredImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (imageBuffer!=NULL)     clEnv->library->clReleaseMemObject(imageBuffer);
-  if (imageKernelBuffer!=NULL)    clEnv->library->clReleaseMemObject(imageKernelBuffer);
-  if (motionBlurKernel!=NULL)  RelinquishOpenCLKernel(clEnv, motionBlurKernel);
-  if (queue != NULL)           RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (outputReady == MagickFalse && filteredImage != NULL)
-    filteredImage=DestroyImage(filteredImage);
-
-  return(filteredImage);
-}
-
-MagickExport Image *AccelerateMotionBlurImage(const Image *image,
-  const ChannelType channel,const double* kernel,const size_t width,
-  const OffsetInfo *offset,ExceptionInfo *exception)
-{
-  Image
-    *filteredImage;
-
-  assert(image != NULL);
-  assert(kernel != (double *) NULL);
-  assert(offset != (OffsetInfo *) NULL);
-  assert(exception != (ExceptionInfo *) NULL);
-
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse))
-    return NULL;
-
-  filteredImage=ComputeMotionBlurImage(image, channel, kernel, width,
-    offset, exception);
-  return(filteredImage);
-}
-
-static MagickBooleanType LaunchCompositeKernel(MagickCLEnv clEnv,
-  cl_command_queue queue,cl_mem imageBuffer,const unsigned int inputWidth,
-  const unsigned int inputHeight,const unsigned int matte,
-  const ChannelType channel,const CompositeOperator compose,
-  const cl_mem compositeImageBuffer,const unsigned int compositeWidth,
-  const unsigned int compositeHeight,const float destination_dissolve,
-  const float source_dissolve,ExceptionInfo *magick_unused(exception))
-{
-  cl_int
-    clStatus;
-
-  cl_kernel
-    compositeKernel;
-
-  int
-    k;
-
-  size_t
-    global_work_size[2],
-    local_work_size[2];
-
-  unsigned int
-    composeOp;
-
-  magick_unreferenced(exception);
-
-  compositeKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE,
-    "Composite");
-
-  k = 0;
-  clStatus=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(cl_mem),(void*)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&inputWidth);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&inputHeight);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(cl_mem),(void*)&compositeImageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&compositeWidth);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&compositeHeight);
-  composeOp = (unsigned int)compose;
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&composeOp);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(ChannelType),(void*)&channel);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(unsigned int),(void*)&matte);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(float),(void*)&destination_dissolve);
-  clStatus|=clEnv->library->clSetKernelArg(compositeKernel,k++,sizeof(float),(void*)&source_dissolve);
-
-  if (clStatus!=CL_SUCCESS)
-    return MagickFalse;
-
-  local_work_size[0] = 64;
-  local_work_size[1] = 1;
-
-  global_work_size[0] = padGlobalWorkgroupSizeToLocalWorkgroupSize(inputWidth,
-    (unsigned int) local_work_size[0]);
-  global_work_size[1] = inputHeight;
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, compositeKernel, 2, NULL, 
-    global_work_size, local_work_size, 0, NULL, NULL);
-
-
-  RelinquishOpenCLKernel(clEnv, compositeKernel);
-
-  return((clStatus==CL_SUCCESS) ? MagickTrue : MagickFalse);
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (tempImageBuffer!=NULL)                  clEnv->library->clReleaseMemObject(tempImageBuffer);
+  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  /* Never hand back a half-written image: without a successful pixel sync
+     the clone is destroyed and NULL is returned. */
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
 }
 
-static MagickBooleanType ComputeCompositeImage(Image *image,
-  const ChannelType channel,const CompositeOperator compose,
-  const Image *compositeImage,const ssize_t magick_unused(x_offset),
-  const ssize_t magick_unused(y_offset),const float destination_dissolve,
-  const float source_dissolve,ExceptionInfo *exception)
+static Image *ComputeUnsharpMaskImageSingle(const Image *image,  /* runs the "UnsharpMask" OpenCL kernel over the whole image in one launch */
+  const ChannelType channel,const double radius,const double sigma,  /* radius and sigma are formatted into the blur kernel geometry */
+  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception)  /* returns a new image, or NULL on any failure; blurOnly is handed to the kernel as justBlur */
 {
   CacheView
+    *filteredImage_view,
     *image_view;
 
+  char
+    geometry[MagickPathExtent];  /* kernel description string given to AcquireKernelInfo() */
+
   cl_command_queue
     queue;
 
@@ -7252,21 +6942,36 @@ static MagickBooleanType ComputeCompositeImage(Image *image,
     context;
 
   cl_int
+    justBlur,  /* kernel-argument copy of blurOnly */
     clStatus;
 
+  cl_kernel
+    unsharpMaskKernel;
+
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer,
+    imageKernelBuffer;
+
   cl_mem_flags
     mem_flags;
 
-  cl_mem
-    compositeImageBuffer,
-    imageBuffer;
-
   const void
-    *composePixels;
+    *inputPixels;
+
+  float
+    fGain,
+    fThreshold,
+    *kernelBufferPtr;
+
+  Image
+    *filteredImage;
+
+  KernelInfo
+    *kernel;
 
   MagickBooleanType
-    outputReady,
-    status;
+    outputReady;
 
   MagickCLEnv
     clEnv;
@@ -7275,192 +6980,272 @@ static MagickBooleanType ComputeCompositeImage(Image *image,
     length;
 
   void
-    *inputPixels;
+    *filteredPixels,
+    *hostPtr;
 
-  magick_unreferenced(x_offset);
-  magick_unreferenced(y_offset);
+  unsigned int
+    i,
+    imageColumns,
+    imageRows,
+    kernelWidth;
 
-  status = MagickFalse;
-  outputReady = MagickFalse;
-  composePixels = NULL;
+  clEnv = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;  /* cleanup tests these for NULL, so they must start out NULL */
+  kernel = NULL;
+  context = NULL;
   imageBuffer = NULL;
-  compositeImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  unsharpMaskKernel = NULL;
+  queue = NULL;
+  outputReady = MagickFalse;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
 
   /* Create and initialize OpenCL buffers. */
-  image_view=AcquireAuthenticCacheView(image,exception);
-  inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
   {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,
-      "UnableToReadPixelCache.","`%s'",image->filename);
-    goto cleanup;
-  }
+    image_view=AcquireVirtualCacheView(image,exception);  /* assigned before the first goto, so cleanup can always destroy it */
+    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+      goto cleanup;
+    }
 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
      then use the host buffer directly from the GPU; otherwise, 
      create a buffer on the GPU and copy the data over */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
   }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
-    length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
+
+  /* create output */
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
+    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);  /* result image: a clone with the input's dimensions */
+    if ((filteredImage == (Image *) NULL) ||  /* a failed clone is handled at run time instead of relying on assert() */
+        (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue))
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
 
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;  /* no host pointer: the result is copied into filteredPixels by the read-back below */
+      hostPtr = NULL;
+    }
 
-  /* Create and initialize OpenCL buffers. */
-  composePixels = AcquirePixelCachePixels(compositeImage, &length, exception); 
-  if (composePixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,
-      "UnableToReadPixelCache.","`%s'",compositeImage->filename);
-    goto cleanup;
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
   }
 
-  /* If the host pointer is aligned to the size of CLPixelPacket, 
-     then use the host buffer directly from the GPU; otherwise, 
-     create a buffer on the GPU and copy the data over */
-  if (ALIGNED(composePixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-  }
-  else 
+  /* create the blur kernel */
   {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    (void) FormatLocaleString(geometry,MagickPathExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry,exception);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
+
+    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);  /* device buffer for kernel->width float weights */
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+
+    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];  /* weights are converted to float for the device */
+    }
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
   }
-  /* create a CL buffer from image pixel buffer */
-  length = compositeImage->columns * compositeImage->rows;
-  compositeImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, 
-    length * sizeof(CLPixelPacket), (void*)composePixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
+
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), 
-      ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-  
-  status = LaunchCompositeKernel(clEnv,queue,imageBuffer,
-           (unsigned int) image->columns,
-           (unsigned int) image->rows,
-           (unsigned int) (image->alpha_trait == UndefinedPixelTrait) ? 1 : 0,
-           channel, compose, compositeImageBuffer,
-           (unsigned int) compositeImage->columns,
-           (unsigned int) compositeImage->rows,
-           destination_dissolve,source_dissolve,
-           exception);
+    /* get the opencl kernel */
+    {
+      unsharpMaskKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMask");
+      if (unsharpMaskKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
 
-  if (status==MagickFalse)
-    goto cleanup;
+    {
+      imageColumns = (unsigned int) image->columns;  /* host values are narrowed to the types passed as kernel arguments */
+      imageRows = (unsigned int) image->rows;
+      kernelWidth = (unsigned int) kernel->width;
+      fGain = (float) gain;
+      fThreshold = (float) threshold;
+      justBlur = blurOnly;
 
-  length = image->columns * image->rows;
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
+      /* set the kernel arguments */
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_float4)*(8 * (32 + kernel->width)),(void *) NULL);  /* size with NULL value: presumably the kernel's __local scratch area -- confirm against the UnsharpMask kernel source */
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fGain);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fThreshold);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_uint),(void *)&justBlur);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* launch the kernel */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
+
+      gsize[0] = ((image->columns + 7) / 8) * 8;  /* global size rounded up to a whole number of 8x32 work-groups */
+      gsize[1] = ((image->rows + 31) / 32) * 32;
+      wsize[0] = 8;
+      wsize[1] = 32;
+
+      clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clEnv->library->clFlush(queue);
+    }
+  }
+
+  /* get result */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, 
-      CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, 
-      NULL, &clStatus);
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);  /* blocking map; the returned pointer is not kept */
   }
-  else
+  else 
   {
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, 
-      length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);  /* blocking copy into the clone's pixels */
   }
-  if (clStatus==CL_SUCCESS)
-    outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);  /* only a successful sync marks the result as usable */
 
 cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
-  if (imageBuffer!=NULL)      clEnv->library->clReleaseMemObject(imageBuffer);
-  if (compositeImageBuffer!=NULL)  clEnv->library->clReleaseMemObject(compositeImageBuffer);
-  if (queue != NULL)               RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  return(outputReady);
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (unsharpMaskKernel!=NULL)                RelinquishOpenCLKernel(clEnv, unsharpMaskKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)  /* failure path: drop the partially built clone so the caller gets NULL */
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return(filteredImage);
 }
 
-MagickExport MagickBooleanType AccelerateCompositeImage(Image *image,
-  const ChannelType channel,const CompositeOperator compose,
-  const Image *composite,const ssize_t x_offset,const ssize_t y_offset,
-  const float destination_dissolve,const float source_dissolve,
-  ExceptionInfo *exception)
+MagickExport Image *AccelerateUnsharpMaskImage(const Image *image,  /* OpenCL entry point: checks eligibility, then dispatches to one of three ComputeUnsharpMaskImage* variants */
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,ExceptionInfo *exception)  /* returns whatever the chosen variant returns, or NULL when the checks fail */
 {
-  MagickBooleanType
-    status;
+  Image
+    *filteredImage;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
       (checkAccelerateCondition(image, channel) == MagickFalse))
-    return(MagickFalse);
-
-  /* only support zero offset and
-     images with the size for now */
-  if (x_offset!=0
-    || y_offset!=0
-    || image->columns!=composite->columns
-    || image->rows!=composite->rows)
-    return MagickFalse;
-
-  switch(compose) {
-  case ColorDodgeCompositeOp: 
-  case BlendCompositeOp:
-    break;
-  default:
-    // unsupported compose operator, quit
-    return MagickFalse;
-  };
-
-  status = ComputeCompositeImage(image,channel,compose,composite,
-    x_offset,y_offset,destination_dissolve,source_dissolve,exception);
+    return NULL;  /* environment or image rejected by the checks above: nothing is computed */
 
-  return(status);
+  if (radius < 12.1)  /* radii below 12.1 use the single-kernel variant, with blurOnly = 0 */
+    filteredImage = ComputeUnsharpMaskImageSingle(image,channel,radius,sigma,gain,threshold, 0, exception);
+  else if (splitImage(image) && (image->rows / 2 > radius))   /* sectioned variant only when splitImage() agrees and half the rows exceed the radius */
+    filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception);
+  else
+    filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception);
+  return(filteredImage);
 }
 
 #else  /* MAGICKCORE_OPENCL_SUPPORT  */
 
-MagickExport Image *AccelerateConvolveImageChannel(
-  const Image *magick_unused(image),const ChannelType magick_unused(channel),
-  const KernelInfo *magick_unused(kernel),
-  ExceptionInfo *magick_unused(exception))
-{
-  magick_unreferenced(image);
-  magick_unreferenced(channel);
-  magick_unreferenced(kernel);
-  magick_unreferenced(exception);
-
-  return NULL;
-}
-
-MagickExport MagickBooleanType AccelerateFunctionImage(
-  Image *magick_unused(image),const ChannelType magick_unused(channel),
-  const MagickFunction magick_unused(function),
-  const size_t magick_unused(number_parameters),
-  const double *magick_unused(parameters),
-  ExceptionInfo *magick_unused(exception))
+MagickExport Image *AccelerateAddNoiseImage(const Image *magick_unused(image),  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const ChannelType magick_unused(channel),
+  const NoiseType magick_unused(noise_type),
+  ExceptionInfo *magick_unused(exception)) 
 {
   magick_unreferenced(image);
   magick_unreferenced(channel);
-  magick_unreferenced(function);
-  magick_unreferenced(number_parameters);
-  magick_unreferenced(parameters);
+  magick_unreferenced(noise_type);
   magick_unreferenced(exception);
-
-  return MagickFalse;
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
 MagickExport Image *AccelerateBlurImage(const Image *magick_unused(image),
@@ -7473,57 +7258,17 @@ MagickExport Image *AccelerateBlurImage(const Image *magick_unused(image),
   magick_unreferenced(sigma);
   magick_unreferenced(exception);
 
-  return NULL;
-}
-
-MagickExport Image *AccelerateLocalContrastImage(
-  const Image *magick_unused(image),const double magick_unused(radius),
-  const double magick_unused(strength),ExceptionInfo *magick_unused(exception))
-{
-  magick_unreferenced(image);
-  magick_unreferenced(radius);
-  magick_unreferenced(strength);
-  magick_unreferenced(exception);
-
-  return NULL;
-}
-
-MagickExport Image *AccelerateRotationalBlurImage(
-  const Image *magick_unused(image),const ChannelType magick_unused(channel),
-  const double magick_unused(angle),ExceptionInfo *magick_unused(exception))
-{
-  magick_unreferenced(image);
-  magick_unreferenced(channel);
-  magick_unreferenced(angle);
-  magick_unreferenced(exception);
-
-  return NULL;
+  return((Image *) NULL);  /* non-OpenCL stub for blur: the visible body only discards arguments and yields NULL */
 }
 
-
-MagickExport Image *AccelerateUnsharpMaskImage(
-  const Image *magick_unused(image),const ChannelType magick_unused(channel),
-  const double magick_unused(radius),const double magick_unused(sigma),
-  const double magick_unused(gain),const double magick_unused(threshold),
+MagickExport MagickBooleanType AccelerateCompositeImage(  /* stub in the #else (non-OpenCL) branch: all parameters are marked unused */
+  Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const CompositeOperator magick_unused(compose),
+  const Image *magick_unused(composite),const ssize_t magick_unused(x_offset),
+  const ssize_t magick_unused(y_offset),
+  const float magick_unused(destination_dissolve),
+  const float magick_unused(source_dissolve),
   ExceptionInfo *magick_unused(exception))
-{
-  magick_unreferenced(image);
-  magick_unreferenced(channel);
-  magick_unreferenced(radius);
-  magick_unreferenced(sigma);
-  magick_unreferenced(gain);
-  magick_unreferenced(threshold);
-  magick_unreferenced(exception);
-
-  return NULL;
-}
-
-MagickExport
-MagickBooleanType AccelerateCompositeImage(Image *image,
-  const ChannelType channel,const CompositeOperator compose,
-  const Image *composite,const ssize_t x_offset,const ssize_t y_offset,
-  const float destination_dissolve,const float source_dissolve,
-  ExceptionInfo *exception)
 {
   magick_unreferenced(image);
   magick_unreferenced(channel);
@@ -7535,10 +7280,9 @@ MagickBooleanType AccelerateCompositeImage(Image *image,
   magick_unreferenced(source_dissolve);
   magick_unreferenced(exception);
 
-  return MagickFalse;
+  return(MagickFalse);  /* always MagickFalse: nothing is composited here */
 }
 
-
 MagickExport MagickBooleanType AccelerateContrastImage(
   Image* magick_unused(image),const MagickBooleanType magick_unused(sharpen),
   ExceptionInfo* magick_unused(exception))
@@ -7547,12 +7291,14 @@ MagickExport MagickBooleanType AccelerateContrastImage(
   magick_unreferenced(sharpen);
   magick_unreferenced(exception);
 
-  return MagickFalse;
+  return(MagickFalse);  /* non-OpenCL stub: arguments are discarded and MagickFalse is always returned */
 }
 
-MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
-    Image * image, const ChannelType channel, const double black_point, const double white_point, 
-    ExceptionInfo* magick_unused(exception))
+MagickExport MagickBooleanType AccelerateContrastStretchImage(  /* stub in the #else (non-OpenCL) branch; this commit renames it from AccelerateContrastStretchImageChannel */
+  Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double magick_unused(black_point),
+  const double magick_unused(white_point),
+  ExceptionInfo* magick_unused(exception))
 {
   magick_unreferenced(image);
   magick_unreferenced(channel);
@@ -7560,7 +7306,20 @@ MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
   magick_unreferenced(white_point);
   magick_unreferenced(exception);
 
-  return MagickFalse;
+  return(MagickFalse);  /* always MagickFalse: no stretch is performed here */
+}
+
+MagickExport Image *AccelerateConvolveImageChannel(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const KernelInfo *magick_unused(kernel),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(image);
+  magick_unreferenced(channel);
+  magick_unreferenced(kernel);
+  magick_unreferenced(exception);
+
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
 MagickExport MagickBooleanType AccelerateEqualizeImage(
@@ -7571,7 +7330,7 @@ MagickExport MagickBooleanType AccelerateEqualizeImage(
   magick_unreferenced(channel);
   magick_unreferenced(exception);
 
-  return MagickFalse;
+  return(MagickFalse);  /* non-OpenCL stub: the visible body only discards arguments and returns MagickFalse */
 }
 
 MagickExport Image *AccelerateDespeckleImage(const Image* magick_unused(image),
@@ -7580,28 +7339,54 @@ MagickExport Image *AccelerateDespeckleImage(const Image* magick_unused(image),
   magick_unreferenced(image);
   magick_unreferenced(exception);
 
-  return NULL;
+  return((Image *) NULL);  /* non-OpenCL stub: the visible body only discards arguments and yields NULL */
 }
 
-MagickExport Image *AccelerateResizeImage(const Image* magick_unused(image),
-  const size_t magick_unused(resizedColumns),
-  const size_t magick_unused(resizedRows),
-  const ResizeFilter* magick_unused(resizeFilter),
+MagickExport MagickBooleanType AccelerateFunctionImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const MagickFunction magick_unused(function),
+  const size_t magick_unused(number_parameters),
+  const double *magick_unused(parameters),
   ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
-  magick_unreferenced(resizedColumns);
-  magick_unreferenced(resizedRows);
-  magick_unreferenced(resizeFilter);
+  magick_unreferenced(channel);
+  magick_unreferenced(function);
+  magick_unreferenced(number_parameters);
+  magick_unreferenced(parameters);
+  magick_unreferenced(exception);
+
+  return(MagickFalse);  /* always MagickFalse: the function is never applied here */
+}
+
+MagickExport MagickBooleanType AccelerateGrayscaleImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  Image *magick_unused(image),const PixelIntensityMethod magick_unused(method),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(image);
+  magick_unreferenced(method);
+  magick_unreferenced(exception);
+
+  return(MagickFalse);  /* always MagickFalse: the image is left untouched */
+}
+
+MagickExport Image *AccelerateLocalContrastImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const Image *magick_unused(image),const double magick_unused(radius),
+  const double magick_unused(strength),ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(image);
+  magick_unreferenced(radius);
+  magick_unreferenced(strength);
   magick_unreferenced(exception);
 
-  return NULL;
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
-MagickExport
-MagickBooleanType AccelerateModulateImage(
-  Image* image, double percent_brightness, double percent_hue, 
-  double percent_saturation, ColorspaceType colorspace, ExceptionInfo* exception)
+MagickExport MagickBooleanType AccelerateModulateImage(  /* stub in the #else (non-OpenCL) branch; parameters now carry magick_unused() like the other stubs */
+  Image *magick_unused(image),double magick_unused(percent_brightness),
+  double magick_unused(percent_hue),double magick_unused(percent_saturation),
+  ColorspaceType magick_unused(colorspace),
+  ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
   magick_unreferenced(percent_brightness);
@@ -7609,50 +7394,77 @@ MagickBooleanType AccelerateModulateImage(
   magick_unreferenced(percent_saturation);
   magick_unreferenced(colorspace);
   magick_unreferenced(exception);
+
   return(MagickFalse);
 }
 
-MagickExport
-MagickBooleanType AccelerateGrayscaleImage(
-  Image* image, const PixelIntensityMethod method, ExceptionInfo* exception)
+MagickExport Image *AccelerateMotionBlurImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double *magick_unused(kernel),const size_t magick_unused(width),
+  const OffsetInfo *magick_unused(offset),
+  ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
-  magick_unreferenced(method);
+  magick_unreferenced(channel);
+  magick_unreferenced(kernel);
+  magick_unreferenced(width);
+  magick_unreferenced(offset);
   magick_unreferenced(exception);
-  return(MagickFalse);
+
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
-MagickExport Image *AccelerateAddNoiseImage(const Image *image, 
-  const ChannelType channel, const NoiseType noise_type,ExceptionInfo *exception) 
+MagickExport MagickBooleanType AccelerateRandomImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  Image *magick_unused(image),ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
-  magick_unreferenced(channel);
-  magick_unreferenced(noise_type);
   magick_unreferenced(exception);
-  return NULL;
+
+  return(MagickFalse);  /* always MagickFalse; written in the parenthesised form the other stubs use */
 }
 
+MagickExport Image *AccelerateResizeImage(const Image *magick_unused(image),  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const size_t magick_unused(resizedColumns),
+  const size_t magick_unused(resizedRows),
+  const ResizeFilter *magick_unused(resizeFilter),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(image);
+  magick_unreferenced(resizedColumns);
+  magick_unreferenced(resizedRows);
+  magick_unreferenced(resizeFilter);
+  magick_unreferenced(exception);
+
+  return((Image *) NULL);  /* every argument is ignored; no resized image is produced */
+}
 
-MagickExport MagickBooleanType AccelerateRandomImage(Image* image, ExceptionInfo* exception)
+MagickExport Image *AccelerateRotationalBlurImage(  /* stub in the #else (non-OpenCL) branch of MAGICKCORE_OPENCL_SUPPORT */
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double magick_unused(angle),ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
+  magick_unreferenced(channel);
+  magick_unreferenced(angle);
   magick_unreferenced(exception);
-  return MagickFalse;
+
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
-MagickExport
-Image* AccelerateMotionBlurImage(const Image *image, const ChannelType channel,
-                                const double* kernel, const size_t width,
-                                const OffsetInfo *offset, 
-                                ExceptionInfo *exception)
+MagickExport Image *AccelerateUnsharpMaskImage(  /* stub in the #else (non-OpenCL) branch; same signature as the OpenCL build's entry point */
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double magick_unused(radius),const double magick_unused(sigma),
+  const double magick_unused(gain),const double magick_unused(threshold),
+  ExceptionInfo *magick_unused(exception))
 {
   magick_unreferenced(image);
   magick_unreferenced(channel);
-  magick_unreferenced(kernel);
-  magick_unreferenced(width);
-  magick_unreferenced(offset);
+  magick_unreferenced(radius);
+  magick_unreferenced(sigma);
+  magick_unreferenced(gain);
+  magick_unreferenced(threshold);
   magick_unreferenced(exception);
-  return NULL;
+
+  return((Image *) NULL);  /* every argument is ignored; no image is produced */
 }
 
 #endif /* MAGICKCORE_OPENCL_SUPPORT */
\ No newline at end of file
index 86b64945f2f86563a4a913f6bc6e63c95439288c..e10b2f70ad01ab74363d36ea4fa03c16142dd543 100644 (file)
@@ -33,7 +33,7 @@ extern MagickExport MagickBooleanType
     const Image *,const ssize_t,const ssize_t,const float,const float,
     ExceptionInfo *),
   AccelerateContrastImage(Image *,const MagickBooleanType,ExceptionInfo *),
-  AccelerateContrastStretchImageChannel(Image *, const ChannelType,
+  AccelerateContrastStretchImage(Image *, const ChannelType,
     const double,const double,ExceptionInfo*),
   AccelerateEqualizeImage(Image *,const ChannelType,ExceptionInfo *),
   AccelerateFunctionImage(Image *,const ChannelType,const MagickFunction,