]> granicus.if.org Git - imagemagick/commitdiff
OpenCL performance improvements.
authordirk <dirk@git.imagemagick.org>
Wed, 13 Aug 2014 20:39:42 +0000 (20:39 +0000)
committerdirk <dirk@git.imagemagick.org>
Wed, 13 Aug 2014 20:39:42 +0000 (20:39 +0000)
MagickCore/accelerate-private.h
MagickCore/accelerate.c

index 726bbd362f503a9f949a61f4a3d5eab21c9fb13f..d5afd755fd63be3f47fc91fdc2bee2c64c3c1316 100644 (file)
@@ -1325,6 +1325,96 @@ const char* accelerateKernels =
     )
 
 
+    STRINGIFY(\r
+      __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,\r
+                         __constant float *filter,\r
+                         const unsigned int width, \r
+                         const unsigned int imageColumns, const unsigned int imageRows,\r
+                         __local float4 *pixels, \r
+                         const float gain, const float threshold, const unsigned int justBlur)\r
+      {\r
+        const int x = get_global_id(0);\r
+        const int y = get_global_id(1);\r
+\r
+        const unsigned int radius = (width - 1) / 2;\r
+                               \r
+               int row = y - radius;\r
+               int baseRow = get_group_id(1) * get_local_size(1) - radius;\r
+               int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;\r
+                               \r
+               while (row < endRow) {\r
+                       int srcy =  (row < 0) ? -row : row;                     // mirror pad\r
+                       srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;\r
+                                       \r
+                       float4 value = 0.0f;\r
+                                       \r
+                       int ix = x - radius;\r
+                       int i = 0;\r
+\r
+                       while (i + 7 < width) {\r
+                               for (int j = 0; j < 8; ++j) {           // unrolled\r
+                                       int srcx = ix + j;\r
+                                       srcx = (srcx < 0) ? -srcx : srcx;\r
+                                       srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;\r
+                                       value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);\r
+                               }\r
+                               ix += 8;\r
+                               i += 8;\r
+                       }\r
+\r
+                       while (i < width) {\r
+                               int srcx = (ix < 0) ? -ix : ix;                 // mirror pad\r
+                               srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;\r
+                               value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);\r
+                               ++i;\r
+                               ++ix;\r
+                       }       \r
+                       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;\r
+                       row += get_local_size(1);\r
+               }\r
+                               \r
+                       \r
+               barrier(CLK_LOCAL_MEM_FENCE);\r
+\r
+                                               \r
+               const int px = get_local_id(0);\r
+               const int py = get_local_id(1);\r
+               const int prp = get_local_size(0);\r
+               float4 value = (float4)(0.0f);\r
+                       \r
+               int i = 0;\r
+               while (i + 7 < width) {                 // unrolled\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];\r
+                       i += 8;\r
+               }\r
+               while (i < width) {\r
+                       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];\r
+                       ++i;\r
+               }\r
+\r
+               if (justBlur == 0) {            // apply sharpening\r
+                       float4 srcPixel = convert_float4(im[x + y * imageColumns]);\r
+                       float4 diff = srcPixel - value;\r
+\r
+                       float quantumThreshold = QuantumRange*threshold;\r
+\r
+                       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);\r
+                       value = select(srcPixel + diff * gain, srcPixel, mask);\r
+               }\r
+       \r
+               if ((x < imageColumns) && (y < imageRows))\r
+                       filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));\r
+               }       \r
+       )
+
+
 
   STRINGIFY(
 
@@ -2654,220 +2744,308 @@ const char* accelerateKernels =
   )
 
 
-  STRINGIFY(
-  
-  typedef enum
-  {
-    UndefinedNoise,
-    UniformNoise,
-    GaussianNoise,
-    MultiplicativeGaussianNoise,
-    ImpulseNoise,
-    LaplacianNoise,
-    PoissonNoise,
-    RandomNoise
-  } NoiseType;
-
-  typedef struct {
-    const global float* rns;
-  } RandomNumbers;
-
-
-  float ReadPseudoRandomValue(RandomNumbers* r) {
-    float v = *r->rns;
-    r->rns++;
-    return v;
-  }
-  )
-
-  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
-  OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f))
-  OPENCL_DEFINE(SigmaImpulse,  (attenuate*0.1f))
-  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
-  OPENCL_DEFINE(SigmaMultiplicativeGaussian,  (attenuate*0.5f))
-  OPENCL_DEFINE(SigmaPoisson,  (attenuate*12.5f))
-  OPENCL_DEFINE(SigmaRandom,  (attenuate))
-  OPENCL_DEFINE(TauGaussian,  (attenuate*0.078125f))
-
-  STRINGIFY(
-  float GenerateDifferentialNoise(RandomNumbers* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
-    float 
-      alpha,
-      beta,
-      noise,
-      sigma;
-
-    noise = 0.0f;
-    alpha=ReadPseudoRandomValue(r);
-    switch(noise_type) {
-    case UniformNoise:
-    default:
-      {
-        noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
-        break;
-      }
-    case GaussianNoise:
-      {
-        float
-          gamma,
-          tau;
-
-        if (alpha == 0.0f)
-          alpha=1.0f;
-        beta=ReadPseudoRandomValue(r);
-        gamma=sqrt(-2.0f*log(alpha));
-        sigma=gamma*cospi((2.0f*beta));
-        tau=gamma*sinpi((2.0f*beta));
-        noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
-                      QuantumRange*TauGaussian*tau);        
-        break;
-      }
+OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
+OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f))
+OPENCL_DEFINE(SigmaImpulse,  (attenuate*0.1f))
+OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
+OPENCL_DEFINE(SigmaMultiplicativeGaussian,  (attenuate*0.5f))
+OPENCL_DEFINE(SigmaPoisson,  (attenuate*12.5f))
+OPENCL_DEFINE(SigmaRandom,  (attenuate))
+OPENCL_DEFINE(TauGaussian,  (attenuate*0.078125f))
 
+STRINGIFY(
 
-    case ImpulseNoise:
-    {
-      if (alpha < (SigmaImpulse/2.0f))
-        noise=0.0f;
-      else
-        if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
-          noise=(float)QuantumRange;
-        else
-          noise=(float)pixel;
-      break;
-    }
-    case LaplacianNoise:
-    {
-      if (alpha <= 0.5f)
-        {
-          if (alpha <= MagickEpsilon)
-            noise=(float) (pixel-QuantumRange);
-          else
-            noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
-              0.5f);
-          break;
-        }
-      beta=1.0f-alpha;
-      if (beta <= (0.5f*MagickEpsilon))
-        noise=(float) (pixel+QuantumRange);
-      else
-        noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
-      break;
-    }
-    case MultiplicativeGaussianNoise:
-    {
-      sigma=1.0f;
-      if (alpha > MagickEpsilon)
-        sigma=sqrt(-2.0f*log(alpha));
-      beta=ReadPseudoRandomValue(r);
-      noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
-        cospi((float) (2.0f*beta))/2.0f);
-      break;
-    }
-    case PoissonNoise:
-    {
-      float 
-        poisson;
-      unsigned int i;
-      poisson=exp(-SigmaPoisson*QuantumScale*pixel);
-      for (i=0; alpha > poisson; i++)
-      {
-        beta=ReadPseudoRandomValue(r);
-        alpha*=beta;
-      }
-      noise=(float) (QuantumRange*i/SigmaPoisson);
-      break;
-    }
-    case RandomNoise:
-    {
-      noise=(float) (QuantumRange*SigmaRandom*alpha);
-      break;
-    }
-
-    };
-    return noise;
-  }
+/*
+Part of MWC64X by David Thomas, dt10@imperial.ac.uk
+This is provided under BSD, full license is with the main package.
+See http://www.doc.ic.ac.uk/~dt10/research
+*/\r
+\r
+// Pre: a<M, b<M
+// Post: r=(a+b) mod M
+ulong MWC_AddMod64(ulong a, ulong b, ulong M)
+{
+       ulong v=a+b;
+       //if( (v>=M) || (v<a) )
+       if( (v>=M) || (convert_float(v) < convert_float(a)) )   // workaround for what appears to be an optimizer bug.
+               v=v-M;
+       return v;
+}
 
-  __kernel
-  void AddNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
-                    ,const unsigned int inputColumns, const unsigned int inputRows
-                    ,const ChannelType channel 
-                    ,const NoiseType noise_type, const float attenuate
-                    ,const __global float* randomNumbers, const unsigned int numRandomNumbersPerPixel
-                    ,const unsigned int rowOffset) {
+// Pre: a<M,b<M
+// Post: r=(a*b) mod M
+// This could be done more efficently, but it is portable, and should
+// be easy to understand. It can be replaced with any of the better
+// modular multiplication algorithms (for example if you know you have
+// double precision available or something).
+ulong MWC_MulMod64(ulong a, ulong b, ulong M)
+{      
+       ulong r=0;
+       while(a!=0){
+               if(a&1)
+                       r=MWC_AddMod64(r,b,M);
+               b=MWC_AddMod64(b,b,M);
+               a=a>>1;
+       }
+       return r;
+}
 
-    unsigned int x = get_global_id(0);
-    unsigned int y = get_global_id(1) + rowOffset;
-    RandomNumbers r;
-    r.rns = randomNumbers + (get_global_id(1) * inputColumns + get_global_id(0))*numRandomNumbersPerPixel;
 
-    CLPixelType p = inputImage[y*inputColumns+x];
-    CLPixelType q = filteredImage[y*inputColumns+x];
+// Pre: a<M, e>=0
+// Post: r=(a^b) mod M
+// This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
+// most architectures
+ulong MWC_PowMod64(ulong a, ulong e, ulong M)
+{
+       ulong sqr=a, acc=1;
+       while(e!=0){
+               if(e&1)
+                       acc=MWC_MulMod64(acc,sqr,M);
+               sqr=MWC_MulMod64(sqr,sqr,M);
+               e=e>>1;
+       }
+       return acc;
+}
 
-    if ((channel&RedChannel)!=0) {
-      setRed(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getRed(p),noise_type,attenuate)));
-    }
-    
-    if ((channel&GreenChannel)!=0) {
-      setGreen(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getGreen(p),noise_type,attenuate)));
-    }
+uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
+{
+       ulong m=MWC_PowMod64(A, distance, M);
+       ulong x=curr.x*(ulong)A+curr.y;
+       x=MWC_MulMod64(x, m, M);
+       return (uint2)((uint)(x/A), (uint)(x%A));
+}
 
-    if ((channel&BlueChannel)!=0) {
-      setBlue(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getBlue(p),noise_type,attenuate)));
-    }
+uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
+{
+       // This is an arbitrary constant for starting LCG jumping from. I didn't
+       // want to start from 1, as then you end up with the two or three first values
+       // being a bit poor in ones - once you've decided that, one constant is as
+       // good as any another. There is no deep mathematical reason for it, I just
+       // generated a random number.
+       enum{ MWC_BASEID = 4077358422479273989UL };
+       
+       ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
+       ulong m=MWC_PowMod64(A, dist, M);
+       
+       ulong x=MWC_MulMod64(MWC_BASEID, m, M);
+       return (uint2)((uint)(x/A), (uint)(x%A));
+}\r
+\r
+//! Represents the state of a particular generator\r
+typedef struct{ uint x; uint c; } mwc64x_state_t;
+
+enum{ MWC64X_A = 4294883355U };
+enum{ MWC64X_M = 18446383549859758079UL };
+
+void MWC64X_Step(mwc64x_state_t *s)
+{
+       uint X=s->x, C=s->c;
+       
+       uint Xn=MWC64X_A*X+C;
+       uint carry=(uint)(Xn<C);                                // The (Xn<C) will be zero or one for scalar
+       uint Cn=mad_hi(MWC64X_A,X,carry);  
+       
+       s->x=Xn;
+       s->c=Cn;
+}
 
-    if ((channel & OpacityChannel) != 0) {
-      setOpacity(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getOpacity(p),noise_type,attenuate)));
-    }
+void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
+{
+       uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
+       s->x=tmp.x;
+       s->c=tmp.y;
+}
 
-    filteredImage[y*inputColumns+x] = q;
-  }
+void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
+{
+       uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
+       s->x=tmp.x;
+       s->c=tmp.y;
+}
 
+//! Return a 32-bit integer in the range [0..2^32)
+uint MWC64X_NextUint(mwc64x_state_t *s)
+{
+       uint res=s->x ^ s->c;
+       MWC64X_Step(s);
+       return res;
+}\r
+\r
+//\r
+// End of MWC64X excerpt\r
+//\r
+\r
+\r
+  typedef enum\r
+  {\r
+    UndefinedNoise,\r
+    UniformNoise,\r
+    GaussianNoise,\r
+    MultiplicativeGaussianNoise,\r
+    ImpulseNoise,\r
+    LaplacianNoise,\r
+    PoissonNoise,\r
+    RandomNoise\r
+  } NoiseType;\r
+\r
+\r
+  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {\r
+       return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff);     // normalized to 1.0\r
+  }\r
+\r
+  \r
+  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {\r
\r
+    float \r
+      alpha,\r
+      beta,\r
+      noise,\r
+      sigma;\r
+\r
+    noise = 0.0f;\r
+    alpha=mwcReadPseudoRandomValue(r);\r
+    switch(noise_type) {\r
+    case UniformNoise:\r
+    default:\r
+      {\r
+        noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));\r
+        break;\r
+      }\r
+    case GaussianNoise:\r
+      {\r
+        float\r
+          gamma,\r
+          tau;\r
+\r
+        if (alpha == 0.0f)\r
+          alpha=1.0f;\r
+        beta=mwcReadPseudoRandomValue(r);\r
+        gamma=sqrt(-2.0f*log(alpha));\r
+        sigma=gamma*cospi((2.0f*beta));\r
+        tau=gamma*sinpi((2.0f*beta));\r
+        noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+\r
+                      QuantumRange*TauGaussian*tau);        \r
+        break;\r
+      }\r
+\r
+\r
+    case ImpulseNoise:\r
+    {\r
+      if (alpha < (SigmaImpulse/2.0f))\r
+        noise=0.0f;\r
+      else\r
+        if (alpha >= (1.0f-(SigmaImpulse/2.0f)))\r
+          noise=(float)QuantumRange;\r
+        else\r
+          noise=(float)pixel;\r
+      break;\r
+    }\r
+    case LaplacianNoise:\r
+    {\r
+      if (alpha <= 0.5f)\r
+        {\r
+          if (alpha <= MagickEpsilon)\r
+            noise=(float) (pixel-QuantumRange);\r
+          else\r
+            noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+\r
+              0.5f);\r
+          break;\r
+        }\r
+      beta=1.0f-alpha;\r
+      if (beta <= (0.5f*MagickEpsilon))\r
+        noise=(float) (pixel+QuantumRange);\r
+      else\r
+        noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);\r
+      break;\r
+    }\r
+    case MultiplicativeGaussianNoise:\r
+    {\r
+      sigma=1.0f;\r
+      if (alpha > MagickEpsilon)\r
+        sigma=sqrt(-2.0f*log(alpha));\r
+      beta=mwcReadPseudoRandomValue(r);\r
+      noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*\r
+        cospi((float) (2.0f*beta))/2.0f);\r
+      break;\r
+    }\r
+    case PoissonNoise:\r
+    {\r
+      float \r
+        poisson;\r
+      unsigned int i;\r
+      poisson=exp(-SigmaPoisson*QuantumScale*pixel);\r
+      for (i=0; alpha > poisson; i++)\r
+      {\r
+        beta=mwcReadPseudoRandomValue(r);\r
+        alpha*=beta;\r
+      }\r
+      noise=(float) (QuantumRange*i/SigmaPoisson);\r
+      break;\r
+    }\r
+    case RandomNoise:\r
+    {\r
+      noise=(float) (QuantumRange*SigmaRandom*alpha);\r
+      break;\r
+    }\r
+\r
+    };\r
+    return noise;\r
+  }\r
+\r
+\r
+\r
+\r
+\r
+  __kernel\r
+  void GenerateNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage\r
+                    ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem\r
+                    ,const ChannelType channel \r
+                    ,const NoiseType noise_type, const float attenuate\r
+                    ,const unsigned int seed0, const unsigned int seed1\r
+                                       ,const unsigned int numRandomNumbersPerPixel) {\r
+\r
+       mwc64x_state_t rng;
+       rng.x = seed0;
+       rng.c = seed1;\r
+\r
+       uint span = pixelsPerWorkItem * numRandomNumbersPerPixel;       // length of RNG substream each workitem will use\r
+       uint offset = span * get_local_size(0) * get_group_id(0);       // offset of this workgroup's RNG substream (in master stream);\r
+\r
+       MWC64X_SeedStreams(&rng, offset, span);                                         // Seed the RNG streams\r
+\r
+       uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0);   // pixel to process\r
+\r
+       uint count = pixelsPerWorkItem;\r
+\r
+       while (count > 0) {\r
+               if (pos < inputPixelCount) {\r
+                       CLPixelType p = inputImage[pos];\r
+\r
+                       if ((channel&RedChannel)!=0) {
+                         setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
+                       }
+    
+                       if ((channel&GreenChannel)!=0) {
+                         setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
+                       }
+
+                       if ((channel&BlueChannel)!=0) {
+                         setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
+                       }
+
+                       if ((channel & OpacityChannel) != 0) {
+                         setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
+                       }\r
+\r
+                       filteredImage[pos] = p;\r
+                       //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);\r
+               }\r
+               pos += get_local_size(0);\r
+               --count;\r
+       }\r
+  }\r
   )
 
-  STRINGIFY(
-  __kernel 
-  void RandomImage(__global CLPixelType* inputImage,
-                   const uint imageColumns, const uint imageRows,
-                   __global uint* seeds,
-                   const float randNormNumerator,
-                   const uint randNormDenominator) {
-
-    unsigned int numGenerators = get_global_size(0);
-    unsigned numRandPixelsPerWorkItem = ((imageColumns*imageRows) + (numGenerators-1))
-                                        / numGenerators;
-
-    uint4 s;
-    s.x = seeds[get_global_id(0)*4];
-    s.y = seeds[get_global_id(0)*4+1];
-    s.z = seeds[get_global_id(0)*4+2];
-    s.w = seeds[get_global_id(0)*4+3];
-
-    unsigned int offset = get_group_id(0) * get_local_size(0) * numRandPixelsPerWorkItem;
-    for (unsigned int n = 0; n < numRandPixelsPerWorkItem; n++)
-    {
-      int i = offset + n*get_local_size(0) + get_local_id(0);
-      if (i >= imageColumns*imageRows)
-        break;
-
-      float rand = GetPseudoRandomValue(&s,randNormNumerator/randNormDenominator);
-      CLQuantum v = ClampToQuantum(QuantumRange*rand);
-
-      CLPixelType p;
-      setRed(&p,v);
-      setGreen(&p,v);
-      setBlue(&p,v);
-      setOpacity(&p,0);
-
-      inputImage[i] = p;
-    }
-
-    seeds[get_global_id(0)*4]   = s.x;
-    seeds[get_global_id(0)*4+1] = s.y;
-    seeds[get_global_id(0)*4+2] = s.z;
-    seeds[get_global_id(0)*4+3] = s.w;
-  }
-  )
 
   STRINGIFY(
     __kernel 
index 246386278874e7a76af1d2bcc6b90f75e767d60b..2d4ac35870d876a1542ad484869c367844e6abd5 100644 (file)
@@ -1535,6 +1535,17 @@ cleanup:
   return filteredImage;
 }
 
+static Image *ComputeUnsharpMaskImageSingle(const Image *image,
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception);
+
+static Image* ComputeBlurImageSingle(const Image* image,
+  const ChannelType channel,const double radius,const double sigma,
+  ExceptionInfo *exception)
+{
+  return ComputeUnsharpMaskImageSingle(image, channel, radius, sigma, 0.0, 0.0, 1, exception);
+}
+
 MagickExport Image* AccelerateBlurImage(const Image *image,
   const ChannelType channel,const double radius,const double sigma,
   ExceptionInfo *exception)
@@ -1549,7 +1560,9 @@ MagickExport Image* AccelerateBlurImage(const Image *image,
       (checkAccelerateCondition(image, channel) == MagickFalse))
     return NULL;
 
-  if (splitImage(image) && (image->rows / 2 > radius)) 
+  if (radius < 12.1)
+       filteredImage=ComputeBlurImageSingle(image, channel, radius, sigma, exception);
+  else if (splitImage(image) && (image->rows / 2 > radius)) 
     filteredImage=ComputeBlurImageSection(image, channel, radius, sigma, exception);
   else
     filteredImage=ComputeBlurImage(image, channel, radius, sigma, exception);
@@ -2688,284 +2701,365 @@ cleanup:
   return filteredImage;
 }
 
-MagickExport Image *AccelerateUnsharpMaskImage(const Image *image,
+static Image *ComputeUnsharpMaskImageSingle(const Image *image,
   const ChannelType channel,const double radius,const double sigma,
-  const double gain,const double threshold,ExceptionInfo *exception)
+  const double gain,const double threshold,int blurOnly, ExceptionInfo *exception)
 {
-  Image
-    *filteredImage;
+  CacheView
+    *filteredImage_view,
+    *image_view;
 
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
+  char
+    geometry[MaxTextExtent];
 
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse))
-    return NULL;
+  cl_command_queue
+    queue;
 
-  if (splitImage(image) && (image->rows / 2 > radius)) 
-    filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception);
-  else
-    filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception);
-  return(filteredImage);
-}
+  cl_context
+    context;
 
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%   A c c e l e r a t e R e s i z e I m a g e                                 %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  AccelerateResizeImage() is an OpenCL implementation of ResizeImage()
-%
-%  AccelerateResizeImage() scales an image to the desired dimensions, using the given
-%  filter (see AcquireFilterInfo()).
-%
-%  If an undefined filter is given the filter defaults to Mitchell for a
-%  colormapped image, a image with a matte channel, or if the image is
-%  enlarged.  Otherwise the filter defaults to a Lanczos.
-%
-%  AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program.
-%
-%  The format of the AccelerateResizeImage method is:
-%
-%      Image *ResizeImage(Image *image,const size_t columns,
-%        const size_t rows, const ResizeFilter* filter,
-%        ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o columns: the number of columns in the scaled image.
-%
-%    o rows: the number of rows in the scaled image.
-%
-%    o filter: Image filter to use.
-%
-%    o exception: return any errors or warnings in this structure.
-%
-*/
+  cl_int
+    justBlur,
+    clStatus;
 
-static MagickBooleanType resizeHorizontalFilter(cl_mem image,
-  const unsigned int imageColumns,const unsigned int imageRows,
-  const unsigned int matte,cl_mem resizedImage,
-  const unsigned int resizedColumns,const unsigned int resizedRows,
-  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
-  const float xFactor,MagickCLEnv clEnv,cl_command_queue queue,
-  ExceptionInfo *exception)
-{
   cl_kernel
-    horizontalKernel;
+    unsharpMaskKernel;
 
-  cl_int clStatus;
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer,
+    imageKernelBuffer;
 
-  const unsigned int
-    workgroupSize = 256;
+  cl_mem_flags
+    mem_flags;
+
+  const void
+    *inputPixels;
 
   float
-    resizeFilterScale,
-    resizeFilterSupport,
-    resizeFilterWindowSupport,
-    resizeFilterBlur,
-    scale,
-    support;
+    fGain,
+    fThreshold,
+    *kernelBufferPtr;
 
-  int
-    cacheRangeStart,
-    cacheRangeEnd,
-    numCachedPixels,
-    resizeFilterType,
-    resizeWindowType;
+  Image
+    *filteredImage;
+
+  KernelInfo
+    *kernel;
 
   MagickBooleanType
-    status = MagickFalse;
+    outputReady;
 
-  size_t
-    deviceLocalMemorySize,
-    gammaAccumulatorLocalMemorySize,
-    global_work_size[2],
-    imageCacheLocalMemorySize,
-    pixelAccumulatorLocalMemorySize,
-    local_work_size[2],
-    totalLocalMemorySize,
-    weightAccumulatorLocalMemorySize;
+  MagickCLEnv
+    clEnv;
 
-  unsigned int
-    chunkSize,
-    i,
-    pixelPerWorkgroup;
+  MagickSizeType
+    length;
 
-  horizontalKernel = NULL;
-  status = MagickFalse;
+  void
+    *filteredPixels,
+    *hostPtr;
 
-  /*
-  Apply filter to resize vertically from image to resize image.
-  */
-  scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0);
-  support=scale*GetResizeFilterSupport(resizeFilter);
-  if (support < 0.5)
-  {
-    /*
-    Support too small even for nearest neighbour: Reduce to point
-    sampling.
-    */
-    support=(MagickRealType) 0.5;
-    scale=1.0;
-  }
-  scale=PerceptibleReciprocal(scale);
+  unsigned int
+    i,
+    imageColumns,
+    imageRows,
+    kernelWidth;
 
-  if (resizedColumns < workgroupSize) 
-  {
-    chunkSize = 32;
-    pixelPerWorkgroup = 32;
-  }
-  else
-  {
-    chunkSize = workgroupSize;
-    pixelPerWorkgroup = workgroupSize;
-  }
+  clEnv = NULL;
+  filteredImage = NULL;
+  filteredImage_view = NULL;
+  kernel = NULL;
+  context = NULL;
+  imageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  unsharpMaskKernel = NULL;
+  queue = NULL;
+  outputReady = MagickFalse;
 
-  /* get the local memory size supported by the device */
-  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
 
-DisableMSCWarning(4127)
-  while(1)
-RestoreMSCWarning
+  /* Create and initialize OpenCL buffers. */
   {
-    /* calculate the local memory size needed per workgroup */
-    cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5);
-    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5);
-    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
-    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
-    totalLocalMemorySize = imageCacheLocalMemorySize;
-
-    /* local size for the pixel accumulator */
-    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
-    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
-
-    /* local memory size for the weight accumulator */
-    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
-
-    /* local memory size for the gamma accumulator */
-    if (matte == 0)
-      gammaAccumulatorLocalMemorySize = sizeof(float);
-    else
-      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
-    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
+    image_view=AcquireVirtualCacheView(image,exception);
+    inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
+      goto cleanup;
+    }
 
-    if (totalLocalMemorySize <= deviceLocalMemorySize)
-      break;
-    else
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
     {
-      pixelPerWorkgroup = pixelPerWorkgroup/2;
-      chunkSize = chunkSize/2;
-      if (pixelPerWorkgroup == 0
-          || chunkSize == 0)
-      {
-        /* quit, fallback to CPU */
-        goto cleanup;
-      }
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
     }
   }
 
-  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
-  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
-
-
-  if (resizeFilterType == SincFastWeightingFunction
-    && resizeWindowType == SincFastWeightingFunction)
-  {
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc");
-  }
-  else
+  /* create output */
   {
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter");
-  }
-  if (horizontalKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
+    filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
+    assert(filteredImage != NULL);
+    if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
+    filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+
+    /* create a CL buffer from image pixel buffer */
+    length = image->columns * image->rows;
+    filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
   }
 
-  i = 0;
-  clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
+  /* create the blur kernel */
+  {
+    (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
 
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
+    imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
 
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
 
-  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
+    kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
 
-  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
+  {
+    /* get the opencl kernel */
+    {
+      unsharpMaskKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMask");
+      if (unsharpMaskKernel == NULL)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
 
-  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
+    {
+      imageColumns = image->columns;
+      imageRows = image->rows;
+      kernelWidth = kernel->width;
+      fGain = (float)gain;
+      fThreshold = (float)threshold;
+      justBlur = blurOnly;
 
-  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
+      /* set the kernel arguments */
+      i = 0;
+      clStatus=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_float4)*(8 * (32 + kernel->width)),(void *)NULL);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fGain);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fThreshold);
+      clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_uint),(void *)&justBlur);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
 
+    /* launch the kernel */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
 
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize);
-  
+      gsize[0] = ((image->columns + 7) / 8) * 8;
+      gsize[1] = ((image->rows + 31) / 32) * 32;
+      wsize[0] = 8;
+      wsize[1] = 32;
 
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
-  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
+      clStatus = clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clEnv->library->clFlush(queue);
+    }
+  }
 
-  if (clStatus != CL_SUCCESS)
+  /* get result */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
+    length = image->columns * image->rows;
+    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = image->columns * image->rows;
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
   }
-
-  global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
-  global_work_size[1] = resizedRows;
-
-  local_work_size[0] = workgroupSize;
-  local_work_size[1] = 1;
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
   if (clStatus != CL_SUCCESS)
   {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
     goto cleanup;
   }
-  clEnv->library->clFlush(queue);
-  status = MagickTrue;
 
+  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
-  if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel);
+  image_view=DestroyCacheView(image_view);
+  if (filteredImage_view != NULL)
+    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  return(status);
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  if (filteredImageBuffer!=NULL)              clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (imageKernelBuffer!=NULL)                clEnv->library->clReleaseMemObject(imageKernelBuffer);
+  if (unsharpMaskKernel!=NULL)                RelinquishOpenCLKernel(clEnv, unsharpMaskKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return(filteredImage);
 }
 
-static MagickBooleanType resizeVerticalFilter(cl_mem image,
+
+MagickExport Image *AccelerateUnsharpMaskImage(const Image *image,
+  const ChannelType channel,const double radius,const double sigma,
+  const double gain,const double threshold,ExceptionInfo *exception)
+{
+  Image
+    *filteredImage;
+
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+      (checkAccelerateCondition(image, channel) == MagickFalse))
+    return NULL;
+
+  if (radius < 12.1)
+    filteredImage = ComputeUnsharpMaskImageSingle(image,channel,radius,sigma,gain,threshold, 0, exception);
+  else if (splitImage(image) && (image->rows / 2 > radius)) 
+    filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception);
+  else
+    filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception);
+  return(filteredImage);
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%   A c c e l e r a t e R e s i z e I m a g e                                 %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  AccelerateResizeImage() is an OpenCL implementation of ResizeImage()
+%
+%  AccelerateResizeImage() scales an image to the desired dimensions, using the given
+%  filter (see AcquireFilterInfo()).
+%
+%  If an undefined filter is given the filter defaults to Mitchell for a
+%  colormapped image, a image with a matte channel, or if the image is
+%  enlarged.  Otherwise the filter defaults to a Lanczos.
+%
+%  AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program.
+%
+%  The format of the AccelerateResizeImage method is:
+%
+%      Image *ResizeImage(Image *image,const size_t columns,
+%        const size_t rows, const ResizeFilter* filter,
+%        ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o columns: the number of columns in the scaled image.
+%
+%    o rows: the number of rows in the scaled image.
+%
+%    o filter: Image filter to use.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+static MagickBooleanType resizeHorizontalFilter(cl_mem image,
   const unsigned int imageColumns,const unsigned int imageRows,
   const unsigned int matte,cl_mem resizedImage,
   const unsigned int resizedColumns,const unsigned int resizedRows,
   const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
-  const float yFactor,MagickCLEnv clEnv,cl_command_queue queue,
+  const float xFactor,MagickCLEnv clEnv,cl_command_queue queue,
   ExceptionInfo *exception)
 {
   cl_kernel
@@ -3015,7 +3109,7 @@ static MagickBooleanType resizeVerticalFilter(cl_mem image,
   /*
   Apply filter to resize vertically from image to resize image.
   */
-  scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0);
+  scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0);
   support=scale*GetResizeFilterSupport(resizeFilter);
   if (support < 0.5)
   {
@@ -3028,7 +3122,7 @@ static MagickBooleanType resizeVerticalFilter(cl_mem image,
   }
   scale=PerceptibleReciprocal(scale);
 
-  if (resizedRows < workgroupSize) 
+  if (resizedColumns < workgroupSize) 
   {
     chunkSize = 32;
     pixelPerWorkgroup = 32;
@@ -3047,8 +3141,8 @@ DisableMSCWarning(4127)
 RestoreMSCWarning
   {
     /* calculate the local memory size needed per workgroup */
-    cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5);
-    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5);
+    cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5);
+    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5);
     numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
     imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
     totalLocalMemorySize = imageCacheLocalMemorySize;
@@ -3086,13 +3180,222 @@ RestoreMSCWarning
   resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
   resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
 
+
   if (resizeFilterType == SincFastWeightingFunction
     && resizeWindowType == SincFastWeightingFunction)
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc");
-  else 
-    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter");
-
-  if (horizontalKernel == NULL)
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc");
+  }
+  else
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter");
+  }
+  if (horizontalKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  i = 0;
+  clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
+
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
+
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
+
+  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
+
+  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
+
+  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
+
+  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
+
+
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize);
+  
+
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
+  clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
+  global_work_size[1] = resizedRows;
+
+  local_work_size[0] = workgroupSize;
+  local_work_size[1] = 1;
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clEnv->library->clFlush(queue);
+  status = MagickTrue;
+
+
+cleanup:
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+
+  if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel);
+
+  return(status);
+}
+
+static MagickBooleanType resizeVerticalFilter(cl_mem image,
+  const unsigned int imageColumns,const unsigned int imageRows,
+  const unsigned int matte,cl_mem resizedImage,
+  const unsigned int resizedColumns,const unsigned int resizedRows,
+  const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients,
+  const float yFactor,MagickCLEnv clEnv,cl_command_queue queue,
+  ExceptionInfo *exception)
+{
+  cl_kernel
+    horizontalKernel;
+
+  cl_int clStatus;
+
+  const unsigned int
+    workgroupSize = 256;
+
+  float
+    resizeFilterScale,
+    resizeFilterSupport,
+    resizeFilterWindowSupport,
+    resizeFilterBlur,
+    scale,
+    support;
+
+  int
+    cacheRangeStart,
+    cacheRangeEnd,
+    numCachedPixels,
+    resizeFilterType,
+    resizeWindowType;
+
+  MagickBooleanType
+    status = MagickFalse;
+
+  size_t
+    deviceLocalMemorySize,
+    gammaAccumulatorLocalMemorySize,
+    global_work_size[2],
+    imageCacheLocalMemorySize,
+    pixelAccumulatorLocalMemorySize,
+    local_work_size[2],
+    totalLocalMemorySize,
+    weightAccumulatorLocalMemorySize;
+
+  unsigned int
+    chunkSize,
+    i,
+    pixelPerWorkgroup;
+
+  horizontalKernel = NULL;
+  status = MagickFalse;
+
+  /*
+  Apply filter to resize vertically from image to resize image.
+  */
+  scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0);
+  support=scale*GetResizeFilterSupport(resizeFilter);
+  if (support < 0.5)
+  {
+    /*
+    Support too small even for nearest neighbour: Reduce to point
+    sampling.
+    */
+    support=(MagickRealType) 0.5;
+    scale=1.0;
+  }
+  scale=PerceptibleReciprocal(scale);
+
+  if (resizedRows < workgroupSize) 
+  {
+    chunkSize = 32;
+    pixelPerWorkgroup = 32;
+  }
+  else
+  {
+    chunkSize = workgroupSize;
+    pixelPerWorkgroup = workgroupSize;
+  }
+
+  /* get the local memory size supported by the device */
+  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+
+DisableMSCWarning(4127)
+  while(1)
+RestoreMSCWarning
+  {
+    /* calculate the local memory size needed per workgroup */
+    cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5);
+    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5);
+    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
+    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
+    totalLocalMemorySize = imageCacheLocalMemorySize;
+
+    /* local size for the pixel accumulator */
+    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
+    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
+
+    /* local memory size for the weight accumulator */
+    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
+
+    /* local memory size for the gamma accumulator */
+    if (matte == 0)
+      gammaAccumulatorLocalMemorySize = sizeof(float);
+    else
+      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
+
+    if (totalLocalMemorySize <= deviceLocalMemorySize)
+      break;
+    else
+    {
+      pixelPerWorkgroup = pixelPerWorkgroup/2;
+      chunkSize = chunkSize/2;
+      if (pixelPerWorkgroup == 0
+          || chunkSize == 0)
+      {
+        /* quit, fallback to CPU */
+        goto cleanup;
+      }
+    }
+  }
+
+  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
+  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
+
+  if (resizeFilterType == SincFastWeightingFunction
+    && resizeWindowType == SincFastWeightingFunction)
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc");
+  else 
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter");
+
+  if (horizontalKernel == NULL)
   {
     (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
     goto cleanup;
@@ -5343,468 +5646,62 @@ MagickExport MagickBooleanType ComputeContrastStretchImageChannel(Image *image,
   }
   else 
   {
-    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
-    hostPtr = stretch_map;
-  }
-  /* create a CL buffer for stretch_map  */
-  length = (MaxMap+1); 
-  stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  /* get the OpenCL kernel */
-  stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Stretch");
-  if (stretchKernel == NULL)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* set the kernel arguments */
-  i = 0;
-  clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white);
-  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  /* launch the kernel */
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
-
-  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  clEnv->library->clFlush(queue);
-
-  /* read the data back */
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
-  }
-  else 
-  {
-    length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
-  }
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
-
-cleanup:
-  OpenCLLogException(__FUNCTION__,__LINE__,exception);
-
-  image_view=DestroyCacheView(image_view);
-
-  if (imageBuffer!=NULL)                     
-    clEnv->library->clReleaseMemObject(imageBuffer);
-
-  if (stretchMapBuffer!=NULL)
-    clEnv->library->clReleaseMemObject(stretchMapBuffer);
-  if (stretch_map!=NULL)
-    stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map);
-
-
-  if (histogramBuffer!=NULL)
-    clEnv->library->clReleaseMemObject(histogramBuffer);
-  if (histogram!=NULL)
-    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
-
-
-  if (histogramKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, histogramKernel);
-  if (stretchKernel!=NULL)                     
-    RelinquishOpenCLKernel(clEnv, stretchKernel);
-
-  if (queue != NULL)                          
-    RelinquishOpenCLCommandQueue(clEnv, queue);
-
-  return(outputReady);
-}
-
-MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
-  Image *image,const ChannelType channel,const double black_point,
-  const double white_point,ExceptionInfo *exception)
-{
-  MagickBooleanType
-    status;
-
-  assert(image != NULL);
-  assert(exception != (ExceptionInfo *) NULL);
-
-  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, channel) == MagickFalse) ||
-      (checkHistogramCondition(image, channel) == MagickFalse))
-    return(MagickFalse);
-
-  status=ComputeContrastStretchImageChannel(image,channel, black_point, white_point, exception);
-  return(status);
-}
-
-/*
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%     D e s p e c k l e I m a g e  w i t h  O p e n C L                       %
-%                                                                             %
-%                                                                             %
-%                                                                             %
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-%  DespeckleImage() reduces the speckle noise in an image while perserving the
-%  edges of the original image.  A speckle removing filter uses a complementary 
-%  hulling technique (raising pixels that are darker than their surrounding
-%  neighbors, then complementarily lowering pixels that are brighter than their
-%  surrounding neighbors) to reduce the speckle index of that image (reference
-%  Crimmins speckle removal).
-%
-%  The format of the DespeckleImage method is:
-%
-%      Image *DespeckleImage(const Image *image,ExceptionInfo *exception)
-%
-%  A description of each parameter follows:
-%
-%    o image: the image.
-%
-%    o exception: return any errors or warnings in this structure.
-%
-*/
-
-static Image *ComputeDespeckleImage(const Image *image,
-  ExceptionInfo*exception)
-{
-  static const int 
-    X[4] = {0, 1, 1,-1},
-    Y[4] = {1, 0, 1, 1};
-
-  CacheView
-    *filteredImage_view,
-    *image_view;
-
-  cl_command_queue
-    queue;
-
-  cl_context
-    context;
-
-  cl_int
-    clStatus;
-
-  cl_kernel
-    hullPass1,
-    hullPass2;
-
-  cl_mem_flags
-    mem_flags;
-
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    tempImageBuffer[2];
-
-  const void
-    *inputPixels;
-
-  Image
-    *filteredImage;
-
-  int
-    k,
-    matte;
-
-  MagickBooleanType
-    outputReady;
-
-  MagickCLEnv
-    clEnv;
-
-  MagickSizeType
-    length;
-
-  size_t
-    global_work_size[2];
-
-  unsigned int
-    imageHeight,
-    imageWidth;
-
-  void
-    *filteredPixels,
-    *hostPtr;
-
-  outputReady = MagickFalse;
-  clEnv = NULL;
-  inputPixels = NULL;
-  filteredImage = NULL;
-  filteredImage_view = NULL;
-  filteredPixels = NULL;
-  context = NULL;
-  imageBuffer = NULL;
-  filteredImageBuffer = NULL;
-  hullPass1 = NULL;
-  hullPass2 = NULL;
-  queue = NULL;
-  tempImageBuffer[0] = tempImageBuffer[1] = NULL;
-  clEnv = GetDefaultOpenCLEnv();
-  context = GetOpenCLContext(clEnv);
-  queue = AcquireOpenCLCommandQueue(clEnv);
-  image_view=AcquireVirtualCacheView(image,exception);
-  inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception);
-  if (inputPixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename);
-    goto cleanup;
-  }
-
-  if (ALIGNED(inputPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  mem_flags = CL_MEM_READ_WRITE;
-  length = image->columns * image->rows;
-  for (k = 0; k < 2; k++)
-  {
-    tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
-  }
-
-  filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
-  assert(filteredImage != NULL);
-  if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
-    goto cleanup;
-  }
-  filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception);
-  filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception);
-  if (filteredPixels == (void *) NULL)
-  {
-    (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
-    goto cleanup;
-  }
-
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
-  {
-    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
-    hostPtr = filteredPixels;
-  }
-  else 
-  {
-    mem_flags = CL_MEM_WRITE_ONLY;
-    hostPtr = NULL;
-  }
-  /* create a CL buffer from image pixel buffer */
-  length = image->columns * image->rows;
-  filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-    goto cleanup;
-  }
-
-  hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1");
-  hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2");
-
-  clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer);
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1));
-  imageWidth = image->columns;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth);
-  imageHeight = image->rows;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight);
-  matte = (image->alpha_trait==BlendPixelTrait)?0:1;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-  clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1));
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer);
-  imageWidth = image->columns;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth);
-  imageHeight = image->rows;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight);
-  matte = (image->alpha_trait==BlendPixelTrait)?0:1;
-  clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte);
-  if (clStatus != CL_SUCCESS)
-  {
-    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-    goto cleanup;
-  }
-
-
-  global_work_size[0] = image->columns;
-  global_work_size[1] = image->rows;
-
-  
-  for (k = 0; k < 4; k++)
-  {
-    cl_int2 offset;
-    int polarity;
-
-    
-    offset.s[0] = X[k];
-    offset.s[1] = Y[k];
-    polarity = 1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-
-
-    if (k == 0)
-      clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer));
-    offset.s[0] = -X[k];
-    offset.s[1] = -Y[k];
-    polarity = 1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-
-    offset.s[0] = -X[k];
-    offset.s[1] = -Y[k];
-    polarity = -1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = stretch_map;
+  }
+  /* create a CL buffer for stretch_map  */
+  length = (MaxMap+1); 
+  stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+    goto cleanup;
+  }
 
-    offset.s[0] = X[k];
-    offset.s[1] = Y[k];
-    polarity = -1;
-    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
-    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+  /* get the OpenCL kernel */
+  stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Stretch");
+  if (stretchKernel == NULL)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-    if (k == 3)
-      clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer);
+  /* set the kernel arguments */
+  i = 0;
+  clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white);
+  clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
-      goto cleanup;
-    }
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
-    /* launch the kernel */
-    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
-      goto cleanup;
-    }  
+  /* launch the kernel */
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
+
+  clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
   }
+  clEnv->library->clFlush(queue);
 
-  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  /* read the data back */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
   {
     length = image->columns * image->rows;
-    clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+    clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
   }
   else 
   {
     length = image->columns * image->rows;
-    clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+    clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
   }
   if (clStatus != CL_SUCCESS)
   {
@@ -5812,50 +5709,95 @@ static Image *ComputeDespeckleImage(const Image *image,
     goto cleanup;
   }
 
-  outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception);
+  outputReady=SyncCacheViewAuthenticPixels(image_view,exception);
 
 cleanup:
   OpenCLLogException(__FUNCTION__,__LINE__,exception);
 
   image_view=DestroyCacheView(image_view);
-  if (filteredImage_view != NULL)
-    filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
-  for (k = 0; k < 2; k++)
-  {
-    if (tempImageBuffer[k]!=NULL)            clEnv->library->clReleaseMemObject(tempImageBuffer[k]);
-  }
-  if (filteredImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (hullPass1!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass1);
-  if (hullPass2!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass2);
-  if (outputReady == MagickFalse && filteredImage != NULL)
-    filteredImage=DestroyImage(filteredImage);
-  return(filteredImage);
+  if (imageBuffer!=NULL)                     
+    clEnv->library->clReleaseMemObject(imageBuffer);
+
+  if (stretchMapBuffer!=NULL)
+    clEnv->library->clReleaseMemObject(stretchMapBuffer);
+  if (stretch_map!=NULL)
+    stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map);
+
+
+  if (histogramBuffer!=NULL)
+    clEnv->library->clReleaseMemObject(histogramBuffer);
+  if (histogram!=NULL)
+    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
+
+
+  if (histogramKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, histogramKernel);
+  if (stretchKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, stretchKernel);
+
+  if (queue != NULL)                          
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  return(outputReady);
 }
 
-MagickExport Image *AccelerateDespeckleImage(const Image* image,
-  ExceptionInfo* exception)
+MagickExport MagickBooleanType AccelerateContrastStretchImageChannel(
+  Image *image,const ChannelType channel,const double black_point,
+  const double white_point,ExceptionInfo *exception)
 {
-  Image
-    *filteredImage;
+  MagickBooleanType
+    status;
 
   assert(image != NULL);
   assert(exception != (ExceptionInfo *) NULL);
 
   if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
-      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
-    return NULL;
+      (checkAccelerateCondition(image, channel) == MagickFalse) ||
+      (checkHistogramCondition(image, channel) == MagickFalse))
+    return(MagickFalse);
 
-  filteredImage=ComputeDespeckleImage(image,exception);
-  return(filteredImage);
+  status=ComputeContrastStretchImageChannel(image,channel, black_point, white_point, exception);
+  return(status);
 }
 
-static Image *ComputeAddNoiseImage(const Image *image,
-  const ChannelType channel,const NoiseType noise_type,
-  ExceptionInfo *exception)
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     D e s p e c k l e I m a g e  w i t h  O p e n C L                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  DespeckleImage() reduces the speckle noise in an image while perserving the
+%  edges of the original image.  A speckle removing filter uses a complementary 
+%  hulling technique (raising pixels that are darker than their surrounding
+%  neighbors, then complementarily lowering pixels that are brighter than their
+%  surrounding neighbors) to reduce the speckle index of that image (reference
+%  Crimmins speckle removal).
+%
+%  The format of the DespeckleImage method is:
+%
+%      Image *DespeckleImage(const Image *image,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+static Image *ComputeDespeckleImage(const Image *image,
+  ExceptionInfo*exception)
 {
+  static const int 
+    X[4] = {0, 1, 1,-1},
+    Y[4] = {1, 0, 1, 1};
+
   CacheView
     *filteredImage_view,
     *image_view;
@@ -5870,7 +5812,8 @@ static Image *ComputeAddNoiseImage(const Image *image,
     clStatus;
 
   cl_kernel
-    addNoiseKernel;
+    hullPass1,
+    hullPass2;
 
   cl_mem_flags
     mem_flags;
@@ -5878,17 +5821,17 @@ static Image *ComputeAddNoiseImage(const Image *image,
   cl_mem
     filteredImageBuffer,
     imageBuffer,
-    randomNumberBuffer;
-
-  const char
-    *option;
+    tempImageBuffer[2];
 
   const void
     *inputPixels;
 
-  float
-    attenuate,
-    *randomNumberBufferPtr;
+  Image
+    *filteredImage;
+
+  int
+    k,
+    matte;
 
   MagickBooleanType
     outputReady;
@@ -5899,31 +5842,12 @@ static Image *ComputeAddNoiseImage(const Image *image,
   MagickSizeType
     length;
 
-  Image
-    *filteredImage;
-
-  int
-    i;
-
-  RandomInfo
-    **restrict random_info;
-
   size_t
     global_work_size[2];
 
   unsigned int
-    inputColumns,
-    inputRows,
-    k,
-    numRandomNumberPerBuffer,
-    numRandomNumberPerPixel,
-    numRowsPerKernelLaunch,
-    r;
-
-#if defined(MAGICKCORE_OPENMP_SUPPORT)
-  unsigned long
-    key;
-#endif
+    imageHeight,
+    imageWidth;
 
   void
     *filteredPixels,
@@ -5935,14 +5859,13 @@ static Image *ComputeAddNoiseImage(const Image *image,
   filteredImage = NULL;
   filteredImage_view = NULL;
   filteredPixels = NULL;
-  randomNumberBufferPtr = NULL;
   context = NULL;
   imageBuffer = NULL;
-  randomNumberBuffer = NULL;
   filteredImageBuffer = NULL;
+  hullPass1 = NULL;
+  hullPass2 = NULL;
   queue = NULL;
-  addNoiseKernel = NULL;
-
+  tempImageBuffer[0] = tempImageBuffer[1] = NULL;
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
   queue = AcquireOpenCLCommandQueue(clEnv);
@@ -5972,6 +5895,17 @@ static Image *ComputeAddNoiseImage(const Image *image,
     goto cleanup;
   }
 
+  mem_flags = CL_MEM_READ_WRITE;
+  length = image->columns * image->rows;
+  for (k = 0; k < 2; k++)
+  {
+    tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
 
   filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception);
   assert(filteredImage != NULL);
@@ -6007,105 +5941,162 @@ static Image *ComputeAddNoiseImage(const Image *image,
     goto cleanup;
   }
 
-  /* find out how many random numbers needed by pixel */
-  numRandomNumberPerPixel = 0;
-  {
-    unsigned int numRandPerChannel = 0;
-    switch (noise_type)
-    {
-    case UniformNoise:
-    case ImpulseNoise:
-    case LaplacianNoise:
-    case RandomNoise:
-    default:
-      numRandPerChannel = 1;
-      break;
-    case GaussianNoise:
-    case MultiplicativeGaussianNoise:
-    case PoissonNoise:
-      numRandPerChannel = 2;
-      break;
-    };
+  hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1");
+  hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2");
 
-    if ((channel & RedChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & GreenChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & BlueChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
-    if ((channel & OpacityChannel) != 0)
-      numRandomNumberPerPixel+=numRandPerChannel;
+  clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer);
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  imageWidth = image->columns;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth);
+  imageHeight = image->rows;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight);
+  matte = (image->alpha_trait==BlendPixelTrait)?0:1;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
   }
 
-  numRowsPerKernelLaunch = 512;
-  /* create a buffer for random numbers */
-  numRandomNumberPerBuffer = (image->columns*numRowsPerKernelLaunch)*numRandomNumberPerPixel;
-  randomNumberBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, numRandomNumberPerBuffer*sizeof(float)
-                                      , NULL, &clStatus);
-
-
-  /* set up the random number generators */
-  attenuate=1.0;
-  option=GetImageArtifact(image,"attenuate");
-  if (option != (char *) NULL)
-    attenuate=StringToDouble(option,(char **) NULL);
-  random_info=AcquireRandomInfoThreadSet();
-#if defined(MAGICKCORE_OPENMP_SUPPORT)
-  key=GetRandomSecretKey(random_info[0]);
-#endif
+  clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer);
+  imageWidth = image->columns;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth);
+  imageHeight = image->rows;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight);
+  matte = (image->alpha_trait==BlendPixelTrait)?0:1;
+  clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
 
-  addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoiseImage");
 
-  k = 0;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-  inputColumns = image->columns;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputColumns);
-  inputRows = image->rows;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputRows);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type);
-  attenuate=1.0f;
-  option=GetImageArtifact(image,"attenuate");
-  if (option != (char *) NULL)
-    attenuate=(float)StringToDouble(option,(char **) NULL);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel);
+  global_work_size[0] = image->columns;
+  global_work_size[1] = image->rows;
 
-  global_work_size[0] = inputColumns;
-  for (r = 0; r < inputRows; r+=numRowsPerKernelLaunch) 
+  
+  for (k = 0; k < 4; k++)
   {
-    /* Generate random numbers in the buffer */
-    randomNumberBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, randomNumberBuffer, CL_TRUE, CL_MAP_WRITE, 0
-      , numRandomNumberPerBuffer*sizeof(float), 0, NULL, NULL, &clStatus);
+    cl_int2 offset;
+    int polarity;
+
+    
+    offset.s[0] = X[k];
+    offset.s[1] = Y[k];
+    polarity = 1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
       goto cleanup;
     }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
 
-#if defined(MAGICKCORE_OPENMP_SUPPORT)
-  #pragma omp parallel for schedule(static,4) \
-    num_threads((key == ~0UL) == 0 ? 1 : (size_t) GetMagickResourceLimit(ThreadResource))
-#endif
-    for (i = 0; i < numRandomNumberPerBuffer; i++)
+
+    if (k == 0)
+      clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer));
+    offset.s[0] = -X[k];
+    offset.s[1] = -Y[k];
+    polarity = 1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+    if (clStatus != CL_SUCCESS)
     {
-      const int id = GetOpenMPThreadId();
-      randomNumberBufferPtr[i] = (float)GetPseudoRandomValue(random_info[id]);
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
     }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
 
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, randomNumberBuffer, randomNumberBufferPtr, 0, NULL, NULL);
+    offset.s[0] = -X[k];
+    offset.s[1] = -Y[k];
+    polarity = -1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
     if (clStatus != CL_SUCCESS)
     {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.",".");
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
       goto cleanup;
     }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+
+    offset.s[0] = X[k];
+    offset.s[1] = Y[k];
+    polarity = -1;
+    clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+    clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+
+    if (k == 3)
+      clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer);
 
-    /* set the row offset */
-    clEnv->library->clSetKernelArg(addNoiseKernel,k,sizeof(unsigned int),(void *)&r);
-    global_work_size[1] = MAGICK_MIN(numRowsPerKernelLaunch, inputRows - r);
-    clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,2,NULL,global_work_size,NULL,0,NULL,NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
+    /* launch the kernel */
+    clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }  
   }
 
   if (ALIGNED(filteredPixels,CLPixelPacket)) 
@@ -6133,20 +6124,40 @@ cleanup:
   if (filteredImage_view != NULL)
     filteredImage_view=DestroyCacheView(filteredImage_view);
 
-  if (queue!=NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
-  if (addNoiseKernel!=NULL)         RelinquishOpenCLKernel(clEnv, addNoiseKernel);
-  if (imageBuffer!=NULL)                   clEnv->library->clReleaseMemObject(imageBuffer);
-  if (randomNumberBuffer!=NULL)     clEnv->library->clReleaseMemObject(randomNumberBuffer);
-  if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (outputReady == MagickFalse && filteredImage != NULL) 
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (imageBuffer!=NULL)                     clEnv->library->clReleaseMemObject(imageBuffer);
+  for (k = 0; k < 2; k++)
+  {
+    if (tempImageBuffer[k]!=NULL)            clEnv->library->clReleaseMemObject(tempImageBuffer[k]);
+  }
+  if (filteredImageBuffer!=NULL)             clEnv->library->clReleaseMemObject(filteredImageBuffer);
+  if (hullPass1!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass1);
+  if (hullPass2!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass2);
+  if (outputReady == MagickFalse && filteredImage != NULL)
     filteredImage=DestroyImage(filteredImage);
+  return(filteredImage);
+}
+
+MagickExport Image *AccelerateDespeckleImage(const Image* image,
+  ExceptionInfo* exception)
+{
+  Image
+    *filteredImage;
+
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  if ((checkOpenCLEnvironment(exception) == MagickFalse) ||
+      (checkAccelerateCondition(image, AllChannels) == MagickFalse))
+    return NULL;
 
+  filteredImage=ComputeDespeckleImage(image,exception);
   return(filteredImage);
 }
 
-static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
+static Image *ComputeAddNoiseImage(const Image *image,
   const ChannelType channel,const NoiseType noise_type,
-  ExceptionInfo *exception) 
+  ExceptionInfo *exception)
 {
   CacheView
     *filteredImage_view,
@@ -6159,21 +6170,24 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
     context;
 
   cl_int
+    inputPixelCount,
+    pixelsPerWorkitem,
     clStatus;
 
-  cl_kernel
-    addNoiseKernel,
-    randomNumberGeneratorKernel;
+  cl_uint
+    seed0,
+    seed1;
 
-  cl_mem
-    filteredImageBuffer,
-    imageBuffer,
-    randomNumberBuffer,
-    randomNumberSeedsBuffer;
+  cl_kernel
+    addNoiseKernel;
 
   cl_mem_flags
     mem_flags;
 
+  cl_mem
+    filteredImageBuffer,
+    imageBuffer;
+
   const char
     *option;
 
@@ -6181,14 +6195,7 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
     *inputPixels;
 
   float
-    attenuate,
-    fNormalize;
-
-  Image
-    *filteredImage;
-
-  int
-    i;
+    attenuate;
 
   MagickBooleanType
     outputReady;
@@ -6199,20 +6206,24 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
   MagickSizeType
     length;
 
+  Image
+    *filteredImage;
+
+  RandomInfo
+    **restrict random_info;
+
   size_t
-    global_work_size[2],
-    random_work_size;
+    global_work_size[1],
+    local_work_size[1];
 
   unsigned int
-    initRandom,
-    inputColumns,
-    inputRows,
     k,
-    numRandomNumberGenerators,
-    numRandomNumberPerBuffer,
-    numRandomNumberPerPixel,
-    numRowsPerKernelLaunch,
-    r;
+    numRandomNumberPerPixel;
+
+#if defined(MAGICKCORE_OPENMP_SUPPORT)
+  unsigned long
+    key;
+#endif
 
   void
     *filteredPixels,
@@ -6226,12 +6237,9 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
   filteredPixels = NULL;
   context = NULL;
   imageBuffer = NULL;
-  randomNumberBuffer = NULL;
   filteredImageBuffer = NULL;
-  randomNumberSeedsBuffer = NULL;
   queue = NULL;
   addNoiseKernel = NULL;
-  randomNumberGeneratorKernel = NULL;
 
   clEnv = GetDefaultOpenCLEnv();
   context = GetOpenCLContext(clEnv);
@@ -6327,72 +6335,44 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
       numRandomNumberPerPixel+=numRandPerChannel;
   }
 
-  numRowsPerKernelLaunch = 512;
+  /* set up the random number generators */
+  attenuate=1.0;
+  option=GetImageArtifact(image,"attenuate");
+  if (option != (char *) NULL)
+    attenuate=StringToDouble(option,(char **) NULL);
+  random_info=AcquireRandomInfoThreadSet();
+#if defined(MAGICKCORE_OPENMP_SUPPORT)
+  key=GetRandomSecretKey(random_info[0]);
+#endif
 
-  /* create a buffer for random numbers */
-  numRandomNumberPerBuffer = (image->columns*numRowsPerKernelLaunch)*numRandomNumberPerPixel;
-  randomNumberBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, numRandomNumberPerBuffer*sizeof(float)
-    , NULL, &clStatus);
+  addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"GenerateNoiseImage");
 
   {
-    /* setup the random number generators */
-    unsigned long* seeds;
-    numRandomNumberGenerators = 512;
-    randomNumberSeedsBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR|CL_MEM_READ_WRITE
-                                            , numRandomNumberGenerators * 4 * sizeof(unsigned long), NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.",".");
-      goto cleanup;
-    }
-    seeds = (unsigned long*) clEnv->library->clEnqueueMapBuffer(queue, randomNumberSeedsBuffer, CL_TRUE, CL_MAP_WRITE, 0
-                                                , numRandomNumberGenerators*4*sizeof(unsigned long), 0, NULL, NULL, &clStatus);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.",".");
-      goto cleanup;
-    }
-
-    for (i = 0; i < numRandomNumberGenerators; i++) {
-      RandomInfo* randomInfo = AcquireRandomInfo();
-      const unsigned long* s = GetRandomInfoSeed(randomInfo);
-
-      if (i == 0)
-        fNormalize = GetRandomInfoNormalize(randomInfo);
-
-      seeds[i*4] = s[0];
-      randomInfo = DestroyRandomInfo(randomInfo);
-    }
-
-    clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, randomNumberSeedsBuffer, seeds, 0, NULL, NULL);
-    if (clStatus != CL_SUCCESS)
-    {
-      (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.",".");
-      goto cleanup;
-    }
-
-    randomNumberGeneratorKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE
-                                                        ,"randomNumberGeneratorKernel");
-    
-    k = 0;
-    clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(cl_mem),(void *)&randomNumberSeedsBuffer);
-    clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(float),(void *)&fNormalize);
-    clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer);
-    initRandom = 1;
-    clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(unsigned int),(void *)&initRandom);
-    clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerBuffer);
+    cl_uint computeUnitCount;
+    cl_uint workItemCount;
+    clEnv->library->clGetDeviceInfo(clEnv->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &computeUnitCount, NULL);
+    workItemCount = computeUnitCount * 2 * 256;                        // 256 work items per group, 2 groups per CU
+    inputPixelCount = image->columns * image->rows;
+    pixelsPerWorkitem = (inputPixelCount + workItemCount - 1) / workItemCount;
+    pixelsPerWorkitem = ((pixelsPerWorkitem + 3) / 4) * 4;
 
-    random_work_size = numRandomNumberGenerators;
+    local_work_size[0] = 256;
+    global_work_size[0] = workItemCount;
+  }
+  {
+    RandomInfo* randomInfo = AcquireRandomInfo();
+       const unsigned long* s = GetRandomInfoSeed(randomInfo);
+       seed0 = s[0];
+       GetPseudoRandomValue(randomInfo);
+       seed1 = s[0];
+       randomInfo = DestroyRandomInfo(randomInfo);
   }
 
-  addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoiseImage");
   k = 0;
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer);
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer);
-  inputColumns = image->columns;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputColumns);
-  inputRows = image->rows;
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputRows);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&inputPixelCount);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&pixelsPerWorkitem);  
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel);
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type);
   attenuate=1.0f;
@@ -6400,28 +6380,11 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image,
   if (option != (char *) NULL)
     attenuate=(float)StringToDouble(option,(char **) NULL);
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate);
-  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed0);
+  clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed1);
   clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel);
 
-  global_work_size[0] = inputColumns;
-  for (r = 0; r < inputRows; r+=numRowsPerKernelLaunch) 
-  {
-    size_t generator_local_size = 64;
-    /* Generate random numbers in the buffer */
-    clEnv->library->clEnqueueNDRangeKernel(queue,randomNumberGeneratorKernel,1,NULL
-                            ,&random_work_size,&generator_local_size,0,NULL,NULL);
-    if (initRandom != 0)
-    {
-      /* make sure we only do init once */
-      initRandom = 0;
-      clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,3,sizeof(unsigned int),(void *)&initRandom);
-    }
-
-    /* set the row offset */
-    clEnv->library->clSetKernelArg(addNoiseKernel,k,sizeof(unsigned int),(void *)&r);
-    global_work_size[1] = MAGICK_MIN(numRowsPerKernelLaunch, inputRows - r);
-    clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,2,NULL,global_work_size,NULL,0,NULL,NULL);
-  }
+  clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,1,NULL,global_work_size,NULL,0,NULL,NULL);
 
   if (ALIGNED(filteredPixels,CLPixelPacket)) 
   {
@@ -6450,17 +6413,15 @@ cleanup:
 
   if (queue!=NULL)                  RelinquishOpenCLCommandQueue(clEnv, queue);
   if (addNoiseKernel!=NULL)         RelinquishOpenCLKernel(clEnv, addNoiseKernel);
-  if (randomNumberGeneratorKernel!=NULL) RelinquishOpenCLKernel(clEnv, randomNumberGeneratorKernel);
   if (imageBuffer!=NULL)                   clEnv->library->clReleaseMemObject(imageBuffer);
-  if (randomNumberBuffer!=NULL)     clEnv->library->clReleaseMemObject(randomNumberBuffer);
   if (filteredImageBuffer!=NULL)         clEnv->library->clReleaseMemObject(filteredImageBuffer);
-  if (randomNumberSeedsBuffer!=NULL) clEnv->library->clReleaseMemObject(randomNumberSeedsBuffer);
   if (outputReady == MagickFalse && filteredImage != NULL) 
     filteredImage=DestroyImage(filteredImage);
 
   return(filteredImage);
 }
 
+
 MagickExport Image *AccelerateAddNoiseImage(const Image *image,
   const ChannelType channel,const NoiseType noise_type,
   ExceptionInfo *exception) 
@@ -6475,12 +6436,7 @@ MagickExport Image *AccelerateAddNoiseImage(const Image *image,
       (checkAccelerateCondition(image, channel) == MagickFalse))
     return NULL;
 
-DisableMSCWarning(4127)
-  if (sizeof(unsigned long) == 4)
-RestoreMSCWarning
-    filteredImage = ComputeAddNoiseImageOptRandomNum(image,channel,noise_type,exception);
-  else
-    filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception);
+  filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception);
   
   return(filteredImage);
 }