]> granicus.if.org Git - imagemagick/commitdiff
(no commit message)
authorcristy <urban-warrior@git.imagemagick.org>
Sun, 24 Nov 2013 14:16:14 +0000 (14:16 +0000)
committercristy <urban-warrior@git.imagemagick.org>
Sun, 24 Nov 2013 14:16:14 +0000 (14:16 +0000)
MagickCore/accelerate-private.h
MagickCore/accelerate.c
MagickCore/accelerate.h
MagickCore/method-attribute.h
MagickCore/opencl-private.h
MagickCore/opencl.c
MagickCore/opencl.h
MagickCore/resize.h

index 5c6f73128d1bae7b59151315e430b9ff95f294a4..772898d1ae338660528b276e66ac409b4e626ce3 100644 (file)
@@ -1,20 +1,21 @@
 /*
   Copyright 1999-2014 ImageMagick Studio LLC, a non-profit organization
   dedicated to making software imaging solutions freely available.
-
+  
   You may not use this file except in compliance with the License.
   obtain a copy of the License at
-
+  
     http://www.imagemagick.org/script/license.php
-
+  
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 
-  MagickCore acceleration private methods.
+  MagickCore private methods for accelerated functions.
 */
+
 #ifndef _MAGICKCORE_ACCELERATE_PRIVATE_H
 #define _MAGICKCORE_ACCELERATE_PRIVATE_H
 
 extern "C" {
 #endif
 
+
+#if defined(MAGICKCORE_OPENCL_SUPPORT)
+
+/*
+  Helpers for building OpenCL program text as a C string literal.
+  STRINGIFY turns its arguments into a string followed by a newline.  The
+  OPENCL_* macros produce the text of the matching preprocessor directive
+  (define / if / elif / else / endif), each surrounded by newlines; the "#"
+  is a separate literal concatenated with the directive name.
+*/
+#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
+#define OPENCL_ELIF(...)       "\n #""elif " #__VA_ARGS__ " \n"
+#define OPENCL_ELSE()          "\n #""else " " \n"
+#define OPENCL_ENDIF()         "\n #""endif " " \n"
+#define OPENCL_IF(...)         "\n #""if " #__VA_ARGS__ " \n"
+#define STRINGIFY(...) #__VA_ARGS__ "\n"
+
+/*
+  Pixel with one MagickRealType member per channel (red, green, blue,
+  opacity).  The member order is selected by MAGICK_PIXEL_RGBA or
+  MAGICK_PIXEL_BGRA; if neither macro is defined the struct has no members.
+*/
+typedef struct _FloatPixelPacket
+{
+#ifdef MAGICK_PIXEL_RGBA  
+  MagickRealType
+    red,
+    green,
+    blue,
+    opacity;
+#endif
+#ifdef MAGICK_PIXEL_BGRA 
+  MagickRealType
+    blue,
+    green,
+    red,
+    opacity;
+#endif
+} FloatPixelPacket;
+
+const char* accelerateKernels =
+  STRINGIFY(
+     /*
+     Channel selection bit flags made available to the kernels.  Several
+     names share one value (for example Red/Gray/Cyan are all 0x0001).
+     NOTE(review): presumably mirrors the host-side ChannelType enum; confirm
+     the values stay in sync with it.
+     */
+     typedef enum
+     {
+       UndefinedChannel,
+       RedChannel = 0x0001,
+       GrayChannel = 0x0001,
+       CyanChannel = 0x0001,
+       GreenChannel = 0x0002,
+       MagentaChannel = 0x0002,
+       BlueChannel = 0x0004,
+       YellowChannel = 0x0004,
+       AlphaChannel = 0x0008,
+       OpacityChannel = 0x0008,
+       MatteChannel = 0x0008,     /* deprecated */
+       BlackChannel = 0x0020,
+       IndexChannel = 0x0020,
+       CompositeChannels = 0x002F,
+       AllChannels = 0x7ffffff,
+       /*
+       Special purpose channel types.
+       */
+       TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
+       RGBChannels = 0x0080,      /* set alpha from  grayscale mask in RGB */
+       GrayChannels = 0x0080,
+       SyncChannels = 0x0100,     /* channels should be modified equally */
+       DefaultChannels = ((AllChannels | SyncChannels) &~ OpacityChannel)
+     } ChannelType;
+  )
+
+  /*
+    ScaleCharToQuantum: widen an 8-bit sample to the kernel quantum type.
+    One variant is emitted per MAGICKCORE_QUANTUM_DEPTH (8, 16 or 32); the
+    scale factors map 255 onto the largest value of the respective depth.
+  */
+  OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      return((CLQuantum) value);
+    }
+  )
+
+  OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      return((CLQuantum) (257.0f*value));
+    }
+  )
+
+  OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
+
+  STRINGIFY(
+    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
+    {
+      /* cast to CLQuantum like the other depths; the kernel source does not
+         define a type named Quantum */
+      return((CLQuantum) (16843009.0*value));
+    }
+  )
+
+  OPENCL_ENDIF()
+
+
+  STRINGIFY(
+    /* Clamp a coordinate to the valid index range [0, range-1]. */
+    inline int ClampToCanvas(const int offset,const int range)
+    {
+      const int last = range-1;
+      return clamp(offset, (int)0, last);
+    }
+  )
+
+  STRINGIFY(
+    /*
+    Clamp a coordinate while allowing a halo of "edge" extra entries on one
+    side: for section == 0 the range is [0, range-1+edge], otherwise it is
+    [-edge, range-1].
+    */
+    inline int ClampToCanvasWithHalo(const int offset,const int range, const int edge, const int section)
+    {
+      const int lowest = section ? (int)(0-edge) : (int)0;
+      const int highest = section ? (range-1) : (range-1+edge);
+      return clamp(offset, lowest, highest);
+    }
+  )
+
+  STRINGIFY(
+    /* Limit value to [0, QuantumRange], add 0.5 and convert to CLQuantum. */
+    inline CLQuantum ClampToQuantum(const float value)
+    {
+      const float bounded = clamp(value, 0.0f, (float) QuantumRange);
+      return (CLQuantum) (bounded + 0.5f);
+    }
+  )
+
+  STRINGIFY(
+    /* Convert a quantum value to a map index, saturating at MaxMap. */
+    inline uint ScaleQuantumToMap(CLQuantum value)
+    {
+      return (value >= (CLQuantum) MaxMap) ? ((uint)MaxMap) : ((uint)value);
+    }
+  )
+
+  STRINGIFY(
+    /*
+    Return 1/x, except that inputs whose magnitude is below MagickEpsilon
+    yield 1/MagickEpsilon carrying the sign of x, which avoids dividing by
+    values near zero.
+    */
+    inline float PerceptibleReciprocal(const float x)
+    {
+      const float sign = (x < (float) 0.0) ? (float) -1.0 : (float) 1.0;
+      if ((sign*x) >= MagickEpsilon)
+        return((float) 1.0/x);
+      return(sign*((float) 1.0/MagickEpsilon));
+    }
+  )
+
+  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
+
+  STRINGIFY(
+
+  /*
+  Channel accessors.  They fix the component layout used by the kernels:
+  x = blue, y = green, z = red, w = opacity.  The F4 variants operate on
+  float4 values, the others on CLPixelType.
+  */
+  inline CLQuantum getBlue(CLPixelType p)                  { return p.x; }
+  inline void setBlue(CLPixelType* p, CLQuantum value)     { (*p).x = value; }
+  inline float getBlueF4(float4 p)                         { return p.x; }
+  inline void setBlueF4(float4* p, float value)             { (*p).x = value; }
+
+  inline CLQuantum getGreen(CLPixelType p)                 { return p.y; }
+  inline void setGreen(CLPixelType* p, CLQuantum value)            { (*p).y = value; }
+  inline float getGreenF4(float4 p)                        { return p.y; }
+  inline void setGreenF4(float4* p, float value)           { (*p).y = value; }
+
+  inline CLQuantum getRed(CLPixelType p)                   { return p.z; }
+  inline void setRed(CLPixelType* p, CLQuantum value)      { (*p).z = value; }
+  inline float getRedF4(float4 p)                          { return p.z; }
+  inline void setRedF4(float4* p, float value)             { (*p).z = value; }
+
+  inline CLQuantum getOpacity(CLPixelType p)               { return p.w; }
+  inline void setOpacity(CLPixelType* p, CLQuantum value)   { (*p).w = value; }
+  inline float getOpacityF4(float4 p)                      { return p.w; }
+  inline void setOpacityF4(float4* p, float value)          { (*p).w = value; }
+
+  /*
+  Weighted sum of the red, green and blue channels of p.
+  Only colorspace == 0 is implemented; any other value returns 0.0.
+  NOTE(review): presumably colorspace 0 denotes the sRGB case mentioned
+  below; confirm against the host-side caller.
+  */
+  inline float GetPixelIntensity(int colorspace, CLPixelType p)
+  {
+    // this is for default intensity and sRGB (not RGB) color space
+    float red = getRed(p);
+    float green = getGreen(p);
+    float blue = getBlue(p);
+
+    if (colorspace == 0)
+      return 0.212656*red+0.715158*green+0.072186*blue;
+    else
+    {
+      // need encode gamma (not implemented; falls through to return 0.0)
+    }
+    return 0.0;
+  }
+  )
+
+  STRINGIFY(
+    /*
+    2-D convolution of input into output with a filterWidth x filterHeight
+    filter.
+    input, output: pixel buffers indexed as y*imageWidth+x
+    filter: filter coefficients, row by row
+    matte, channel: when the OpacityChannel bit is set in channel and matte
+      is non-zero, colour channels are weighted by pixel alpha and
+      renormalised; otherwise all four components are filtered directly
+    pixelLocalCache: local scratch holding the work-group tile plus the
+      filter border, (local_size(0)+filterWidth-1) x
+      (local_size(1)+filterHeight-1) pixels
+    filterCache: local copy of the filter, filterWidth*filterHeight floats
+    Pixels outside the image are taken from the nearest edge pixel.
+    */
+    __kernel 
+    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
+    const unsigned int imageWidth, const unsigned int imageHeight,
+    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
+    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
+
+      int2 blockID;
+      blockID.x = get_group_id(0);
+      blockID.y = get_group_id(1);
+
+      // image area processed by this workgroup
+      int2 imageAreaOrg;
+      imageAreaOrg.x = blockID.x * get_local_size(0);
+      imageAreaOrg.y = blockID.y * get_local_size(1);
+
+      // offset from the filter origin to its centre tap
+      int2 midFilterDimen;
+      midFilterDimen.x = (filterWidth-1)/2;
+      midFilterDimen.y = (filterHeight-1)/2;
+
+      // top-left image coordinate of the cached tile (may be negative)
+      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
+
+      // dimension of the local cache
+      int2 cachedAreaDimen;
+      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
+      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
+
+      // cache the pixels accessed by this workgroup in local memory;
+      // work-items stride through the tile by the work-group size
+      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
+      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
+      int groupSize = get_local_size(0) * get_local_size(1);
+      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
+
+        int2 cachedAreaIndex;
+        cachedAreaIndex.x = i % cachedAreaDimen.x;
+        cachedAreaIndex.y = i / cachedAreaDimen.x;
+
+        int2 imagePixelIndex;
+        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
+
+        // only support EdgeVirtualPixelMethod through ClampToCanvas
+        // TODO: implement other virtual pixel method
+        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
+        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
+
+        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
+      }
+
+      // cache the filter
+      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
+        filterCache[i] = filter[i];
+      }
+      // both caches must be complete before any work-item reads them
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+
+      int2 imageIndex;
+      imageIndex.x = imageAreaOrg.x + get_local_id(0);
+      imageIndex.y = imageAreaOrg.y + get_local_id(1);
+
+      // if out-of-range, stops here and quit
+      // (placed after the barrier so padded work-items still help loading)
+      if (imageIndex.x >= imageWidth
+        || imageIndex.y >= imageHeight) {
+          return;
+      }
+
+      int filterIndex = 0;
+      float4 sum = (float4)0.0f;
+      float gamma = 0.0f;
+      if (((channel & OpacityChannel) == 0) || (matte == 0)) {
+        // plain convolution of all four components
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float f = filterCache[filterIndex];
+
+            sum.x += f * p.x;
+            sum.y += f * p.y;
+            sum.z += f * p.z; 
+            sum.w += f * p.w;
+
+            gamma += f;
+            filterIndex++;
+            cacheIndexX++;
+          }
+          cacheIndexY++;
+        }
+      }
+      else {
+        // alpha-weighted convolution: colour taps are scaled by pixel alpha,
+        // the opacity component is filtered unweighted
+        int cacheIndexY = get_local_id(1);
+        for (int j = 0; j < filterHeight; j++) {
+          int cacheIndexX = get_local_id(0);
+          for (int i = 0; i < filterWidth; i++) {
+
+            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
+            float alpha = QuantumScale*(QuantumRange-p.w);
+            float f = filterCache[filterIndex];
+            float g = alpha * f;
+
+            sum.x += g*p.x;
+            sum.y += g*p.y;
+            sum.z += g*p.z;
+            sum.w += f*p.w;
+
+            gamma += g;
+            filterIndex++;
+            cacheIndexX++;
+          }
+          cacheIndexY++;
+        }
+        // renormalise colour by the accumulated alpha weight
+        gamma = PerceptibleReciprocal(gamma);
+        sum.xyz = gamma*sum.xyz;
+      }
+      CLPixelType outputPixel;
+      outputPixel.x = ClampToQuantum(sum.x);
+      outputPixel.y = ClampToQuantum(sum.y);
+      outputPixel.z = ClampToQuantum(sum.z);
+      // opacity is only replaced when the OpacityChannel bit is selected
+      outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
+
+      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
+    }
+  )
+
+  STRINGIFY(
+     /*
+     Selector for the per-pixel function evaluated by ApplyFunction.
+     NOTE(review): presumably mirrors the host-side MagickFunction enum;
+     confirm the enumerator order matches.
+     */
+     typedef enum
+     {
+       UndefinedFunction,
+       PolynomialFunction,
+       SinusoidFunction,
+       ArcsinFunction,
+       ArctanFunction
+     } MagickFunction;
+  )
+
+  STRINGIFY(
+
+    /*
+    Evaluate one of the MagickFunction curves on every component of a pixel
+    (used by the FunctionImage kernel, e.g. for brightness-contrast).
+    pixel: input pixel
+    function: which curve to apply
+    number_parameters, parameters: curve parameters; missing trailing
+      parameters of the sinusoid/arcsin/arctan curves fall back to the
+      defaults coded below
+    Returns the result clamped to the quantum range.  UndefinedFunction
+    yields an all-zero pixel.
+    */
+    CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
+        const unsigned int number_parameters,
+        __constant float *parameters)
+      {
+        float4 result = (float4) 0.0f;
+        switch (function)
+        {
+        case PolynomialFunction:
+          {
+            /* Horner evaluation; parameters[0] is the highest-order term */
+            for (unsigned int i=0; i < number_parameters; i++)
+              result = result*QuantumScale*convert_float4(pixel) + parameters[i];
+            result *= QuantumRange;
+            break;
+          }
+        case SinusoidFunction:
+          {
+            float  freq,phase,ampl,bias;
+            freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
+            ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
+            bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            /* phase is given in degrees */
+            result = QuantumRange*(ampl*sin(2.0f*MagickPI*
+              (freq*QuantumScale*convert_float4(pixel) + phase/360.0f)) + bias);
+            break;
+          }
+        case ArcsinFunction:
+          {
+            float  width,range,center,bias;
+            width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            result = 2.0f/width*(QuantumScale*convert_float4(pixel) - center);
+            result = range/MagickPI*asin(result)+bias;
+            /* NOTE(review): the limit tests below run on the value after
+               asin and bias have been applied; confirm this matches the
+               CPU implementation */
+            result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
+            result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
+            result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
+            result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
+            /* each component falls back to itself (z previously took x) */
+            result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.z;
+            result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.z;
+            result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
+            result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
+
+            result *= QuantumRange;
+            break;
+          }
+        case ArctanFunction:
+          {
+            float slope,range,center,bias;
+            slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
+            center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
+            range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
+            bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
+            result = MagickPI*slope*(QuantumScale*convert_float4(pixel)-center);
+            result = QuantumRange*(range/MagickPI*atan(result) + bias);
+            break;
+          }
+        case UndefinedFunction:
+          break;
+        }
+        return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
+          ClampToQuantum(result.z), ClampToQuantum(result.w));
+      }
+    )
+
+    STRINGIFY(
+    /*
+    Apply ApplyFunction to every pixel of the image in place
+    (e.g. to improve brightness / contrast).
+    im : image pixels, one work-item per pixel; the row stride is
+         get_global_size(0)
+    channel : channel mask (currently not consulted; all four components
+              are processed)
+    function : the function used to enhance the brightness / contrast
+    number_parameters : number of entries in parameters
+    parameters : the function parameters
+    */
+    __kernel void FunctionImage(__global CLPixelType *im,
+                                        const ChannelType channel, const MagickFunction function,
+                                        const unsigned int number_parameters, __constant float *parameters)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+        im[c] = ApplyFunction(im[c], function, number_parameters, parameters); 
+      }
+    )
+
+    STRINGIFY(
+    /*
+    Remap every pixel of im in place through equalize_map.
+    im : image pixels, one work-item per pixel; the row stride is
+         get_global_size(0)
+    channel : only the SyncChannels case is implemented; without that bit
+              the pixel is left untouched
+    equalize_map : lookup table indexed by ScaleQuantumToMap(value); only
+                   the red component of each entry is used, for all four
+                   channels
+    white, black : the remap is skipped when their red components are equal
+    */
+    __kernel void Equalize(__global CLPixelType * restrict im,
+      const ChannelType channel,  
+      __global CLPixelType * restrict equalize_map,
+      const float4 white, const float4 black)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+
+        uint ePos;
+        CLPixelType oValue, eValue;
+        CLQuantum red, green, blue, opacity;
+
+        //read from global
+        oValue=im[c];
+
+        if ((channel & SyncChannels) != 0)
+        {
+          if (getRedF4(white) != getRedF4(black))
+          {
+            ePos = ScaleQuantumToMap(getRed(oValue)); 
+            eValue = equalize_map[ePos];
+            red = getRed(eValue);
+            ePos = ScaleQuantumToMap(getGreen(oValue)); 
+            eValue = equalize_map[ePos];
+            green = getRed(eValue);
+            ePos = ScaleQuantumToMap(getBlue(oValue)); 
+            eValue = equalize_map[ePos];
+            blue = getRed(eValue);
+            ePos = ScaleQuantumToMap(getOpacity(oValue)); 
+            eValue = equalize_map[ePos];
+            opacity = getRed(eValue);
+            //write back (component order x,y,z,w = blue,green,red,opacity)
+            im[c]=(CLPixelType)(blue, green, red, opacity);
+          }
+
+        }
+
+        // for equalizing, we always need all channels?
+        // otherwise something more
+
+     }
+    )
+
+    STRINGIFY(
+    /*
+    Accumulate an intensity histogram of im.
+    im : image pixels, one work-item per pixel; the row stride is
+         get_global_size(0)
+    channel : only the SyncChannels case is implemented; without that bit
+              nothing is counted
+    colorspace : forwarded to GetPixelIntensity
+    histogram : one uint4 per bin; the third uint of the selected bin (the
+                red slot in the x,y,z,w = blue,green,red,opacity layout)
+                is incremented atomically
+    */
+    __kernel void Histogram(__global CLPixelType * restrict im,
+      const ChannelType channel, const int colorspace,
+      __global uint4 * restrict histogram)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);  
+        const int columns = get_global_size(0);  
+        const int c = x + y * columns;
+        if ((channel & SyncChannels) != 0)
+        {
+          float intensity = GetPixelIntensity(colorspace,im[c]);
+          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
+          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
+        }
+        else
+        {
+          // for equalizing, we always need all channels?
+          // otherwise something more
+        }
+      }
+    )
+
+    STRINGIFY(
+      /*
+      Horizontal blur pass: convolve every image row with a 1-D filter.
+      im: input pixels  filtered_im: output pixels (one float4 per pixel)
+      filter: convolution kernel  width: number of filter taps
+      channel: channel mask (not consulted; all components are blurred)
+      imageColumns, imageRows: image dimensions in pixels
+      temp: local scratch; get_local_size(0)+width entries are written
+      Columns outside the image are taken from the nearest edge pixel.
+      */
+      __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
+                         const ChannelType channel, __constant float *filter,
+                         const unsigned int width, 
+                         const unsigned int imageColumns, const unsigned int imageRows,
+                         __local CLPixelType *temp)
+      {
+        const int x = get_global_id(0);
+        const int y = get_global_id(1);
+
+        const int columns = imageColumns;
+
+        const unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(0);
+        const unsigned int loadSize = wsize+width;
+
+        // first column handled by this work-group
+        const int groupX=get_local_size(0)*get_group_id(0);
+
+        // parallel load and clamp: the work-group cooperatively copies its
+        // row segment plus the filter border into local memory
+        for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
+        {
+          temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
+        }
+
+        // the segment must be complete before any work-item reads it
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // only do the work if this is not a patched item
+        if (get_global_id(0) < columns) 
+        {
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+
+          // main loop in chunks of UFACTOR taps, remainder handled below
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR\n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+            }
+          }
+
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // write back to global
+          filtered_im[y*columns+x] = result;
+        }
+      }
+    )
+
+    STRINGIFY(
+      /*
+      Horizontal blur pass over one section of the image: convolve every row
+      of the section with a 1-D filter.
+      im: input pixels  filtered_im: output pixels (one float4 per pixel)
+      filter: convolution kernel  width: number of filter taps
+      channel: channel mask (not consulted; all components are blurred)
+      imageColumns, imageRows: image dimensions in pixels
+      temp: local scratch; get_local_size(0)+width entries are written
+      offsetRows, section: the input is read starting at row
+        offsetRows - radius*section (section is assumed to be 0 or 1)
+      Columns outside the image are taken from the nearest edge pixel.
+      */
+      __kernel void BlurRowSection(__global CLPixelType *im, __global float4 *filtered_im,
+                         const ChannelType channel, __constant float *filter,
+                         const unsigned int width, 
+                         const unsigned int imageColumns, const unsigned int imageRows,
+                         __local CLPixelType *temp, 
+                         const unsigned int offsetRows, const unsigned int section)
+      {
+        const int x = get_global_id(0);
+        const int y = get_global_id(1);
+
+        const int columns = imageColumns;
+
+        const unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(0);
+        const unsigned int loadSize = wsize+width;
+
+        // first column handled by this work-group
+        const int groupX=get_local_size(0)*get_group_id(0);
+
+        //offset the input data, assuming section is 0, 1 
+        im += imageColumns * (offsetRows - radius * section);
+
+        // parallel load and clamp: the work-group cooperatively copies its
+        // row segment plus the filter border into local memory
+        for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
+        {
+          temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
+        }
+
+        // the segment must be complete before any work-item reads it
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // only do the work if this is not a patched item
+        if (get_global_id(0) < columns) 
+        {
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+
+          // main loop in chunks of UFACTOR taps, remainder handled below
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR\n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+            }
+          }
+
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // write back to global
+          filtered_im[y*columns+x] = result;
+        }
+
+      }
+    )
+
+    STRINGIFY(
+      /*
+      Vertical blur pass: convolve every image column with a 1-D filter.
+      blurRowData: input pixels (float4 per pixel)
+      filtered_im: output pixels
+      filter : convolve kernel  width: convolve kernel size
+      channel : channel mask (not consulted; all components are blurred)
+      imageColumns, imageRows: image dimensions in pixels
+      temp: local scratch; get_local_size(1)+width entries are written
+      Rows outside the image are taken from the nearest edge row.
+      */
+      __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+                                const ChannelType channel, __constant float *filter,
+                                const unsigned int width, 
+                                const unsigned int imageColumns, const unsigned int imageRows,
+                                __local float4 *temp)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);
+
+        //const int columns = get_global_size(0);
+        //const int rows = get_global_size(1);  
+        const int columns = imageColumns;  
+        const int rows = imageRows;  
+
+        unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(1);  
+        const unsigned int loadSize = wsize+width;
+
+        //group coordinate
+        const int groupX=get_local_size(0)*get_group_id(0);
+        const int groupY=get_local_size(1)*get_group_id(1);
+        //notice that get_local_size(0) is 1, so
+        //groupX=get_group_id(0);
+        
+        //parallel load and clamp: the work-group copies its column segment
+        //plus the filter border into local memory
+        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+        {
+          temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
+        }
+        
+        // barrier: the segment must be complete before any work-item reads it
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // only do the work if this is not a patched item
+        if (get_global_id(1) < rows)
+        {
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+          
+          // main loop in chunks of UFACTOR taps, remainder handled below
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*temp[i+get_local_id(1)];
+            }
+          }
+
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*temp[i+get_local_id(1)];
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // write back to global
+          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
+        }
+
+      }
+    )
+
+
+    STRINGIFY(
+      /*
+      Vertical blur pass over one section of the image: convolve every
+      column of the section with a 1-D filter.
+      blurRowData: input pixels (float4 per pixel)
+      filtered_im: output pixels
+      filter : convolve kernel  width: convolve kernel size
+      channel : channel mask (not consulted; all components are blurred)
+      imageColumns, imageRows: image dimensions in pixels
+      temp: local scratch; get_local_size(1)+width entries are written
+      offsetRows, section: the input pointer is advanced by radius*section
+        rows and the output pointer by offsetRows rows; row indices are
+        clamped with ClampToCanvasWithHalo
+      */
+      __kernel void BlurColumnSection(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+                                const ChannelType channel, __constant float *filter,
+                                const unsigned int width, 
+                                const unsigned int imageColumns, const unsigned int imageRows,
+                                __local float4 *temp, 
+                                const unsigned int offsetRows, const unsigned int section)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);
+
+        //const int columns = get_global_size(0);
+        //const int rows = get_global_size(1);  
+        const int columns = imageColumns;  
+        const int rows = imageRows;  
+
+        unsigned int radius = (width-1)/2;
+        const int wsize = get_local_size(1);  
+        const unsigned int loadSize = wsize+width;
+
+        //group coordinate
+        const int groupX=get_local_size(0)*get_group_id(0);
+        const int groupY=get_local_size(1)*get_group_id(1);
+        //notice that get_local_size(0) is 1, so
+        //groupX=get_group_id(0);
+       
+        // offset the input data
+        blurRowData += imageColumns * radius * section;
+
+        //parallel load and clamp: the work-group copies its column segment
+        //plus the filter border into local memory
+        for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
+        {
+          int pos = ClampToCanvasWithHalo(i+groupY-radius, rows, radius, section) * columns + groupX;
+          temp[i] = *(blurRowData+pos);
+        }
+        
+        // barrier: the segment must be complete before any work-item reads it
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // only do the work if this is not a patched item
+        if (get_global_id(1) < rows)
+        {
+          // compute
+          float4 result = (float4) 0;
+
+          int i = 0;
+          
+          \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+          
+          // main loop in chunks of UFACTOR taps, remainder handled below
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+            for (int j=0; j < UFACTOR; j++, i++)
+            {
+              result+=filter[i]*temp[i+get_local_id(1)];
+            }
+          }
+          for ( ; i < width; i++)
+          {
+            result+=filter[i]*temp[i+get_local_id(1)];
+          }
+
+          result.x = ClampToQuantum(result.x);
+          result.y = ClampToQuantum(result.y);
+          result.z = ClampToQuantum(result.z);
+          result.w = ClampToQuantum(result.w);
+
+          // offset the output data
+          filtered_im += imageColumns * offsetRows;
+
+          // write back to global
+          filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
+        }
+
+      }
+    )
+
+
+    STRINGIFY(
+    /*
+    Vertical blur pass combined with the unsharp-mask step.
+    inputImage: original pixels
+    blurRowData: row-blurred pixels (float4 per pixel)
+    filtered_im: output pixels
+    imageColumns, imageRows: image dimensions in pixels
+    cachedData: local scratch for the column segment of this work-group,
+      get_local_size(1)+2*radius entries
+    cachedFilter: local copy of the filter, width entries
+    channel: channel mask (not consulted; all components are processed)
+    filter, width: 1-D convolution kernel and its number of taps
+    gain, threshold: a component is sharpened by gain times the difference
+      between input and blur unless twice the absolute difference is below
+      QuantumRange*threshold, in which case the input value is kept
+    */
+    __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage, 
+          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+          const unsigned int imageColumns, const unsigned int imageRows, 
+          __local float4* cachedData, __local float* cachedFilter,
+          const ChannelType channel, const __global float *filter, const unsigned int width, 
+          const float gain, const float threshold)
+    {
+      const unsigned int radius = (width-1)/2;
+
+      // cache the pixel shared by the workgroup
+      const int groupX = get_group_id(0);
+      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
+      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
+
+      // interior work-groups use a strided bulk copy; work-groups touching
+      // the top or bottom edge load element-wise with clamping
+      if (groupStartY >= 0
+          && groupStopY < imageRows) {
+        event_t e = async_work_group_strided_copy(cachedData
+                                                ,blurRowData+groupStartY*imageColumns+groupX
+                                                ,groupStopY-groupStartY,imageColumns,0);
+        wait_group_events(1,&e);
+      }
+      else {
+        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
+          cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      // cache the filter as well
+      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
+      wait_group_events(1,&e);
+
+      // only do the work if this is not a patched item
+      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
+      const int cy = get_global_id(1);
+
+      if (cy < imageRows) {
+        float4 blurredPixel = (float4) 0.0f;
+
+        int i = 0;
+
+        \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+
+          // main loop in chunks of UFACTOR taps, remainder handled below
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+              for (int j=0; j < UFACTOR; j++, i++)
+              {
+                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+              }
+          }
+
+        for ( ; i < width; i++)
+        {
+          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+        }
+
+        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
+                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
+
+        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
+        float4 outputPixel = inputImagePixel - blurredPixel;
+
+        float quantumThreshold = QuantumRange*threshold;
+
+        // components whose difference is below the threshold keep the input
+        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
+        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
+
+        //write back
+        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
+                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
+
+      }
+    }
+
+    /*
+      Vertical blur plus unsharp-mask combine for one section of an image.
+
+      Each work-group handles get_local_size(1) consecutive rows of the single
+      column get_group_id(0).  The rows it needs from blurRowData (its own rows
+      plus `radius` rows above and below) and the `width` filter taps are
+      staged in local memory first; afterwards every work-item with
+      cy < imageRows convolves its column window, compares the result with the
+      input pixel and writes the sharpened pixel.
+
+        inputImage    pixels to sharpen; read at row cy + offsetRows
+        blurRowData   float4 data to convolve, advanced by radius*section rows
+                      (presumably the output of the row pass -- confirm)
+        filtered_im   destination; written at row cy + offsetRows
+        cachedData    local scratch; groupStopY-groupStartY entries are filled
+        cachedFilter  local scratch holding `width` floats
+        channel       not referenced in this kernel body
+        gain          multiplier applied to (input - blurred)
+        threshold     cut-off, scaled by QuantumRange before use
+        section       also forwarded to ClampToCanvasWithHalo, which is defined
+                      elsewhere -- NOTE(review): confirm its halo semantics
+    */
+    __kernel void UnsharpMaskBlurColumnSection(const __global CLPixelType* inputImage, 
+          const __global float4 *blurRowData, __global CLPixelType *filtered_im,
+          const unsigned int imageColumns, const unsigned int imageRows, 
+          __local float4* cachedData, __local float* cachedFilter,
+          const ChannelType channel, const __global float *filter, const unsigned int width, 
+          const float gain, const float threshold, 
+          const unsigned int offsetRows, const unsigned int section)
+    {
+      const unsigned int radius = (width-1)/2;
+
+      // cache the pixel shared by the workgroup
+      // [groupStartY, groupStopY) is the work-group's row range widened by
+      // `radius` rows on each side; groupStartY may therefore be negative
+      const int groupX = get_group_id(0);
+      const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
+      const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
+
+      // offset the input data
+      blurRowData += imageColumns * radius * section;
+
+      // The condition depends only on group ids, so all work-items of a
+      // work-group take the same branch (required for the barrier below).
+      if (groupStartY >= 0
+          && groupStopY < imageRows) {
+        // window lies fully inside the canvas: copy one element per row of
+        // column groupX, stepping imageColumns elements between rows
+        event_t e = async_work_group_strided_copy(cachedData
+                                                ,blurRowData+groupStartY*imageColumns+groupX
+                                                ,groupStopY-groupStartY,imageColumns,0);
+        wait_group_events(1,&e);
+      }
+      else {
+        // window crosses the canvas edge: work-items share the copy and the
+        // row index is remapped through ClampToCanvasWithHalo
+        for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
+          int pos = ClampToCanvasWithHalo(groupStartY+i,imageRows, radius, section)*imageColumns+ groupX;
+          cachedData[i] = *(blurRowData + pos);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      // cache the filter as well
+      event_t e = async_work_group_copy(cachedFilter,filter,width,0);
+      wait_group_events(1,&e);
+
+      // only do the work if this is not a patched item
+      //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
+      const int cy = get_global_id(1);
+
+      if (cy < imageRows) {
+        float4 blurredPixel = (float4) 0.0f;
+
+        int i = 0;
+
+        // The "\n" tokens below become real newlines once STRINGIFY turns
+        // this text into a string, so the directives land on their own lines.
+        \n #ifndef UFACTOR   \n 
+          \n #define UFACTOR 8 \n 
+          \n #endif                  \n 
+
+          // main part of the convolution in chunks of UFACTOR taps; tap i
+          // pairs with cached row i+get_local_id(1)
+          for ( ; i+UFACTOR < width; ) 
+          {
+            \n #pragma unroll UFACTOR \n
+              for (int j=0; j < UFACTOR; j++, i++)
+              {
+                blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+              }
+          }
+
+        // remaining taps that did not fill a whole chunk
+        for ( ; i < width; i++)
+        {
+          blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
+        }
+
+        // clamp each channel with ClampToQuantum, then round down
+        blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
+                                      ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
+
+        // offset the output data
+        inputImage += imageColumns * offsetRows; 
+        filtered_im += imageColumns * offsetRows;
+
+        float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
+        float4 outputPixel = inputImagePixel - blurredPixel;
+
+        float quantumThreshold = QuantumRange*threshold;
+
+        // per channel: keep the input where |2*difference| is below the
+        // threshold, otherwise use input + difference*gain
+        int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
+        outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
+
+        //write back
+        filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
+                                                            ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
+
+      }
+     
+    }
+    )
+
+
+
+  STRINGIFY(
+
+  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
+  , const unsigned int imageWidth, const unsigned int imageHeight
+  , const int2 offset, const int polarity, const int matte) {
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    CLPixelType v = inputImage[y*imageWidth+x];
+
+    int2 neighbor;
+    neighbor.y = y + offset.y;
+    neighbor.x = x + offset.x;
+
+    int2 clampedNeighbor;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType r = (clampedNeighbor.x == neighbor.x
+                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+    int sv[4];
+    sv[0] = (int)v.x;
+    sv[1] = (int)v.y;
+    sv[2] = (int)v.z;
+    sv[3] = (int)v.w;
+
+    int sr[4];
+    sr[0] = (int)r.x;
+    sr[1] = (int)r.y;
+    sr[2] = (int)r.z;
+    sr[3] = (int)r.w;
+
+    if (polarity > 0) {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
+      }
+    }
+    else {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
+      }
+
+    }
+
+    v.x = (CLQuantum)sv[0];
+    v.y = (CLQuantum)sv[1];
+    v.z = (CLQuantum)sv[2];
+
+    if (matte!=0)
+      v.w = (CLQuantum)sv[3];
+
+    outputImage[y*imageWidth+x] = v;
+
+    }
+
+
+  )
+
+
+
+  STRINGIFY(
+
+  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
+  , const unsigned int imageWidth, const unsigned int imageHeight
+  , const int2 offset, const int polarity, const int matte) {
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    CLPixelType v = inputImage[y*imageWidth+x];
+
+    int2 neighbor, clampedNeighbor;
+
+    neighbor.y = y + offset.y;
+    neighbor.x = x + offset.x;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType r = (clampedNeighbor.x == neighbor.x
+      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+
+    neighbor.y = y - offset.y;
+    neighbor.x = x - offset.x;
+    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
+    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
+
+    CLPixelType s = (clampedNeighbor.x == neighbor.x
+      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
+    :(CLPixelType)0;
+
+
+    int sv[4];
+    sv[0] = (int)v.x;
+    sv[1] = (int)v.y;
+    sv[2] = (int)v.z;
+    sv[3] = (int)v.w;
+
+    int sr[4];
+    sr[0] = (int)r.x;
+    sr[1] = (int)r.y;
+    sr[2] = (int)r.z;
+    sr[3] = (int)r.w;
+
+    int ss[4];
+    ss[0] = (int)s.x;
+    ss[1] = (int)s.y;
+    ss[2] = (int)s.z;
+    ss[3] = (int)s.w;
+
+    if (polarity > 0) {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
+        //
+        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
+      }
+    }
+    else {
+      \n #pragma unroll 4\n
+      for (unsigned int i = 0; i < 4; i++) {
+        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
+        //
+        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
+        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
+      }
+    }
+
+    v.x = (CLQuantum)sv[0];
+    v.y = (CLQuantum)sv[1];
+    v.z = (CLQuantum)sv[2];
+
+    if (matte!=0)
+      v.w = (CLQuantum)sv[3];
+
+    outputImage[y*imageWidth+x] = v;
+
+    }
+
+
+  )
+
+  STRINGIFY(
+    /*
+      RadialBlur: one work-item per output pixel; the image size is taken
+      from the global work size.
+
+      The pixel position relative to blurCenter is rotated by each sampled
+      entry of the cos_theta/sin_theta tables, the rotated position is
+      rounded (+0.5f), clamped to the canvas and the pixel found there is
+      accumulated on top of `bias`.  Only every `step`-th table entry is
+      used, where step = blur_radius / radius, so pixels close to the centre
+      take fewer samples.
+
+      When the opacity channel is selected and `matte` is non-zero, colour
+      channels are weighted by alpha = QuantumScale*(QuantumRange - p.w) and
+      normalised by the summed alpha; otherwise all four channels are simply
+      averaged.
+
+      cossin_theta_size is the number of entries in both tables.
+    */
+    __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
+                              const float4 bias,
+                              const unsigned int channel, const unsigned int matte,
+                              const float2 blurCenter,
+                              __constant float *cos_theta, __constant float *sin_theta, 
+                              const unsigned int cossin_theta_size)
+      {
+        const int x = get_global_id(0);  
+        const int y = get_global_id(1);
+        const int columns = get_global_size(0);
+        const int rows = get_global_size(1);  
+        unsigned int step = 1;
+        float center_x = (float) x - blurCenter.x;
+        float center_y = (float) y - blurCenter.y;
+        float radius = hypot(center_x, center_y);
+        
+        float blur_radius = hypot(blurCenter.x, blurCenter.y);
+
+        if (radius > MagickEpsilon)
+        {
+          step = (unsigned int) (blur_radius / radius);
+          if (step >= cossin_theta_size)
+            step = cossin_theta_size-1;
+          // Enforce the lower bound last: with a one-entry table the clamp
+          // above yields 0, and a zero step would make the sampling loops
+          // below spin forever.
+          if (step == 0)
+            step = 1;
+        }
+
+        // accumulation starts from the caller supplied bias
+        float4 result;
+        result.x = (float)bias.x;
+        result.y = (float)bias.y;
+        result.z = (float)bias.z;
+        result.w = (float)bias.w;
+        float normalize = 0.0f;
+
+        if (((channel & OpacityChannel) == 0) || (matte == 0)) {
+          // plain average of the sampled pixels
+          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+          {
+            result += convert_float4(im[
+              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
+                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
+              normalize += 1.0f;
+          }
+          normalize = PerceptibleReciprocal(normalize);
+          result = result * normalize;
+        }
+        else {
+          // alpha-weighted average: colour is normalised by the summed
+          // alpha (gamma), the fourth channel by the sample count
+          float gamma = 0.0f;
+          for (unsigned int i=0; i<cossin_theta_size; i+=step)
+          {
+            float4 p = convert_float4(im[
+              ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
+                ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
+            
+            float alpha = (float)(QuantumScale*(QuantumRange-p.w));
+            result.x += alpha * p.x;
+            result.y += alpha * p.y;
+            result.z += alpha * p.z;
+            result.w += p.w;
+            gamma+=alpha;
+            normalize += 1.0f;
+          }
+          gamma = PerceptibleReciprocal(gamma);
+          normalize = PerceptibleReciprocal(normalize);
+          result.x = gamma*result.x;
+          result.y = gamma*result.y;
+          result.z = gamma*result.z;
+          result.w = normalize*result.w;
+        }
+        filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
+          ClampToQuantum(result.z), ClampToQuantum(result.w)); 
+      }
+  )
+  STRINGIFY(
+  /*
+    Colorspace identifiers made available to the kernel program.  Within this
+    file the Modulate kernel switches on one of these values (passed as an
+    int).  NOTE(review): the numeric values presumably have to stay in step
+    with the host-side enumeration of the same name -- confirm before
+    reordering or inserting entries.
+  */
+  typedef enum
+  {
+    UndefinedColorspace,
+    RGBColorspace,            /* Linear RGB colorspace */
+    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
+    TransparentColorspace,
+    OHTAColorspace,
+    LabColorspace,
+    XYZColorspace,
+    YCbCrColorspace,
+    YCCColorspace,
+    YIQColorspace,
+    YPbPrColorspace,
+    YUVColorspace,
+    CMYKColorspace,           /* negated linear RGB with black separated */
+    sRGBColorspace,           /* Default: non-linear sRGB colorspace */
+    HSBColorspace,
+    HSLColorspace,
+    HWBColorspace,
+    Rec601LumaColorspace,
+    Rec601YCbCrColorspace,
+    Rec709LumaColorspace,
+    Rec709YCbCrColorspace,
+    LogColorspace,
+    CMYColorspace,            /* negated linear RGB colorspace */
+    LuvColorspace,
+    HCLColorspace,
+    LCHColorspace,            /* alias for LCHuv */
+    LMSColorspace,
+    LCHabColorspace,          /* Cylindrical (Polar) Lab */
+    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
+    scRGBColorspace,
+    HSIColorspace,
+    HSVColorspace,            /* alias for HSB */
+    HCLpColorspace,
+    YDbDrColorspace
+  } ColorspaceType;
+  )
+
+
+  STRINGIFY(
+
+  /*
+    Convert the red, green and blue of `pixel` to hue, saturation and
+    brightness, returned in .x, .y and .z respectively.
+
+    A pixel whose largest channel is 0 yields (0,0,0); a pixel whose channels
+    are all equal yields hue 0.  Otherwise the hue is a fraction of a full
+    turn in [0,1): the sextant arithmetic can produce a value in (-1/6,0) for
+    red-dominant pixels with blue > green, and that value is wrapped by
+    adding 1.
+  */
+  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
+    float3 HueSaturationBrightness;
+    HueSaturationBrightness.x = 0.0f; // Hue
+    HueSaturationBrightness.y = 0.0f; // Saturation
+    HueSaturationBrightness.z = 0.0f; // Brightness
+
+    float r=(float) getRed(pixel);
+    float g=(float) getGreen(pixel);
+    float b=(float) getBlue(pixel);
+
+    float tmin=min(min(r,g),b);
+    float tmax=max(max(r,g),b);
+
+    if (tmax!=0.0f) {
+      float delta=tmax-tmin;
+      HueSaturationBrightness.y=delta/tmax;
+      HueSaturationBrightness.z=QuantumScale*tmax;
+
+      if (delta != 0.0f) {
+        // base sextant (0, 2 or 4) chosen by the dominant channel ...
+        HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
+        // ... plus the signed position inside it
+        HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
+        HueSaturationBrightness.x/=6.0f;
+        // wrap negative hues into [0,1); non-negative hues are already there
+        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?1.0f:0.0f;
+      }
+    }
+    return HueSaturationBrightness;
+  }
+
+  /*
+    Convert (hue, saturation, brightness) held in .x, .y, .z back to a pixel.
+    Only the red, green and blue of the returned pixel are set here; the
+    fourth component is left for the caller to fill in.
+  */
+  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
+
+    const float hue = HueSaturationBrightness.x;
+    const float saturation = HueSaturationBrightness.y;
+    const float brightness = HueSaturationBrightness.z;
+
+    CLPixelType rgb;
+
+    if (saturation == 0.0f) {
+      // no colour: all three channels carry the brightness
+      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
+      setGreen(&rgb,getRed(rgb));
+      setBlue(&rgb,getRed(rgb));
+      return rgb;
+    }
+
+    // position on the colour wheel: integer sextant plus fraction inside it
+    float h=6.0f*(hue-floor(hue));
+    float f=h-floor(h);
+    float p=brightness*(1.0f-saturation);
+    float q=brightness*(1.0f-saturation*f);
+    float t=brightness*(1.0f-(saturation*(1.0f-f)));
+
+    float level_v = ClampToQuantum(QuantumRange*brightness);
+    float level_t = ClampToQuantum(QuantumRange*t);
+    float level_p = ClampToQuantum(QuantumRange*p);
+    float level_q = ClampToQuantum(QuantumRange*q);
+
+    float outRed, outGreen, outBlue;
+    switch ((int)h)
+    {
+      case 1:  outRed = level_q; outGreen = level_v; outBlue = level_p; break;
+      case 2:  outRed = level_p; outGreen = level_v; outBlue = level_t; break;
+      case 3:  outRed = level_p; outGreen = level_q; outBlue = level_v; break;
+      case 4:  outRed = level_t; outGreen = level_p; outBlue = level_v; break;
+      case 5:  outRed = level_v; outGreen = level_p; outBlue = level_q; break;
+      // sextant 0 and any value outside 1..5
+      default: outRed = level_v; outGreen = level_t; outBlue = level_p; break;
+    }
+
+    setRed(&rgb, outRed);
+    setGreen(&rgb, outGreen);
+    setBlue(&rgb, outBlue);
+    return rgb;
+  }
+
+  /*
+    Contrast: in-place contrast change, one work-item per pixel (the row
+    length is taken from the global work size).  The brightness of each pixel
+    is pushed along a sine-shaped curve, in one direction when `sharpen` is
+    non-zero and in the opposite direction otherwise; hue and saturation are
+    passed through and the fourth channel is copied from the source pixel.
+  */
+  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
+  {
+    const int column = get_global_id(0);
+    const int row = get_global_id(1);
+    const int rowLength = get_global_size(0);
+    const int offset = column + row * rowLength;
+    const int direction = (sharpen != 0) ? 1 : -1;
+
+    const CLPixelType source = im[offset];
+    float3 hsb = ConvertRGBToHSB(source);
+
+    // move the brightness half-way towards (or away from) the sine curve
+    float value = hsb.z;
+    value+=0.5f*direction*(0.5f*(sinpi(value-0.5f)+1.0f)-value);
+    hsb.z = clamp(value,0.0f,1.0f);
+
+    CLPixelType result = ConvertHSBToRGB(hsb);
+    result.w = source.w;
+    im[offset] = result;
+  }
+
+
+  )
+
+  STRINGIFY(
+
+  /*
+    Convert an RGB triple to hue, saturation and lightness.
+
+    The channels are scaled by QuantumScale first.  Results are written
+    through the three output pointers; hue is expressed as a fraction of a
+    full turn.  A triple with no spread between its largest and smallest
+    channel gets hue 0 and saturation 0.
+  */
+  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
+    float *hue, float *saturation, float *lightness)
+  {
+    const float top=max(QuantumScale*red,max(QuantumScale*green, QuantumScale*blue));
+    const float bottom=min(QuantumScale*red,min(QuantumScale*green, QuantumScale*blue));
+    const float spread=top-bottom;
+
+    *lightness=(top+bottom)/2.0;
+    if (spread <= 0.0)
+    {
+      *hue=0.0;
+      *saturation=0.0;
+      return;
+    }
+
+    // hue in sextants, selected by the dominant channel
+    if (top == (QuantumScale*red))
+    {
+      *hue=(QuantumScale*green-QuantumScale*blue)/spread;
+      if ((QuantumScale*green) < (QuantumScale*blue))
+        *hue+=6.0;
+    }
+    else if (top == (QuantumScale*green))
+    {
+      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/spread;
+    }
+    else
+    {
+      *hue=4.0+(QuantumScale*red-QuantumScale*green)/spread;
+    }
+
+    // sextants -> fraction of a turn
+    *hue*=60.0/360.0;
+
+    *saturation=(*lightness <= 0.5) ? spread/(2.0*(*lightness))
+                                    : spread/(2.0-2.0*(*lightness));
+  }
+
+  /*
+    Convert hue (fraction of a turn), saturation and lightness to RGB.
+    The three results are scaled by QuantumRange, passed through
+    ClampToQuantum and written through the output pointers.
+  */
+  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
+      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
+  {
+    // chroma and the level shared by all channels
+    const float chroma=(lightness <= 0.5) ? 2.0*lightness*saturation
+                                          : (2.0-2.0*lightness)*saturation;
+    const float base=lightness-0.5*chroma;
+
+    // hue in degrees, wrapped to one turn, then expressed in sextants
+    float sextant=hue*360.0;
+    sextant-=360.0*floor(sextant/360.0);
+    sextant/=60.0;
+
+    // level of the middle channel inside the sextant
+    const float mid=chroma*(1.0-fabs(sextant-2.0*floor(sextant/2.0)-1.0));
+
+    // stays at zero for any sextant outside 0..5
+    float r=0.0;
+    float g=0.0;
+    float b=0.0;
+
+    switch ((int) floor(sextant))
+    {
+      case 0: r=base+chroma; g=base+mid;    b=base;        break;
+      case 1: r=base+mid;    g=base+chroma; b=base;        break;
+      case 2: r=base;        g=base+chroma; b=base+mid;    break;
+      case 3: r=base;        g=base+mid;    b=base+chroma; break;
+      case 4: r=base+mid;    g=base;        b=base+chroma; break;
+      case 5: r=base+chroma; g=base;        b=base+mid;    break;
+      default: break;
+    }
+
+    *red=ClampToQuantum(QuantumRange*r);
+    *green=ClampToQuantum(QuantumRange*g);
+    *blue=ClampToQuantum(QuantumRange*b);
+  }
+
+  /*
+    Adjust one RGB triple in HSL space, in place.
+
+    The percentages are relative to 100: percent_saturation and
+    percent_lightness scale their component by percent/100, while
+    percent_hue shifts the hue by 0.5*(percent/100 - 1) of a turn, after
+    which the hue is wrapped back into [0,1).
+  */
+  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
+    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
+  {
+    float h, s, l;
+
+    ConvertRGBToHSL(*red,*green,*blue,&h,&s,&l);
+
+    // rotate, then bring the hue back into one turn
+    h+=0.5*(0.01*percent_hue-1.0);
+    while (h < 0.0)
+    {
+      h+=1.0;
+    }
+    while (h >= 1.0)
+    {
+      h-=1.0;
+    }
+
+    s*=0.01*percent_saturation;
+    l*=0.01*percent_lightness;
+
+    ConvertHSLToRGB(h,s,l,red,green,blue);
+  }
+
+  /*
+    Modulate: in-place brightness / saturation / hue adjustment, one
+    work-item per pixel (the row length is taken from the global work size).
+    Every value of `colorspace` currently ends up in the HSL code path.  The
+    fourth channel of each pixel is carried over unchanged.
+  */
+  __kernel void Modulate(__global CLPixelType *im, 
+    const float percent_brightness, 
+    const float percent_hue, 
+    const float percent_saturation, 
+    const int colorspace)
+  {
+    const int column = get_global_id(0);
+    const int row = get_global_id(1);
+    const int rowLength = get_global_size(0);
+    const int offset = column + row * rowLength;
+
+    const CLPixelType source = im[offset];
+
+    CLQuantum red = getRed(source);
+    CLQuantum green = getGreen(source);
+    CLQuantum blue = getBlue(source);
+
+    switch (colorspace)
+    {
+      case HSLColorspace:
+      default:
+        ModulateHSL(percent_hue, percent_saturation, percent_brightness,
+            &red, &green, &blue);
+        break;
+    }
+
+    CLPixelType result;
+    setRed(&result, red);
+    setGreen(&result, green);
+    setBlue(&result, blue);
+    result.w = source.w;
+
+    im[offset] = result;
+  }
+  )
+
+  STRINGIFY(
+  /*
+    Box weighting function (based on Box from resize.c): the weight is 1
+    regardless of the distance x.
+  */
+  float BoxResizeFilter(const float x)
+  {
+    return(1.0f);
+  }
+  )
+    
+  STRINGIFY(
+  // Based on CubicBC from resize.c
+  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
+  {
+    /*
+    Piecewise cubic weighting function driven by seven precomputed
+    coefficients.  Well-known B,C choices:
+
+    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
+    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
+    Spline              B = 1   C = 0    B-Spline Gaussian approximation
+    Hermite             B = 0   C = 0    B-Spline interpolator
+
+    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
+    Graphics Computer Graphics, Volume 22, Number 4, August 1988
+    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
+    Mitchell.pdf.
+
+    Coefficients are determined from B,C values:
+    P0 = (  6 - 2*B       )/6 = coeff[0]
+    P1 =         0
+    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
+    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
+    Q0 = (      8*B +24*C )/6 = coeff[3]
+    Q1 = (    -12*B -48*C )/6 = coeff[4]
+    Q2 = (      6*B +30*C )/6 = coeff[5]
+    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
+
+    The filter itself is
+
+    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
+    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
+
+    and 0 from x = 2 onwards, which keeps the function continuous in value
+    and derivative (slope).
+    */
+    const __global float *coeff = resizeFilterCoefficients;
+
+    if (x < 1.0)
+    {
+      // inner segment, Horner form with P1 == 0
+      return(coeff[0]+x*(x*
+      (coeff[1]+x*coeff[2])));
+    }
+    if (x < 2.0)
+    {
+      // outer segment, Horner form
+      return(coeff[3]+x*(coeff[4]+x*
+      (coeff[5]+x*coeff[6])));
+    }
+    return(0.0);
+  }
+  )
+
+  STRINGIFY(
+  /*
+    Sinc weighting function: sinpi(x)/(MagickPI*x), with the value 1 at
+    x == 0 where the quotient is undefined.
+  */
+  float Sinc(const float x)
+  {
+    if (x == 0.0f)
+      return(1.0f);
+
+    const float scaled=(float) (MagickPI*x);
+    return sinpi(x)/scaled;
+  }
+  )
+
+  STRINGIFY(
+  float Triangle(const float x)
+  {
+    /*
+    Tent function: falls linearly as 1-x while x is below 1 and is 0 from
+    there on (1st order B-Spline / bilinear interpolation / Bartlett window).
+    */
+    if (x < 1.0f)
+      return(1.0f-x);
+    return(0.0f);
+  }
+  )
+
+
+  STRINGIFY(
+  float Hanning(const float x)
+  {
+    /*
+    Raised-cosine window:
+      0.5 + 0.5*cos(pi*x)
+    */
+    const float c=cos((MagickPI*x));
+    return(0.5f+0.5f*c);
+  }
+  )
+
+  STRINGIFY(
+  float Hamming(const float x)
+  {
+    /*
+    Raised-cosine window with an offset:
+      0.54 + 0.46*cos(pi*x)
+    */
+    const float c=cos((MagickPI*x));
+    return(0.54f+0.46f*c);
+  }
+  )
+
+  STRINGIFY(
+  float Blackman(const float x)
+  {
+    /*
+    Blackman window, a 2nd order cosine window:
+      0.42 + 0.5*cos(pi*x) + 0.08*cos(2*pi*x)
+
+    Evaluated in the form refactored by Chantal Racette and Nicolas Robidoux,
+    which needs a single trig call: with c = cos(pi*x) the expression above
+    equals 0.34 + c*(0.5 + 0.16*c).
+    */
+    const float c=cos((MagickPI*x));
+    return(0.34f+c*(0.5f+c*0.16f));
+  }
+  )
+
+
+  STRINGIFY(
+  /*
+    Selector for the filter / window weighting functions used by the resize
+    kernels.  applyResizeFilter dispatches on these values; entries it has no
+    case for make it return 0.0f.
+    NOTE(review): the numbering starts at 0 and presumably has to match the
+    host-side enumeration -- confirm before reordering.
+  */
+  typedef enum {
+    BoxWeightingFunction = 0,
+    TriangleWeightingFunction,
+    CubicBCWeightingFunction,
+    HanningWeightingFunction,
+    HammingWeightingFunction,
+    BlackmanWeightingFunction,
+    GaussianWeightingFunction,
+    QuadraticWeightingFunction,
+    JincWeightingFunction,
+    SincWeightingFunction,
+    SincFastWeightingFunction,
+    KaiserWeightingFunction,
+    WelshWeightingFunction,
+    BohmanWeightingFunction,
+    LagrangeWeightingFunction,
+    CosineWeightingFunction,
+  } ResizeWeightingFunctionType;
+  )
+
+  STRINGIFY(
+  /*
+    Evaluate the weighting function selected by filterType at x.
+    filterCoefficients is only consulted by the cubic filter.  Types without
+    an implementation here evaluate to 0.
+  */
+  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
+  {
+    switch (filterType)
+    {
+    case BoxWeightingFunction:
+      return BoxResizeFilter(x);
+    case TriangleWeightingFunction:
+      return Triangle(x);
+    case CubicBCWeightingFunction:
+      return CubicBC(x,filterCoefficients);
+    case HanningWeightingFunction:
+      return Hanning(x);
+    case HammingWeightingFunction:
+      return Hamming(x);
+    case BlackmanWeightingFunction:
+      return Blackman(x);
+    /* SincFast is served by Sinc as well: better precision on GPU and no
+       thread divergence.  Sinc is pretty fast on GPU anyway... */
+    case SincWeightingFunction:
+    case SincFastWeightingFunction:
+      return Sinc(x);
+    default:
+      break;
+    }
+    return 0.0f;
+  }
+  )
+
+
+  STRINGIFY(
+  /*
+    Weight of a sample at distance x: the filter function evaluated at
+    |x/resizeFilterBlur|, multiplied by the window function evaluated at the
+    same distance scaled by resizeFilterScale.  The window factor is 1 when
+    the window support is below MagickEpsilon or the window is a box.
+  */
+  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
+           , const ResizeWeightingFunctionType resizeWindowType
+           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
+  {
+    const float distance = fabs(x/resizeFilterBlur);
+
+    float window = 1.0f;
+    if (!(resizeWindowSupport < MagickEpsilon)
+        && resizeWindowType != BoxWeightingFunction)
+    {
+      window = applyResizeFilter(distance*resizeFilterScale, resizeWindowType, resizeFilterCubicCoefficients);
+    }
+
+    return window * applyResizeFilter(distance, resizeFilterType, resizeFilterCubicCoefficients);
+  }
+
+  )
+
+  ;
+  const char* accelerateKernels2 =
+
+  STRINGIFY(
+
+  // Number of work-items that share one output pixel: the work-items are
+  // split evenly (integer division) over the pixels of the work-group.
+  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
+    const unsigned int itemsPerPixel = numWorkItems/pixelPerWorkgroup;
+    return itemsPerPixel;
+  }
+
+  // Maps a work-item to the pixel (index within the work-group) it helps to
+  // compute.  Work-items left over after the even split get -1 and do not
+  // take part in the computation.
+  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
+    const unsigned int itemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
+    int candidate = itemID/itemsPerPixel;
+    if (!(candidate<pixelPerWorkgroup))
+      candidate = -1;
+    return candidate;
+  }
+  )
+
+  STRINGIFY(
+ __kernel __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
+ void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
+  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
+  , const int resizeFilterType, const int resizeWindowType
+  , const __global float* resizeFilterCubicCoefficients
+  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
+  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
+  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
+
+
+    // calculate the range of resized image pixels computed by this workgroup
+    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
+    const unsigned int stopX = min(startX + pixelPerWorkgroup,filteredColumns);
+    const unsigned int actualNumPixelToCompute = stopX - startX;
+
+    // calculate the range of input image pixels to cache
+    float scale = max(1.0/xFactor+MagickEpsilon ,1.0f);
+    const float support = max(scale*resizeFilterSupport,0.5f);
+    scale = PerceptibleReciprocal(scale);
+
+    const int cacheRangeStartX = max((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
+    const int cacheRangeEndX = min((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
+
+    // cache the input pixels into local memory
+    const unsigned int y = get_global_id(1);
+    event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
+    wait_group_events(1,&e);
+
+    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
+    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
+    {
+
+      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
+      const unsigned int chunkStopX = min(chunkStartX + pixelChunkSize, stopX);
+      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
+
+      // determine which resized pixel computed by this workitem
+      const unsigned int itemID = get_local_id(0);
+      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
+      
+      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
+
+      float4 filteredPixel = (float4)0.0f;
+      float density = 0.0f;
+      float gamma = 0.0f;
+      // -1 means this workitem doesn't participate in the computation
+      if (pixelIndex != -1) {
+
+        // x coordinated of the resized pixel computed by this workitem
+        const int x = chunkStartX + pixelIndex;
+
+        // calculate how many steps required for this pixel
+        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
+        const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
+        const unsigned int stop  = (unsigned int)min(bisect+support+0.5f,(float)inputColumns);
+        const unsigned int n = stop - start;
+
+        // calculate how many steps this workitem will contribute
+        unsigned int numStepsPerWorkItem = n / numItems;
+        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
+
+        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
+        if (startStep < n) {
+          const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
+
+          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
+          if (matte == 0) {
+
+            for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
+              float4 cp = convert_float4(inputImageCache[cacheIndex]);
+
+              float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
+                , (ResizeWeightingFunctionType)resizeWindowType
+                , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
+
+              filteredPixel += ((float4)weight)*cp;
+              density+=weight;
+            }
+
+
+          }
+          else {
+            for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
+              CLPixelType p = inputImageCache[cacheIndex];
+
+              float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
+                , (ResizeWeightingFunctionType)resizeWindowType
+                , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
+
+              float alpha = weight * QuantumScale * GetPixelAlpha(p);
+              float4 cp = convert_float4(p);
+
+              filteredPixel.x += alpha * cp.x;
+              filteredPixel.y += alpha * cp.y;
+              filteredPixel.z += alpha * cp.z;
+              filteredPixel.w += weight * cp.w;
+
+              density+=weight;
+              gamma+=alpha;
+            }
+         }
+      }
+    }
+
+    // initialize the accumulators to zero
+    if (itemID < actualNumPixelInThisChunk) {
+      outputPixelCache[itemID] = (float4)0.0f;
+      densityCache[itemID] = 0.0f;
+      if (matte != 0)
+        gammaCache[itemID] = 0.0f;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // accumulatte the filtered pixel value and the density
+    for (unsigned int i = 0; i < numItems; i++) {
+      if (pixelIndex != -1) {
+        if (itemID%numItems == i) {
+          outputPixelCache[pixelIndex]+=filteredPixel;
+          densityCache[pixelIndex]+=density;
+          if (matte!=0) {
+            gammaCache[pixelIndex]+=gamma;
+          }
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (itemID < actualNumPixelInThisChunk) {
+      if (matte==0) {
+        float density = densityCache[itemID];
+        float4 filteredPixel = outputPixelCache[itemID];
+        if (density!= 0.0f && density != 1.0)
+        {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4)density;
+        }
+        filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
+                                                                       , ClampToQuantum(filteredPixel.y)
+                                                                       , ClampToQuantum(filteredPixel.z)
+                                                                       , ClampToQuantum(filteredPixel.w));
+      }
+      else {
+        float density = densityCache[itemID];
+        float gamma = gammaCache[itemID];
+        float4 filteredPixel = outputPixelCache[itemID];
+
+        if (density!= 0.0f && density != 1.0) {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4)density;
+          gamma *= density;
+        }
+        gamma = PerceptibleReciprocal(gamma);
+
+        CLPixelType fp;
+        fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
+          , ClampToQuantum(gamma*filteredPixel.y)
+          , ClampToQuantum(gamma*filteredPixel.z)
+          , ClampToQuantum(filteredPixel.w));
+
+        filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
+
+      }
+    }
+
+    } // end of chunking loop
+  }
+  )
+
+
+
+  STRINGIFY(
+ /*
+   ResizeHorizontalFilterSinc: entry point with the same parameter list as
+   ResizeHorizontalFilter.  Every argument is forwarded unchanged except the
+   two filter selectors: resizeFilterType and resizeWindowType are accepted
+   but not used, and SincWeightingFunction is passed in their place.
+ */
+ __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
+ void ResizeHorizontalFilterSinc(
+     const __global CLPixelType* inputImage,
+     const unsigned int inputColumns,
+     const unsigned int inputRows,
+     const unsigned int matte,
+     const float xFactor,
+     __global CLPixelType* filteredImage,
+     const unsigned int filteredColumns,
+     const unsigned int filteredRows,
+     const int resizeFilterType,
+     const int resizeWindowType,
+     const __global float* resizeFilterCubicCoefficients,
+     const float resizeFilterScale,
+     const float resizeFilterSupport,
+     const float resizeFilterWindowSupport,
+     const float resizeFilterBlur,
+     __local CLPixelType* inputImageCache,
+     const int numCachedPixels,
+     const unsigned int pixelPerWorkgroup,
+     const unsigned int pixelChunkSize,
+     __local float4* outputPixelCache,
+     __local float* densityCache,
+     __local float* gammaCache) {
+
+    ResizeHorizontalFilter(
+      inputImage, inputColumns, inputRows, matte,
+      xFactor, filteredImage, filteredColumns, filteredRows,
+      SincWeightingFunction, SincWeightingFunction,
+      resizeFilterCubicCoefficients,
+      resizeFilterScale, resizeFilterSupport, resizeFilterWindowSupport, resizeFilterBlur,
+      inputImageCache, numCachedPixels, pixelPerWorkgroup, pixelChunkSize,
+      outputPixelCache, densityCache, gammaCache);
+
+  }
+  )
+
+
+  STRINGIFY(
+ /*
+   ResizeVerticalFilter: resamples one column of the image along y.
+
+   The kernel requires a work group of (1, 256, 1).  Each work group handles
+   the column x = get_global_id(0) and the output rows
+   [get_group_id(1)*pixelPerWorkgroup, +pixelPerWorkgroup), clipped to
+   filteredRows.  The input rows needed for that range are first copied into
+   inputImageCache with a strided copy (stride = inputColumns).  The output
+   rows are then produced in chunks of pixelChunkSize; within a chunk the
+   filter taps of one output pixel are split between several work items and
+   the partial sums are combined through outputPixelCache, densityCache and
+   gammaCache in local memory.
+
+   When matte is non-zero the colour channels are weighted by the pixel
+   alpha and renormalised with the accumulated gamma before being written.
+ */
+ __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
+ void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
+  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
+  , const int resizeFilterType, const int resizeWindowType
+  , const __global float* resizeFilterCubicCoefficients
+  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
+  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
+  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
+
+
+    // calculate the range of resized image pixels computed by this workgroup
+    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
+    const unsigned int stopY = min(startY + pixelPerWorkgroup,filteredRows);
+    const unsigned int actualNumPixelToCompute = stopY - startY;
+
+    // calculate the range of input image pixels to cache
+    float scale = max(1.0/yFactor+MagickEpsilon ,1.0f);
+    const float support = max(scale*resizeFilterSupport,0.5f);
+    scale = PerceptibleReciprocal(scale);
+
+    const int cacheRangeStartY = max((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
+    const int cacheRangeEndY = min((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
+
+    // cache the input pixels of column x into local memory (one pixel per input row)
+    const unsigned int x = get_global_id(0);
+    event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
+    wait_group_events(1,&e);
+
+    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
+    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
+    {
+
+      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
+      const unsigned int chunkStopY = min(chunkStartY + pixelChunkSize, stopY);
+      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
+
+      // determine which resized pixel computed by this workitem
+      const unsigned int itemID = get_local_id(1);
+      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
+      
+      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
+
+      float4 filteredPixel = (float4)0.0f;
+      float density = 0.0f;
+      float gamma = 0.0f;
+      // -1 means this workitem does not participate in the computation
+      if (pixelIndex != -1) {
+
+        // y coordinate of the resized pixel computed by this workitem
+        const int y = chunkStartY + pixelIndex;
+
+        // calculate how many steps required for this pixel
+        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
+        const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
+        const unsigned int stop  = (unsigned int)min(bisect+support+0.5f,(float)inputRows);
+        const unsigned int n = stop - start;
+
+        // calculate how many steps this workitem will contribute (ceil(n/numItems))
+        unsigned int numStepsPerWorkItem = n / numItems;
+        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
+
+        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
+        if (startStep < n) {
+          const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
+
+          // index of the first tap of this workitem inside inputImageCache
+          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
+          if (matte == 0) {
+
+            for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
+              float4 cp = convert_float4(inputImageCache[cacheIndex]);
+
+              float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
+                , (ResizeWeightingFunctionType)resizeWindowType
+                , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
+
+              filteredPixel += ((float4)weight)*cp;
+              density+=weight;
+            }
+
+
+          }
+          else {
+            for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
+              CLPixelType p = inputImageCache[cacheIndex];
+
+              float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
+                , (ResizeWeightingFunctionType)resizeWindowType
+                , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
+
+              // colour channels are alpha-weighted, the w channel only filter-weighted
+              float alpha = weight * QuantumScale * GetPixelAlpha(p);
+              float4 cp = convert_float4(p);
+
+              filteredPixel.x += alpha * cp.x;
+              filteredPixel.y += alpha * cp.y;
+              filteredPixel.z += alpha * cp.z;
+              filteredPixel.w += weight * cp.w;
+
+              density+=weight;
+              gamma+=alpha;
+            }
+         }
+      }
+    }
+
+    // initialize the accumulators to zero
+    if (itemID < actualNumPixelInThisChunk) {
+      outputPixelCache[itemID] = (float4)0.0f;
+      densityCache[itemID] = 0.0f;
+      if (matte != 0)
+        gammaCache[itemID] = 0.0f;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // accumulate the filtered pixel value and the density;
+    // in round i only the workitems with itemID%numItems == i add their partial sums
+    for (unsigned int i = 0; i < numItems; i++) {
+      if (pixelIndex != -1) {
+        if (itemID%numItems == i) {
+          outputPixelCache[pixelIndex]+=filteredPixel;
+          densityCache[pixelIndex]+=density;
+          if (matte!=0) {
+            gammaCache[pixelIndex]+=gamma;
+          }
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // the first actualNumPixelInThisChunk workitems normalize and store one output pixel each
+    if (itemID < actualNumPixelInThisChunk) {
+      if (matte==0) {
+        float density = densityCache[itemID];
+        float4 filteredPixel = outputPixelCache[itemID];
+        if (density!= 0.0f && density != 1.0)
+        {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4)density;
+        }
+        filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
+                                                                       , ClampToQuantum(filteredPixel.y)
+                                                                       , ClampToQuantum(filteredPixel.z)
+                                                                       , ClampToQuantum(filteredPixel.w));
+      }
+      else {
+        float density = densityCache[itemID];
+        float gamma = gammaCache[itemID];
+        float4 filteredPixel = outputPixelCache[itemID];
+
+        if (density!= 0.0f && density != 1.0) {
+          density = PerceptibleReciprocal(density);
+          filteredPixel *= (float4)density;
+          gamma *= density;
+        }
+        gamma = PerceptibleReciprocal(gamma);
+
+        CLPixelType fp;
+        fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
+          , ClampToQuantum(gamma*filteredPixel.y)
+          , ClampToQuantum(gamma*filteredPixel.z)
+          , ClampToQuantum(filteredPixel.w));
+
+        filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
+
+      }
+    }
+
+    } // end of chunking loop
+  }
+  )
+
+
+
+  STRINGIFY(
+ /*
+   ResizeVerticalFilterSinc: entry point with the same parameter list as
+   ResizeVerticalFilter.  Every argument is forwarded unchanged except the
+   two filter selectors: resizeFilterType and resizeWindowType are accepted
+   but not used, and SincWeightingFunction is passed in their place.
+ */
+ __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
+ void ResizeVerticalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
+  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
+  , const int resizeFilterType, const int resizeWindowType
+  , const __global float* resizeFilterCubicCoefficients
+  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
+  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
+  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
+    ResizeVerticalFilter(inputImage,inputColumns,inputRows,matte
+      ,yFactor,filteredImage,filteredColumns,filteredRows
+      ,SincWeightingFunction, SincWeightingFunction
+      ,resizeFilterCubicCoefficients
+      ,resizeFilterScale,resizeFilterSupport,resizeFilterWindowSupport,resizeFilterBlur
+      ,inputImageCache,numCachedPixels,pixelPerWorkgroup,pixelChunkSize
+      ,outputPixelCache,densityCache,gammaCache);
+  }
+  )
+  ;
+
+
+#endif // MAGICKCORE_OPENCL_SUPPORT
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
-#endif
+#endif // _MAGICKCORE_ACCELERATE_PRIVATE_H
index cec404017421e3f520f16ed07d7e451dcac23776..6b5de7bc5c46273357fb54069b381e65d4cb984e 100644 (file)
@@ -14,6 +14,8 @@
 %                                                                             %
 %                              Software Design                                %
 %                               John Cristy                                   %
+%                               SiuChi Chan                                   %
+%                               Guansong Zhang                                %
 %                               January 2010                                  %
 %                                                                             %
 %                                                                             %
 %  limitations under the License.                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-% Morphology is the the application of various kernals, of any size and even
-% shape, to a image in various ways (typically binary, but not always).
-%
-% Convolution (weighted sum or average) is just one specific type of
-% accelerate. Just one that is very common for image bluring and sharpening
-% effects.  Not only 2D Gaussian blurring, but also 2-pass 1D Blurring.
-%
-% This module provides not only a general accelerate function, and the ability
-% to apply more advanced or iterative morphologies, but also functions for the
-% generation of many different types of kernel arrays from user supplied
-% arguments. Prehaps even the generation of a kernel from a small image.
 */
-\f
 /*
-  Include declarations.
+Include declarations.
 */
 #include "MagickCore/studio.h"
 #include "MagickCore/accelerate.h"
+#include "MagickCore/accelerate-private.h"
 #include "MagickCore/artifact.h"
 #include "MagickCore/cache.h"
 #include "MagickCore/cache-private.h"
 #include "MagickCore/memory_.h"
 #include "MagickCore/monitor-private.h"
 #include "MagickCore/accelerate.h"
+#include "MagickCore/opencl.h"
+#include "MagickCore/opencl-private.h"
 #include "MagickCore/option.h"
-#include "MagickCore/pixel-accessor.h"
+#include "MagickCore/pixel-private.h"
 #include "MagickCore/prepress.h"
 #include "MagickCore/quantize.h"
 #include "MagickCore/registry.h"
+#include "MagickCore/resize.h"
+#include "MagickCore/resize-private.h"
 #include "MagickCore/semaphore.h"
 #include "MagickCore/splay-tree.h"
 #include "MagickCore/statistic.h"
 #include "MagickCore/string_.h"
 #include "MagickCore/string-private.h"
 #include "MagickCore/token.h"
-\f
+
+#ifdef MAGICKCORE_CLPERFMARKER
+#include "CLPerfMarker.h"
+#endif
+
+#if defined(MAGICKCORE_OPENCL_SUPPORT)
+
+/*
+  ALIGNED(pointer,type) is true when the address held in `pointer` has none
+  of the bits of (sizeof(type)-1) set, i.e. it is a multiple of sizeof(type)
+  provided sizeof(type) is a power of two.
+  NOTE(review): the address is converted to long, which is narrower than a
+  pointer on LLP64 targets; only the low bits are tested here, but confirm
+  the conversion does not raise warnings on those platforms.
+  The commented-out variant below makes the test always fail.
+*/
+#define ALIGNED(pointer,type) ((((long)(pointer)) & (sizeof(type)-1)) == 0)
+/*#define ALIGNED(pointer,type) (0) */
+
+/*
+  checkOpenCLEnvironment() reports whether the default OpenCL environment can
+  be used.
+
+  It returns MagickFalse when the environment reports OpenCL as disabled, or
+  when the environment is not yet initialized and InitOpenCLEnv() fails;
+  otherwise it returns MagickTrue.
+
+    o exception: passed through to the environment queries and to
+      InitOpenCLEnv().
+*/
+static MagickBooleanType checkOpenCLEnvironment(ExceptionInfo* exception)
+{
+  MagickBooleanType flag;
+
+  MagickCLEnv clEnv;
+  clEnv = GetDefaultOpenCLEnv();
+
+  /* Give flag a defined value before each query so that a query which does
+     not write the output cannot leave it indeterminate.  The conservative
+     default here is "disabled". */
+  flag = MagickTrue;
+  GetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+    , sizeof(MagickBooleanType), &flag, exception);
+  if (flag == MagickTrue)
+    return MagickFalse;
+
+  /* The conservative default here is "not initialized", which triggers an
+     initialization attempt below. */
+  flag = MagickFalse;
+  GetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED
+    , sizeof(MagickBooleanType), &flag, exception);
+  if (flag == MagickFalse)
+  {
+    if(InitOpenCLEnv(clEnv, exception) == MagickFalse)
+      return MagickFalse;
+  }
+
+  return MagickTrue;
+}
+
+
+/*
+  checkAccelerateCondition() decides whether an image/channel combination may
+  be handed to the OpenCL code path.  MagickTrue is returned only when
+
+    o the image colorspace is RGBColorspace or sRGBColorspace,
+    o the red, green and blue channels are all selected in `channel`, and
+    o the virtual pixel method is undefined or EdgeVirtualPixelMethod.
+
+  The exception argument is not used by this check.
+*/
+static MagickBooleanType checkAccelerateCondition(const Image* image, const ChannelType channel, ExceptionInfo *exception) 
+{
+  int hasRed, hasGreen, hasBlue;
+
+  /* only the RGB and sRGB colorspaces are supported */
+  switch (image->colorspace)
+  {
+    case RGBColorspace:
+    case sRGBColorspace:
+      break;
+    default:
+      return MagickFalse;
+  }
+
+  /* all three colour channels have to be selected */
+  hasRed = (channel&RedChannel) != 0;
+  hasGreen = (channel&GreenChannel) != 0;
+  hasBlue = (channel&BlueChannel) != 0;
+  if (!(hasRed && hasGreen && hasBlue))
+    return MagickFalse;
+
+  /* the virtual pixel method must be compatible with the OpenCL implementation */
+  switch (GetImageVirtualPixelMethod(image))
+  {
+    case UndefinedVirtualPixelMethod:
+    case EdgeVirtualPixelMethod:
+      return MagickTrue;
+    default:
+      break;
+  }
+
+  return MagickFalse;
+}
+
+
+/*
+  ComputeConvolveImage() convolves inputImage with `kernel` on the OpenCL
+  device of the default OpenCL environment and returns the result as a newly
+  cloned image.
+
+    o inputImage: the source image; its pixel cache is handed to OpenCL.
+
+    o channel: passed unchanged to the OpenCL kernel.
+
+    o kernel: convolution kernel; its width*height values are converted to
+      float and uploaded to the device.
+
+    o exception: receives a description of any failure.
+
+  On any failure the partially built result is destroyed and NULL is
+  returned.  All OpenCL objects acquired here are released before returning.
+*/
+static Image* ComputeConvolveImage(const Image* inputImage, const ChannelType channel, const KernelInfo *kernel, ExceptionInfo *exception)
+{
+  MagickBooleanType outputReady;
+  MagickCLEnv clEnv;
+
+  cl_int clStatus;
+  size_t global_work_size[2];
+  size_t localGroupSize[2];
+  size_t localMemoryRequirement;
+  Image* filteredImage;
+  MagickSizeType length;
+  const void *inputPixels;
+  void *filteredPixels;
+  cl_mem_flags mem_flags;
+  float* kernelBufferPtr;
+  unsigned kernelSize;
+  unsigned int i;
+  void *hostPtr;
+  unsigned int matte, filterWidth, filterHeight, imageWidth, imageHeight;
+
+  cl_context context;
+  cl_kernel clkernel;
+  cl_mem inputImageBuffer, filteredImageBuffer, convolutionKernel;
+  cl_ulong deviceLocalMemorySize;
+  cl_device_id device;
+
+  cl_command_queue queue;
+
+  /* initialize all CL objects to NULL so that cleanup can tell what was acquired */
+  context = NULL;
+  inputImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  convolutionKernel = NULL;
+  clkernel = NULL;
+  queue = NULL;
+  device = NULL;
+
+  filteredImage = NULL;
+  outputReady = MagickFalse;
+  
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+
+  inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+  if (inputPixels == (const void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* Create and initialize OpenCL buffers. */
+
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* The clone must be checked at run time: an assert() disappears in
+     release builds and the pointer is dereferenced right below. */
+  filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+  if (filteredImage == (Image *) NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
+
+  /* Same alignment rule for the output: either let the device write straight
+     into the host pixels, or use a device buffer that is read back later. */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* upload the convolution kernel as an array of floats */
+  kernelSize = kernel->width * kernel->height;
+  convolutionKernel = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, kernelSize * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  kernelBufferPtr = (float*)clEnqueueMapBuffer(queue, convolutionKernel, CL_TRUE, CL_MAP_WRITE, 0, kernelSize * sizeof(float)
+          , 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  for (i = 0; i < kernelSize; i++)
+  {
+    kernelBufferPtr[i] = (float) kernel->values[i];
+  }
+  clStatus = clEnqueueUnmapMemObject(queue, convolutionKernel, kernelBufferPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+  /* Compute the local memory requirement for a 16x16 workgroup.
+     If it's larger than 16k, reduce the workgroup size to 8x8 */
+  localGroupSize[0] = 16;
+  localGroupSize[1] = 16;
+  localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
+    + kernel->width*kernel->height*sizeof(float);
+  if (localMemoryRequirement > 16384)
+  {
+
+
+    localGroupSize[0] = 8;
+    localGroupSize[1] = 8;
+
+    localMemoryRequirement = (localGroupSize[0]+kernel->width-1) * (localGroupSize[1]+kernel->height-1) * sizeof(CLPixelPacket)
+      + kernel->width*kernel->height*sizeof(float);
+  }
+
+  /* If the device query fails, treat the local memory size as 0 so that the
+     path which needs no local memory is taken instead of reading an
+     indeterminate value. */
+  deviceLocalMemorySize = 0;
+  GetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE, sizeof(cl_device_id), &device, exception);
+  if (clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &deviceLocalMemorySize, NULL) != CL_SUCCESS)
+    deviceLocalMemorySize = 0;
+  if (localMemoryRequirement <= deviceLocalMemorySize) 
+  {
+    /* NOTE(review): this branch and the fallback branch below both request
+       the kernel named "Convolve" but pass different argument lists (this
+       one adds the image dimensions and two local-memory buffers).  Confirm
+       against the kernel source which name each branch is meant to use. */
+    /* get the OpenCL kernel */
+    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Convolve");
+    if (clkernel == NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+
+    /* set the kernel arguments */
+    i = 0;
+    clStatus =clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+    imageWidth = inputImage->columns;
+    imageHeight = inputImage->rows;
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageWidth);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&imageHeight);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
+    filterWidth = kernel->width;
+    filterHeight = kernel->height;
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
+    matte = (inputImage->matte==MagickTrue)?1:0;
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+    /* the last two arguments are local-memory buffers: size only, NULL value */
+    clStatus|=clSetKernelArg(clkernel,i++, (localGroupSize[0] + kernel->width-1)*(localGroupSize[1] + kernel->height-1)*sizeof(CLPixelPacket),NULL);
+    clStatus|=clSetKernelArg(clkernel,i++, kernel->width*kernel->height*sizeof(float),NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+
+    /* pad the global size to a multiple of the local work size dimension */
+    global_work_size[0] = ((inputImage->columns + localGroupSize[0]  - 1)/localGroupSize[0] ) * localGroupSize[0] ;
+    global_work_size[1] = ((inputImage->rows + localGroupSize[1] - 1)/localGroupSize[1]) * localGroupSize[1];
+
+    /* launch the kernel */
+    clStatus = clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, localGroupSize, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+  else
+  {
+    /* fallback: not enough local memory, one work item per pixel and no
+       explicit work group size */
+    /* get the OpenCL kernel */
+    clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Convolve");
+    if (clkernel == NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+
+    /* set the kernel arguments */
+    i = 0;
+    clStatus =clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&convolutionKernel);
+    filterWidth = kernel->width;
+    filterHeight = kernel->height;
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterWidth);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&filterHeight);
+    matte = (inputImage->matte==MagickTrue)?1:0;
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&matte);
+    clStatus|=clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+      goto cleanup;
+    }
+
+    global_work_size[0] = inputImage->columns;
+    global_work_size[1] = inputImage->rows;
+
+    /* launch the kernel */
+    clStatus = clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+  clFlush(queue);
+
+  /* Bring the result back to the host with a blocking call: map the buffer
+     when it wraps the host pixels, otherwise read it into them. */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    /* NOTE(review): the pointer returned by the map is discarded and the
+       region is not unmapped before the buffer is released; confirm that
+       this is intended. */
+    length = inputImage->columns * inputImage->rows;
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* everything is fine! :) */
+  outputReady = MagickTrue;
+
+
+cleanup:
+
+  /* release whatever was acquired; every handle was preset to NULL */
+  if (inputImageBuffer != NULL)
+    clReleaseMemObject(inputImageBuffer);
+
+  if (filteredImageBuffer != NULL)
+    clReleaseMemObject(filteredImageBuffer);
+
+  if (convolutionKernel != NULL)
+    clReleaseMemObject(convolutionKernel);
+
+  if (clkernel != NULL)
+    RelinquishOpenCLKernel(clEnv, clkernel);
+
+  if (queue != NULL)
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  /* on failure do not hand a half-built image back to the caller */
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+
+  return filteredImage;
+}
+
 /*
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %                                                                             %
 %                                                                             %
 %                                                                             %
-%     A c c e l e r a t e C o n v o l v e I m a g e                           %
+%     C o n v o l v e I m a g e  w i t h  O p e n C L                         %
 %                                                                             %
 %                                                                             %
 %                                                                             %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %
-%  AccelerateConvolveImage() applies a custom convolution kernel to the image.
-%  It is accelerated by taking advantage of speed-ups offered by executing in
-%  concert across heterogeneous platforms consisting of CPUs, GPUs, and other
-%  processors.
+%  ConvolveImage() applies a custom convolution kernel to the image.
 %
-%  The format of the AccelerateConvolveImage method is:
+%  The format of the ConvolveImage method is:
 %
-%      Image *AccelerateConvolveImage(const Image *image,
-%        const KernelInfo *kernel,Image *convolve_image,
-%        ExceptionInfo *exception)
+%      Image *AccelerateConvolveImageChannel(const Image *image,
+%        const ChannelType channel,const KernelInfo *kernel,
+%        ExceptionInfo *exception)
 %
 %  A description of each parameter follows:
 %
 %    o image: the image.
 %
-%    o kernel: the convolution kernel.
+%    o channel: the channel type.
 %
-%    o convole_image: the convoleed image.
+%    o kernel: kernel info.
 %
 %    o exception: return any errors or warnings in this structure.
 %
 */
 
-#if defined(MAGICKCORE_OPENCL_SUPPORT)
+MagickExport Image* AccelerateConvolveImageChannel(const Image *image, const ChannelType channel, const KernelInfo *kernel, ExceptionInfo *exception)
+{ /* Accelerated convolution entry point.  Asserts that image, kernel and
+     exception are non-NULL, then returns NULL if either
+     checkOpenCLEnvironment() or checkAccelerateCondition() reports
+     MagickFalse; otherwise returns the result of ComputeConvolveImage(). */
+  MagickBooleanType status;
+  Image* filteredImage = NULL;
 
-#if defined(MAGICKCORE_HDRI_SUPPORT)
-#define CLOptions "-DMAGICKCORE_HDRI_SUPPORT=1 -DCLQuantum=float " \
-  "-DCLPixelType=float4 -DQuantumRange=%g -DMagickEpsilon=%g"
-#define CLPixelInfo  cl_float4
-#else
-#if (MAGICKCORE_QUANTUM_DEPTH == 8)
-#define CLOptions "-DCLQuantum=uchar -DCLPixelType=uchar4 " \
-  "-DQuantumRange=%g -DMagickEpsilon=%g"
-#define CLPixelInfo  cl_uchar4
-#elif (MAGICKCORE_QUANTUM_DEPTH == 16)
-#define CLOptions "-DCLQuantum=ushort -DCLPixelType=ushort4 " \
-  "-DQuantumRange=%g -DMagickEpsilon=%g"
-#define CLPixelInfo  cl_ushort4
-#elif (MAGICKCORE_QUANTUM_DEPTH == 32)
-#define CLOptions "-DCLQuantum=uint -DCLPixelType=uint4 " \
-  "-DQuantumRange=%g -DMagickEpsilon=%g"
-#define CLPixelInfo  cl_uint4
-#elif (MAGICKCORE_QUANTUM_DEPTH == 64)
-#define CLOptions "-DCLQuantum=ussize_t -DCLPixelType=ussize_t4 " \
-  "-DQuantumRange=%g -DMagickEpsilon=%g"
-#define CLPixelInfo  cl_ulong4
-#endif
-#endif
+  assert(image != NULL);
+  assert(kernel != (KernelInfo *) NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  status = checkOpenCLEnvironment(exception); /* first pre-check; MagickFalse means no accelerated result */
+  if (status == MagickFalse)
+    return NULL;
+
+  status = checkAccelerateCondition(image, channel, exception); /* second pre-check, specific to this image/channel */
+  if (status == MagickFalse)
+    return NULL;
 
-typedef struct _ConvolveInfo
-{
-  cl_context
-    context;
-
-  cl_device_id
-    *devices;
-
-  cl_command_queue
-    command_queue;
-
-  cl_kernel
-    kernel;
-
-  cl_program
-    program;
-
-  cl_mem
-    pixels,
-    convolve_pixels;
-
-  cl_ulong
-    width,
-    height;
-
-  cl_uint
-    matte;
-
-  cl_mem
-    filter;
-} ConvolveInfo;
-
-static const char
-  *ConvolveKernel =
-    "static inline long ClampToCanvas(const long offset,const unsigned long range)\n"
-    "{\n"
-    "  if (offset < 0L)\n"
-    "    return(0L);\n"
-    "  if (offset >= range)\n"
-    "    return((long) (range-1L));\n"
-    "  return(offset);\n"
-    "}\n"
-    "\n"
-    "static inline CLQuantum ClampToQuantum(const float value)\n"
-    "{\n"
-    "#if defined(MAGICKCORE_HDRI_SUPPORT)\n"
-    "  return((CLQuantum) value);\n"
-    "#else\n"
-    "  if (value < 0.0)\n"
-    "    return((CLQuantum) 0);\n"
-    "  if (value >= (float) QuantumRange)\n"
-    "    return((CLQuantum) QuantumRange);\n"
-    "  return((CLQuantum) (value+0.5));\n"
-    "#endif\n"
-    "}\n"
-    "\n"
-    "static inline float PerceptibleReciprocal(const float x)\n"
-    "{\n"
-    "  float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;\n"
-    "  return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/\n"
-    "    MagickEpsilon));\n"
-    "}\n"
-    "\n"
-    "__kernel void Convolve(const __global CLPixelType *input,\n"
-    "  __constant float *filter,const unsigned long width,const unsigned long height,\n"
-    "  const unsigned int matte,__global CLPixelType *output)\n"
-    "{\n"
-    "  const unsigned long columns = get_global_size(0);\n"
-    "  const unsigned long rows = get_global_size(1);\n"
-    "\n"
-    "  const long x = get_global_id(0);\n"
-    "  const long y = get_global_id(1);\n"
-    "\n"
-    "  const float scale = (1.0/QuantumRange);\n"
-    "  const long mid_width = (width-1)/2;\n"
-    "  const long mid_height = (height-1)/2;\n"
-    "  float4 sum = { 0.0, 0.0, 0.0, 0.0 };\n"
-    "  float gamma = 0.0;\n"
-    "  register unsigned long i = 0;\n"
-    "\n"
-    "  int method = 0;\n"
-    "  if (matte != false)\n"
-    "    method=1;\n"
-    "  if ((x >= width) && (x < (columns-width-1)) &&\n"
-    "      (y >= height) && (y < (rows-height-1)))\n"
-    "    {\n"
-    "      method=2;\n"
-    "      if (matte != false)\n"
-    "        method=3;\n"
-    "    }\n"
-    "  switch (method)\n"
-    "  {\n"
-    "    case 0:\n"
-    "    {\n"
-    "      for (long v=(-mid_height); v <= mid_height; v++)\n"
-    "      {\n"
-    "        for (long u=(-mid_width); u <= mid_width; u++)\n"
-    "        {\n"
-    "          const long index=ClampToCanvas(y+v,rows)*columns+\n"
-    "            ClampToCanvas(x+u,columns);\n"
-    "          sum.x+=filter[i]*input[index].x;\n"
-    "          sum.y+=filter[i]*input[index].y;\n"
-    "          sum.z+=filter[i]*input[index].z;\n"
-    "          gamma+=filter[i];\n"
-    "          i++;\n"
-    "        }\n"
-    "      }\n"
-    "      break;\n"
-    "    }\n"
-    "    case 1:\n"
-    "    {\n"
-    "      for (long v=(-mid_height); v <= mid_height; v++)\n"
-    "      {\n"
-    "        for (long u=(-mid_width); u <= mid_width; u++)\n"
-    "        {\n"
-    "          const unsigned long index=ClampToCanvas(y+v,rows)*columns+\n"
-    "            ClampToCanvas(x+u,columns);\n"
-    "          const float alpha=scale*input[index].w;\n"
-    "          sum.x+=alpha*filter[i]*input[index].x;\n"
-    "          sum.y+=alpha*filter[i]*input[index].y;\n"
-    "          sum.z+=alpha*filter[i]*input[index].z;\n"
-    "          sum.w+=filter[i]*input[index].w;\n"
-    "          gamma+=alpha*filter[i];\n"
-    "          i++;\n"
-    "        }\n"
-    "      }\n"
-    "      break;\n"
-    "    }\n"
-    "    case 2:\n"
-    "    {\n"
-    "      for (long v=(-mid_height); v <= mid_height; v++)\n"
-    "      {\n"
-    "        for (long u=(-mid_width); u <= mid_width; u++)\n"
-    "        {\n"
-    "          const unsigned long index=(y+v)*columns+(x+u);\n"
-    "          sum.x+=filter[i]*input[index].x;\n"
-    "          sum.y+=filter[i]*input[index].y;\n"
-    "          sum.z+=filter[i]*input[index].z;\n"
-    "          gamma+=filter[i];\n"
-    "          i++;\n"
-    "        }\n"
-    "      }\n"
-    "      break;\n"
-    "    }\n"
-    "    case 3:\n"
-    "    {\n"
-    "      for (long v=(-mid_height); v <= mid_height; v++)\n"
-    "      {\n"
-    "        for (long u=(-mid_width); u <= mid_width; u++)\n"
-    "        {\n"
-    "          const unsigned long index=(y+v)*columns+(x+u);\n"
-    "          const float alpha=scale*input[index].w;\n"
-    "          sum.x+=alpha*filter[i]*input[index].x;\n"
-    "          sum.y+=alpha*filter[i]*input[index].y;\n"
-    "          sum.z+=alpha*filter[i]*input[index].z;\n"
-    "          sum.w+=filter[i]*input[index].w;\n"
-    "          gamma+=alpha*filter[i];\n"
-    "          i++;\n"
-    "        }\n"
-    "      }\n"
-    "      break;\n"
-    "    }\n"
-    "  }\n"
-    "  gamma=PerceptibleReciprocal(gamma);\n"
-    "  const unsigned long index = y*columns+x;\n"
-    "  output[index].x=ClampToQuantum(gamma*sum.x);\n"
-    "  output[index].y=ClampToQuantum(gamma*sum.y);\n"
-    "  output[index].z=ClampToQuantum(gamma*sum.z);\n"
-    "  if (matte == false)\n"
-    "    output[index].w=input[index].w;\n"
-    "  else\n"
-    "    output[index].w=ClampToQuantum(sum.w);\n"
-    "}\n";
-
-static MagickDLLCall void ConvolveNotify(const char *message,const void *data,
-  size_t length,void *user_context)
+  filteredImage = ComputeConvolveImage(image, channel, kernel, exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception); /* called unconditionally, whatever ComputeConvolveImage() returned */
+  return filteredImage;
+}
+/* Runs the "FunctionImage" OpenCL kernel over the pixel cache of `image`.
+
+   The pointer returned by GetPixelCachePixels() is wrapped in (or copied
+   into) a CL buffer, `parameters` is narrowed to float and uploaded through
+   a mapped buffer, and the kernel is launched over a columns x rows 2-D
+   range.  Afterwards the image buffer is mapped or read back into the same
+   pixel pointer.
+
+   Returns MagickTrue only when every step succeeds.  On any failure an
+   exception is recorded with ThrowMagickException() and MagickFalse is
+   returned.  The kernel, the command queue and both CL buffers are
+   released at `cleanup` on every path. */
+static MagickBooleanType ComputeFunctionImage(Image *image, const ChannelType channel,const MagickFunction function,
+  const size_t number_parameters,const double *parameters, ExceptionInfo *exception)
 {
-  ExceptionInfo
-    *exception;
+  MagickBooleanType status;
+
+  MagickCLEnv clEnv;
+
+  MagickSizeType length;
+  void* pixels;
+  float* parametersBufferPtr;
+
+  cl_int clStatus;
+  cl_context context;
+  cl_kernel clkernel;
+  cl_command_queue queue;
+  cl_mem_flags mem_flags;
+  cl_mem imageBuffer;
+  cl_mem parametersBuffer;
+  size_t globalWorkSize[2];
+
+  unsigned int i;
+
+  status = MagickFalse;
+
+  context = NULL; /* handles start out NULL so `cleanup` can tell what was acquired */
+  clkernel = NULL;
+  queue = NULL;
+  imageBuffer = NULL;
+  parametersBuffer = NULL;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+
+  pixels = GetPixelCachePixels(image, &length, exception);
+  if (pixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), CacheWarning,
+      "GetPixelCachePixels failed.",
+      "'%s'", image->filename);
+    goto cleanup;
+  }
+
+
+  if (ALIGNED(pixels,CLPixelPacket)) /* aligned host pointer: the CL buffer uses the host pixels directly */
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+  }
+  else /* otherwise the pixels are copied into a separate CL buffer */
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = image->columns * image->rows; /* pixel count; replaces the value GetPixelCachePixels() stored */
+  imageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)pixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  parametersBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, number_parameters * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  parametersBufferPtr = (float*)clEnqueueMapBuffer(queue, parametersBuffer, CL_TRUE, CL_MAP_WRITE, 0, number_parameters * sizeof(float)
+                , 0, NULL, NULL, &clStatus); /* blocking map (CL_TRUE), so the pointer can be written right away */
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  for (i = 0; i < number_parameters; i++)
+  {
+    parametersBufferPtr[i] = (float)parameters[i]; /* narrow each double to the float stored in the buffer */
+  }
+  clStatus = clEnqueueUnmapMemObject(queue, parametersBuffer, parametersBufferPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
 
-  (void) data;
-  (void) length;
-  exception=(ExceptionInfo *) user_context;
-  (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
-    "DelegateFailed","`%s'",message);
+  clkernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "FunctionImage");
+  if (clkernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* set the kernel arguments */
+  i = 0;
+  clStatus =clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&imageBuffer); /* statuses are OR-ed together and tested once below */
+  clStatus|=clSetKernelArg(clkernel,i++,sizeof(ChannelType),(void *)&channel);
+  clStatus|=clSetKernelArg(clkernel,i++,sizeof(MagickFunction),(void *)&function);
+  clStatus|=clSetKernelArg(clkernel,i++,sizeof(unsigned int),(void *)&number_parameters); /* NOTE(review): number_parameters is a size_t but only sizeof(unsigned int) bytes are passed - confirm this is intended on 64-bit/big-endian hosts */
+  clStatus|=clSetKernelArg(clkernel,i++,sizeof(cl_mem),(void *)&parametersBuffer);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  globalWorkSize[0] = image->columns; /* one work-item per pixel: columns x rows */
+  globalWorkSize[1] = image->rows;
+  /* launch the kernel */
+  clStatus = clEnqueueNDRangeKernel(queue, clkernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+
+  if (ALIGNED(pixels,CLPixelPacket)) /* same alignment test that selected mem_flags above */
+  {
+    length = image->columns * image->rows;
+    clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); /* NOTE(review): the mapped pointer is discarded and no matching unmap appears in this function - confirm */
+  }
+  else 
+  {
+    length = image->columns * image->rows;
+    clStatus = clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), pixels, 0, NULL, NULL); /* blocking read back into the pixel cache pointer */
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  status = MagickTrue;
+
+cleanup:
+  
+  if (clkernel != NULL) RelinquishOpenCLKernel(clEnv, clkernel);
+  if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (imageBuffer != NULL) clReleaseMemObject(imageBuffer);
+  if (parametersBuffer != NULL) clReleaseMemObject(parametersBuffer);
+
+  return status;
 }
 
-static MagickBooleanType BindConvolveParameters(ConvolveInfo *convolve_info,
-  const Image *image,const void *pixels,float *filter,const size_t width,
-  const size_t height,void *convolve_pixels)
+
+/* Exported wrapper around ComputeFunctionImage().
+
+   Asserts that image and exception are non-NULL, then runs
+   checkOpenCLEnvironment() and checkAccelerateCondition() in that order.
+   ComputeFunctionImage() is called only when both report MagickTrue, and
+   OpenCLLogException() is called right after it.
+
+   Returns the status of the last step that ran, so MagickFalse is returned
+   when either check fails or when ComputeFunctionImage() fails. */
+MagickExport MagickBooleanType 
+  AccelerateFunctionImage(Image *image, const ChannelType channel,const MagickFunction function,
+  const size_t number_parameters,const double *parameters, ExceptionInfo *exception)
 {
-  cl_int
-    status;
+  MagickBooleanType status;
 
-  register cl_uint
-    i;
+  status = MagickFalse;
 
-  size_t
-    length;
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  status = checkOpenCLEnvironment(exception);
+  if (status == MagickTrue)
+  {
+    status = checkAccelerateCondition(image, channel, exception);
+    if (status == MagickTrue)
+    {
+      status = ComputeFunctionImage(image, channel, function, number_parameters, parameters, exception);
+      OpenCLLogException(__FUNCTION__,__LINE__,exception); /* logged only on the path where the computation was attempted */
+    }
+  }
+  return status;
+}
 
-  /*
-    Allocate OpenCL buffers.
-  */
-  length=image->columns*image->rows;
-  convolve_info->pixels=clCreateBuffer(convolve_info->context,(cl_mem_flags)
-    (CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR),length*sizeof(CLPixelInfo),
-    (void *) pixels,&status);
-  if ((convolve_info->pixels == (cl_mem) NULL) || (status != CL_SUCCESS))
-    return(MagickFalse);
-  length=width*height;
-  convolve_info->filter=clCreateBuffer(convolve_info->context,(cl_mem_flags)
-    (CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR),length*sizeof(cl_float),filter,
-    &status);
-  if ((convolve_info->filter == (cl_mem) NULL) || (status != CL_SUCCESS))
-    return(MagickFalse);
-  length=image->columns*image->rows;
-  convolve_info->convolve_pixels=clCreateBuffer(convolve_info->context,
-    (cl_mem_flags) (CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR),length*
-    sizeof(CLPixelInfo),convolve_pixels,&status);
-  if ((convolve_info->convolve_pixels == (cl_mem) NULL) ||
-      (status != CL_SUCCESS))
-    return(MagickFalse);
-  /*
-    Bind OpenCL buffers.
-  */
-  i=0;
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_mem),(void *)
-    &convolve_info->pixels);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_mem),(void *)
-    &convolve_info->filter);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  convolve_info->width=(cl_ulong) width;
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_ulong),(void *)
-    &convolve_info->width);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  convolve_info->height=(cl_ulong) height;
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_ulong),(void *)
-    &convolve_info->height);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  convolve_info->matte=(cl_uint) image->alpha_trait == BlendPixelTrait ?
-    MagickTrue : MagickFalse;
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_uint),(void *)
-    &convolve_info->matte);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  status=clSetKernelArg(convolve_info->kernel,i++,sizeof(cl_mem),(void *)
-    &convolve_info->convolve_pixels);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  status=clFinish(convolve_info->command_queue);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  return(MagickTrue);
-}
-
-static void DestroyConvolveBuffers(ConvolveInfo *convolve_info)
-{
-  cl_int
-    status;
-
-  status=0;
-  if (convolve_info->convolve_pixels != (cl_mem) NULL)
-    status=clReleaseMemObject(convolve_info->convolve_pixels);
-  if (convolve_info->pixels != (cl_mem) NULL)
-    status=clReleaseMemObject(convolve_info->pixels);
-  if (convolve_info->filter != (cl_mem) NULL)
-    status=clReleaseMemObject(convolve_info->filter);
-  (void) status;
-}
-
-static ConvolveInfo *DestroyConvolveInfo(ConvolveInfo *convolve_info)
-{
-  cl_int
-    status;
-
-  status=0;
-  if (convolve_info->kernel != (cl_kernel) NULL)
-    status=clReleaseKernel(convolve_info->kernel);
-  if (convolve_info->program != (cl_program) NULL)
-    status=clReleaseProgram(convolve_info->program);
-  if (convolve_info->command_queue != (cl_command_queue) NULL)
-    status=clReleaseCommandQueue(convolve_info->command_queue);
-  if (convolve_info->context != (cl_context) NULL)
-    status=clReleaseContext(convolve_info->context);
-  (void) status;
-  convolve_info=(ConvolveInfo *) RelinquishMagickMemory(convolve_info);
-  return(convolve_info);
-}
-
-static MagickBooleanType EnqueueConvolveKernel(ConvolveInfo *convolve_info,
-  const Image *image,const void *pixels,float *filter,const size_t width,
-  const size_t height,void *convolve_pixels)
-{
-  cl_int
-    status;
-
-  size_t
-    global_work_size[2],
-    length;
-
-  length=image->columns*image->rows;
-  status=clEnqueueWriteBuffer(convolve_info->command_queue,
-    convolve_info->pixels,CL_TRUE,0,length*sizeof(CLPixelInfo),pixels,0,NULL,
-    NULL);
-  length=width*height;
-  status=clEnqueueWriteBuffer(convolve_info->command_queue,
-    convolve_info->filter,CL_TRUE,0,length*sizeof(cl_float),filter,0,NULL,
-    NULL);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  global_work_size[0]=image->columns;
-  global_work_size[1]=image->rows;
-  status=clEnqueueNDRangeKernel(convolve_info->command_queue,
-    convolve_info->kernel,2,NULL,global_work_size,NULL,0,NULL,NULL);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  length=image->columns*image->rows;
-  status=clEnqueueReadBuffer(convolve_info->command_queue,
-    convolve_info->convolve_pixels,CL_TRUE,0,length*sizeof(CLPixelInfo),
-    convolve_pixels,0,NULL,NULL);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  status=clFinish(convolve_info->command_queue);
-  if (status != CL_SUCCESS)
-    return(MagickFalse);
-  return(MagickTrue);
-}
-
-static ConvolveInfo *GetConvolveInfo(const Image *image,const char *name,
-  const char *source,ExceptionInfo *exception)
-{
-  char
-    options[MaxTextExtent];
-
-  cl_context_properties
-    context_properties[3];
-
-  cl_int
-    status;
-
-  cl_platform_id
-    platforms[1];
-
-  cl_uint
-    number_platforms;
-
-  ConvolveInfo
-    *convolve_info;
-
-  size_t
-    length,
-    lengths[] = { strlen(source) };
+/* Reports whether an image-sized working buffer would exceed the largest
+   single allocation of the default OpenCL device.
+
+   Returns MagickTrue when columns * rows * 4 * 4 is greater than the value
+   returned by GetOpenCLDeviceMaxMemAllocSize(), and MagickFalse otherwise.
+
+   NOTE(review): the 4 * 4 factor presumably means four channels of
+   four-byte floats, which would match the length * 4 * sizeof(float) temp
+   buffer in ComputeBlurImage() - confirm.
+   NOTE(review): the product is kept in an unsigned long; confirm it cannot
+   wrap for very large images where unsigned long is 32 bits wide. */
+static MagickBooleanType splitImage(const Image* inputImage)
+{
+  MagickBooleanType split;
+
+  MagickCLEnv clEnv;
+  unsigned long allocSize;
+  unsigned long tempSize;
+
+  clEnv = GetDefaultOpenCLEnv();
+  allocSize = GetOpenCLDeviceMaxMemAllocSize(clEnv);
+  tempSize = inputImage->columns * inputImage->rows * 4 * 4;
 
   /*
-    Create OpenCL info.
+  printf("alloc size: %lu\n", allocSize);
+  printf("temp size: %lu\n", tempSize);
   */
-  convolve_info=(ConvolveInfo *) AcquireMagickMemory(sizeof(*convolve_info));
-  if (convolve_info == (ConvolveInfo *) NULL)
+
+  split = ((tempSize > allocSize) ? MagickTrue:MagickFalse);
+
+  return split;
+}
+
+static Image* ComputeBlurImage(const Image* inputImage, const ChannelType channel, const double radius, const double sigma, ExceptionInfo *exception)
+{
+  MagickBooleanType outputReady;
+  Image* filteredImage;
+  MagickCLEnv clEnv;
+
+  cl_int clStatus;
+
+  const void *inputPixels;
+  void *filteredPixels;
+  cl_mem_flags mem_flags;
+
+  cl_context context;
+  cl_mem inputImageBuffer, tempImageBuffer, filteredImageBuffer, imageKernelBuffer;
+  cl_kernel blurRowKernel, blurColumnKernel;
+  cl_command_queue queue;
+
+  void* hostPtr;
+  float* kernelBufferPtr;
+  MagickSizeType length;
+
+  char geometry[MaxTextExtent];
+  KernelInfo* kernel = NULL;
+  unsigned int kernelWidth;
+  unsigned int imageColumns, imageRows;
+
+  unsigned int i;
+
+  context = NULL;
+  filteredImage = NULL;
+  inputImageBuffer = NULL;
+  tempImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  blurColumnKernel = NULL;
+  queue = NULL;
+
+  outputReady = MagickFalse;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    inputPixels = NULL;
+    inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+    if (inputPixels == (const void *) NULL)
     {
-      (void) ThrowMagickException(exception,GetMagickModule(),
-        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
-      return((ConvolveInfo *) NULL);
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+      goto cleanup;
     }
-  (void) ResetMagickMemory(convolve_info,0,sizeof(*convolve_info));
-  /*
-    Create OpenCL context.
-  */
-  status=clGetPlatformIDs(0,(cl_platform_id *) NULL,&number_platforms);
-  if ((status == CL_SUCCESS) && (number_platforms > 0))
-    status=clGetPlatformIDs(1,platforms,NULL);
-  if (status != CL_SUCCESS)
-    {
-      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
-        "failed to create OpenCL context","'%s' (%d)",image->filename,status);
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
-    }
-  context_properties[0]=CL_CONTEXT_PLATFORM;
-  context_properties[1]=(cl_context_properties) platforms[0];
-  context_properties[2]=0;
-  convolve_info->context=clCreateContextFromType(context_properties,
-    (cl_device_type) CL_DEVICE_TYPE_GPU,ConvolveNotify,exception,&status);
-  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
-    convolve_info->context=clCreateContextFromType(context_properties,
-      (cl_device_type) CL_DEVICE_TYPE_CPU,ConvolveNotify,exception,&status);
-  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
-    convolve_info->context=clCreateContextFromType(context_properties,
-      (cl_device_type) CL_DEVICE_TYPE_DEFAULT,ConvolveNotify,exception,&status);
-  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
-    {
-      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
-        "failed to create OpenCL context","'%s' (%d)",image->filename,status);
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
     }
-  /*
-    Detect OpenCL devices.
-  */
-  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,0,NULL,
-    &length);
-  if ((status != CL_SUCCESS) || (length == 0))
-    {
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
-    }
-  convolve_info->devices=(cl_device_id *) AcquireMagickMemory(length);
-  if (convolve_info->devices == (cl_device_id *) NULL)
-    {
-      (void) ThrowMagickException(exception,GetMagickModule(),
-        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
-    }
-  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,length,
-    convolve_info->devices,NULL);
-  if (status != CL_SUCCESS)
-    {
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
-    }
-  if (image->debug != MagickFalse)
-    {
-      char
-        attribute[MaxTextExtent];
-
-      size_t
-        length;
-
-      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_NAME,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Name: %s",
-        attribute);
-      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VENDOR,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Vendor: %s",
-        attribute);
-      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VERSION,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),
-        "Driver Version: %s",attribute);
-      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_PROFILE,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Profile: %s",
-        attribute);
-      clGetDeviceInfo(convolve_info->devices[0],CL_DRIVER_VERSION,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Driver: %s",
-        attribute);
-      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_EXTENSIONS,
-        sizeof(attribute),attribute,&length);
-      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Extensions: %s",
-        attribute);
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
     }
-  /*
-    Create OpenCL command queue.
-  */
-  convolve_info->command_queue=clCreateCommandQueue(convolve_info->context,
-    convolve_info->devices[0],0,&status);
-  if ((convolve_info->command_queue == (cl_command_queue) NULL) ||
-      (status != CL_SUCCESS))
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
     {
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
     }
-  /*
-    Build OpenCL program.
-  */
-  convolve_info->program=clCreateProgramWithSource(convolve_info->context,1,
-    &source,lengths,&status);
-  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
-    {
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
-    }
-  (void) FormatLocaleString(options,MaxTextExtent,CLOptions,(float)
-    QuantumRange,MagickEpsilon);
-  status=clBuildProgram(convolve_info->program,1,convolve_info->devices,options,
-    NULL,NULL);
-  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
-    {
-      char
-        *log;
-
-      status=clGetProgramBuildInfo(convolve_info->program,
-        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&length);
-      log=(char *) AcquireMagickMemory(length);
-      if (log == (char *) NULL)
-        {
-          convolve_info=DestroyConvolveInfo(convolve_info);
-          return((ConvolveInfo *) NULL);
-        }
-      status=clGetProgramBuildInfo(convolve_info->program,
-        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,length,log,&length);
-      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
-        "failed to build OpenCL program","'%s' (%s)",image->filename,log);
-      log=DestroyString(log);
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+    assert(filteredImage != NULL);
+    if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
     }
-  /*
-    Get a kernel object.
-  */
-  convolve_info->kernel=clCreateKernel(convolve_info->program,name,&status);
-  if ((convolve_info->kernel == (cl_kernel) NULL) || (status != CL_SUCCESS))
+    filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+    if (filteredPixels == (void *) NULL)
     {
-      convolve_info=DestroyConvolveInfo(convolve_info);
-      return((ConvolveInfo *) NULL);
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
     }
-  return(convolve_info);
-}
 
-#endif
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
 
-MagickExport MagickBooleanType AccelerateConvolveImage(const Image *image,
-  const KernelInfo *kernel,Image *convolve_image,ExceptionInfo *exception)
-{
-  assert(image != (Image *) NULL);
-  assert(image->signature == MagickSignature);
-  if (image->debug != MagickFalse)
-    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",image->filename);
-  assert(kernel != (KernelInfo *) NULL);
-  assert(kernel->signature == MagickSignature);
-  assert(convolve_image != (Image *) NULL);
-  assert(convolve_image->signature == MagickSignature);
-  assert(exception != (ExceptionInfo *) NULL);
-  assert(exception->signature == MagickSignature);
-  if ((image->storage_class != DirectClass) ||
-      (image->colorspace == CMYKColorspace))
-    return(MagickFalse);
-  if ((GetImageVirtualPixelMethod(image) != UndefinedVirtualPixelMethod) &&
-      (GetImageVirtualPixelMethod(image) != EdgeVirtualPixelMethod))
-    return(MagickFalse);
-  if (GetPixelChannels(image) != 4)
-    return(MagickFalse);
-#if !defined(MAGICKCORE_OPENCL_SUPPORT)
-  return(MagickFalse);
-#else
+  /* create processing kernel */
   {
-    const void
-      *pixels;
+    (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "MemoryAllocationFailed.",".");
+      goto cleanup;
+    }
 
-    float
-      *filter;
+    imageKernelBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+    kernelBufferPtr = (float*)clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
 
-    ConvolveInfo
-      *convolve_info;
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
 
-    MagickBooleanType
-      status;
+    clStatus = clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
 
-    MagickSizeType
-      length;
+  {
 
-    register ssize_t
-      i;
+    /* create temp buffer */
+    {
+      length = inputImage->columns * inputImage->rows;
+      tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
+
+    /* get the OpenCL kernels */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRow");
+      if (blurRowKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+
+      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurColumn");
+      if (blurColumnKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      };
+    }
+
+    {
+      /* need logic to decide this value */
+      int chunkSize = 256;
+
+      {
+        imageColumns = inputImage->columns;
+        imageRows = inputImage->rows;
+
+        /* set the kernel arguments */
+        i = 0;
+        clStatus=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        kernelWidth = kernel->width;
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *)NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+
+      /* launch the kernel */
+      {
+        size_t gsize[2];
+        size_t wsize[2];
+
+        gsize[0] = chunkSize*((inputImage->columns+chunkSize-1)/chunkSize);
+        gsize[1] = inputImage->rows;
+        wsize[0] = chunkSize;
+        wsize[1] = 1;
+
+        clStatus = clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clFlush(queue);
+      }
+    }
 
-    void
-      *convolve_pixels;
+    {
+      /* need logic to decide this value */
+      int chunkSize = 256;
 
-    convolve_info=GetConvolveInfo(image,"Convolve",ConvolveKernel,exception);
-    if (convolve_info == (ConvolveInfo *) NULL)
-      return(MagickFalse);
-    pixels=AcquirePixelCachePixels(image,&length,exception);
-    if (pixels == (const void *) NULL)
       {
-        convolve_info=DestroyConvolveInfo(convolve_info);
-        (void) ThrowMagickException(exception,GetMagickModule(),CacheError,
-          "UnableToReadPixelCache","`%s'",image->filename);
-        return(MagickFalse);
+        imageColumns = inputImage->columns;
+        imageRows = inputImage->rows;
+
+        /* set the kernel arguments */
+        i = 0;
+        clStatus=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        kernelWidth = kernel->width;
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_float4)*(chunkSize+kernel->width),(void *)NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
       }
-    convolve_pixels=GetPixelCachePixels(convolve_image,&length,exception);
-    if (convolve_pixels == (void *) NULL)
+
+      /* launch the kernel */
       {
-        convolve_info=DestroyConvolveInfo(convolve_info);
-        (void) ThrowMagickException(exception,GetMagickModule(),CacheError,
-          "UnableToReadPixelCache","`%s'",image->filename);
-        return(MagickFalse);
+        size_t gsize[2];
+        size_t wsize[2];
+
+        gsize[0] = inputImage->columns;
+        gsize[1] = chunkSize*((inputImage->rows+chunkSize-1)/chunkSize);
+        wsize[0] = 1;
+        wsize[1] = chunkSize;
+
+        clStatus = clEnqueueNDRangeKernel(queue, blurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clFlush(queue);
       }
-    filter=(float *) AcquireQuantumMemory(kernel->width,kernel->height*
-      sizeof(*filter));
-    if (filter == (float *) NULL)
+    }
+
+  }
+
+  /* get result */ 
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    length = inputImage->columns * inputImage->rows;
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+  if (inputImageBuffer!=NULL)     clReleaseMemObject(inputImageBuffer);
+  if (tempImageBuffer!=NULL)      clReleaseMemObject(tempImageBuffer);
+  if (filteredImageBuffer!=NULL)  clReleaseMemObject(filteredImageBuffer);
+  if (imageKernelBuffer!=NULL)    clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)        RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (blurColumnKernel!=NULL)     RelinquishOpenCLKernel(clEnv, blurColumnKernel);
+  if (queue != NULL)              RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (kernel!=NULL)               DestroyKernelInfo(kernel);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+/*
+  ComputeBlurImageSection() blurs inputImage with OpenCL, processing the image
+  as two row sections (sec = 0 and sec = 1).  For each section the
+  "BlurRowSection" kernel writes into a temporary float4 buffer and the
+  "BlurColumnSection" kernel writes into the output image buffer.
+
+    o inputImage: the image to blur.
+    o channel: the channel type, forwarded unchanged to both kernels.
+    o radius, sigma: used to build the kernel geometry string passed to
+      AcquireKernelInfo().
+    o exception: receives a warning/error for every failing step.
+
+  Returns a newly cloned, blurred image, or NULL if any step failed (the
+  partially built clone is destroyed before returning).
+*/
+static Image* ComputeBlurImageSection(const Image* inputImage, const ChannelType channel, const double radius, const double sigma, ExceptionInfo *exception)
+{
+  MagickBooleanType outputReady;
+  Image* filteredImage;
+  MagickCLEnv clEnv;
+
+  cl_int clStatus;
+
+  const void *inputPixels;
+  void *filteredPixels;
+  cl_mem_flags mem_flags;
+
+  cl_context context;
+  cl_mem inputImageBuffer, tempImageBuffer, filteredImageBuffer, imageKernelBuffer;
+  cl_kernel blurRowKernel, blurColumnKernel;
+  cl_command_queue queue;
+
+  void* hostPtr;
+  float* kernelBufferPtr;
+  MagickSizeType length;
+
+  char geometry[MaxTextExtent];
+  KernelInfo* kernel = NULL;
+  unsigned int kernelWidth;
+  unsigned int imageColumns, imageRows;
+
+  unsigned int i;
+
+  /* every handle released at "cleanup" starts out NULL so that an early
+     goto only releases what was actually acquired */
+  context = NULL;
+  filteredImage = NULL;
+  inputImageBuffer = NULL;
+  tempImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  imageKernelBuffer = NULL;
+  blurRowKernel = NULL;
+  blurColumnKernel = NULL;
+  queue = NULL;
+
+  outputReady = MagickFalse;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+      goto cleanup;
+    }
+    /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+    /* a failed clone is a run-time condition, not a programming error:
+       report it and unwind instead of asserting */
+    if ((filteredImage == NULL) || (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue))
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    /* same alignment rule as for the input: wrap the host pixels when
+       aligned, otherwise allocate a device-side buffer and read it back */
+    if (ALIGNED(filteredPixels,CLPixelPacket)) 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else 
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create processing kernel */
+  {
+    (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "MemoryAllocationFailed.",".");
+      goto cleanup;
+    }
+
+    /* device buffer holding kernel->width filter weights as floats */
+    imageKernelBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+    kernelBufferPtr = (float*)clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+
+    /* only the first kernel->width values are uploaded */
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+
+    clStatus = clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  {
+    unsigned int offsetRows;
+    unsigned int sec;
+
+    /* create temp buffer: one float4 per pixel for the larger of the two
+       sections plus the extra (kernel->width-1)/2 rows of the row pass */
+    {
+      length = inputImage->columns * (inputImage->rows / 2 + 1 + (kernel->width-1) / 2);
+      tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
       {
-        DestroyConvolveBuffers(convolve_info);
-        convolve_info=DestroyConvolveInfo(convolve_info);
-        (void) ThrowMagickException(exception,GetMagickModule(),
-          ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
-        return(MagickFalse);
+        (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+        goto cleanup;
       }
-    for (i=0; i < (ssize_t) (kernel->width*kernel->height); i++)
-      filter[i]=(float) kernel->values[i];
-    status=BindConvolveParameters(convolve_info,image,pixels,filter,
-      kernel->width,kernel->height,convolve_pixels);
-    if (status == MagickFalse)
+    }
+
+    /* get the OpenCL kernels */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRowSection");
+      if (blurRowKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+
+      blurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurColumnSection");
+      if (blurColumnKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* process the two row sections one after the other */
+    for (sec = 0; sec < 2; sec++)
+    {
       {
-        filter=(float *) RelinquishMagickMemory(filter);
-        DestroyConvolveBuffers(convolve_info);
-        convolve_info=DestroyConvolveInfo(convolve_info);
-        return(MagickFalse);
+        /* row pass */
+        /* need logic to decide this value */
+        int chunkSize = 256;
+
+        {
+          imageColumns = inputImage->columns;
+          /* the row pass covers (kernel->width-1)/2 more rows than the
+             section itself */
+          if (sec == 0)
+            imageRows = inputImage->rows / 2 + (kernel->width-1) / 2;
+          else
+            imageRows = (inputImage->rows - inputImage->rows / 2) + (kernel->width-1) / 2;
+
+          offsetRows = sec * inputImage->rows / 2;
+
+          kernelWidth = kernel->width;
+
+          /* set the kernel arguments */
+          i = 0;
+          clStatus=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+          /* NULL argument value with a size: local memory for the kernel */
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *)NULL);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+          clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&sec);
+          if (clStatus != CL_SUCCESS)
+          {
+            (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+            goto cleanup;
+          }
+        }
+
+        /* launch the kernel */
+        {
+          size_t gsize[2];
+          size_t wsize[2];
+
+          /* round the column count up to a whole number of work-groups */
+          gsize[0] = chunkSize*((imageColumns+chunkSize-1)/chunkSize);
+          gsize[1] = imageRows;
+          wsize[0] = chunkSize;
+          wsize[1] = 1;
+
+          clStatus = clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+          if (clStatus != CL_SUCCESS)
+          {
+            (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+            goto cleanup;
+          }
+          clFlush(queue);
+        }
       }
-    status=EnqueueConvolveKernel(convolve_info,image,pixels,filter,
-      kernel->width,kernel->height,convolve_pixels);
-    filter=(float *) RelinquishMagickMemory(filter);
-    if (status == MagickFalse)
+
       {
-        DestroyConvolveBuffers(convolve_info);
-        convolve_info=DestroyConvolveInfo(convolve_info);
-        return(MagickFalse);
+        /* column pass */
+        /* need logic to decide this value */
+        int chunkSize = 256;
+
+        {
+          imageColumns = inputImage->columns;
+          /* the column pass covers exactly the rows of this section */
+          if (sec == 0)
+            imageRows = inputImage->rows / 2;
+          else
+            imageRows = (inputImage->rows - inputImage->rows / 2);
+
+          offsetRows = sec * inputImage->rows / 2;
+
+          kernelWidth = kernel->width;
+
+          /* set the kernel arguments */
+          i = 0;
+          clStatus=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(ChannelType),&channel);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+          /* NULL argument value with a size: local memory for the kernel */
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(cl_float4)*(chunkSize+kernel->width),(void *)NULL);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+          clStatus|=clSetKernelArg(blurColumnKernel,i++,sizeof(unsigned int),(void *)&sec);
+          if (clStatus != CL_SUCCESS)
+          {
+            (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+            goto cleanup;
+          }
+        }
+
+        /* launch the kernel */
+        {
+          size_t gsize[2];
+          size_t wsize[2];
+
+          /* round the row count up to a whole number of work-groups */
+          gsize[0] = imageColumns;
+          gsize[1] = chunkSize*((imageRows+chunkSize-1)/chunkSize);
+          wsize[0] = 1;
+          wsize[1] = chunkSize;
+
+          clStatus = clEnqueueNDRangeKernel(queue, blurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+          if (clStatus != CL_SUCCESS)
+          {
+            (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+            goto cleanup;
+          }
+          clFlush(queue);
+        }
       }
-    DestroyConvolveBuffers(convolve_info);
-    convolve_info=DestroyConvolveInfo(convolve_info);
-    return(MagickTrue);
+    }
+
   }
-#endif
+
+  /* get result: blocking map when the output buffer wraps the host pixels,
+     blocking read into the host pixels otherwise */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    length = inputImage->columns * inputImage->rows;
+    /* NOTE(review): the mapped pointer is discarded and never unmapped;
+       presumably the map is only used to synchronize the host copy - confirm */
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+  /* release everything that was acquired; reached on success and failure */
+  if (inputImageBuffer!=NULL)     clReleaseMemObject(inputImageBuffer);
+  if (tempImageBuffer!=NULL)      clReleaseMemObject(tempImageBuffer);
+  if (filteredImageBuffer!=NULL)  clReleaseMemObject(filteredImageBuffer);
+  if (imageKernelBuffer!=NULL)    clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)        RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (blurColumnKernel!=NULL)     RelinquishOpenCLKernel(clEnv, blurColumnKernel);
+  if (queue != NULL)              RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (kernel!=NULL)               DestroyKernelInfo(kernel);
+  /* on failure the half-built output image must not reach the caller */
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     B l u r I m a g e  w i t h  O p e n C L                                 %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  BlurImage() blurs an image.  We convolve the image with a Gaussian operator
+%  of the given radius and standard deviation (sigma).  For reasonable results,
+%  the radius should be larger than sigma.  Use a radius of 0 and BlurImage()
+%  selects a suitable radius for you.
+%
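+%  The format of the AccelerateBlurImage method is:
+%
+%      Image *AccelerateBlurImage(const Image *image,const ChannelType channel,
+%        const double radius,const double sigma,ExceptionInfo *exception)
+%
+%  The corresponding BlurImage methods have the format:
+%
+%      Image *BlurImage(const Image *image,const double radius,
+%        const double sigma,ExceptionInfo *exception)
+%      Image *BlurImageChannel(const Image *image,const ChannelType channel,
+%        const double radius,const double sigma,ExceptionInfo *exception)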
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o channel: the channel type.
+%
+%    o radius: the radius of the Gaussian, in pixels, not counting the center
+%      pixel.
+%
+%    o sigma: the standard deviation of the Gaussian, in pixels.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+MagickExport
+Image* AccelerateBlurImage(const Image *image, const ChannelType channel, const double radius, const double sigma,ExceptionInfo *exception)
+{
+  Image *blurred;
+
+  assert(image != NULL);
+  assert(exception != (ExceptionInfo *) NULL);
+
+  /* give up (NULL result) as soon as either precondition check reports
+     MagickFalse */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+  if (checkAccelerateCondition(image, channel, exception) == MagickFalse)
+    return NULL;
+
+  /* the sectioned variant is used only when splitImage() agrees and half of
+     the image rows exceed the radius; otherwise blur the image in one go */
+  blurred = (splitImage(image) && (image->rows / 2 > radius))
+    ? ComputeBlurImageSection(image, channel, radius, sigma, exception)
+    : ComputeBlurImage(image, channel, radius, sigma, exception);
+
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return blurred;
 }
+
+
+/*
+  ComputeRadialBlurImage() applies a radial blur to inputImage with the
+  "RadialBlur" OpenCL kernel.  Tables of cos/sin values for evenly spaced
+  angles centred on zero are uploaded to the device together with the blur
+  centre, the bias pixel, the channel mask and the matte flag.
+
+    o inputImage: the image to blur.
+    o channel: the channel type, forwarded unchanged to the kernel.
+    o angle: the blur angle in degrees (converted with DegreesToRadians()).
+    o exception: receives a warning/error for every failing step.
+
+  Returns a newly cloned, blurred image, or NULL if any step failed (the
+  partially built clone is destroyed before returning).
+*/
+static Image* ComputeRadialBlurImage(const Image *inputImage, const ChannelType channel, const double angle, ExceptionInfo *exception)
+{
+
+  MagickBooleanType outputReady;
+  Image* filteredImage;
+  MagickCLEnv clEnv;
+
+  cl_int clStatus;
+  size_t global_work_size[2];
+
+  cl_context context;
+  cl_mem_flags mem_flags;
+  cl_mem inputImageBuffer, filteredImageBuffer, sinThetaBuffer, cosThetaBuffer;
+  cl_kernel radialBlurKernel;
+  cl_command_queue queue;
+
+  const void *inputPixels;
+  void *filteredPixels;
+  void* hostPtr;
+  float* sinThetaPtr;
+  float* cosThetaPtr;
+  MagickSizeType length;
+  unsigned int matte;
+  MagickPixelPacket bias;
+  cl_float4 biasPixel;
+  cl_float2 blurCenter;
+  float blurRadius;
+  unsigned int cossin_theta_size;
+  float offset, theta;
+
+  unsigned int i;
+
+  /* every handle released at "cleanup" starts out NULL so that an early
+     goto only releases what was actually acquired */
+  outputReady = MagickFalse;
+  context = NULL;
+  filteredImage = NULL;
+  inputImageBuffer = NULL;
+  filteredImageBuffer = NULL;
+  sinThetaBuffer = NULL;
+  cosThetaBuffer = NULL;
+  queue = NULL;
+  radialBlurKernel = NULL;
+
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+
+
+  /* Create and initialize OpenCL buffers. */
+
+  inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+  if (inputPixels == (const void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+
+  filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+  /* a failed clone is a run-time condition, not a programming error:
+     report it and unwind instead of asserting */
+  if ((filteredImage == NULL) || (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue))
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
+
+  /* same alignment rule as for the input: wrap the host pixels when
+     aligned, otherwise allocate a device-side buffer and read it back */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* blur centre is the middle of the image; the table size grows with the
+     angle and with the square root of the centre-to-corner distance */
+  blurCenter.s[0] = (float) (inputImage->columns-1)/2.0;
+  blurCenter.s[1] = (float) (inputImage->rows-1)/2.0;
+  blurRadius=hypot(blurCenter.s[0],blurCenter.s[1]);
+  /* NOTE(review): for strongly negative angles this can evaluate to 0 or 1,
+     which makes the buffers empty or the divisor below zero - confirm callers */
+  cossin_theta_size=(unsigned int) fabs(4.0*DegreesToRadians(angle)*sqrt((double)blurRadius)+2UL);
+
+  /* create a buffer for sin_theta and cos_theta */
+  sinThetaBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+  cosThetaBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, cossin_theta_size * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  sinThetaPtr = (float*) clEnqueueMapBuffer(queue, sinThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
+    goto cleanup;
+  }
+
+  cosThetaPtr = (float*) clEnqueueMapBuffer(queue, cosThetaBuffer, CL_TRUE, CL_MAP_WRITE, 0, cossin_theta_size*sizeof(float), 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueuemapBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* fill the tables with evenly spaced angles; subtracting offset centres
+     the sampled range on zero */
+  theta=DegreesToRadians(angle)/(MagickRealType) (cossin_theta_size-1);
+  offset=theta*(MagickRealType) (cossin_theta_size-1)/2.0;
+  for (i=0; i < cossin_theta_size; i++)
+  {
+    cosThetaPtr[i]=(float)cos((double) (theta*i-offset));
+    sinThetaPtr[i]=(float)sin((double) (theta*i-offset));
+  }
+  clStatus = clEnqueueUnmapMemObject(queue, sinThetaBuffer, sinThetaPtr, 0, NULL, NULL);
+  clStatus |= clEnqueueUnmapMemObject(queue, cosThetaBuffer, cosThetaPtr, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* get the OpenCL kernel */
+  radialBlurKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "RadialBlur");
+  if (radialBlurKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  
+  /* set the kernel arguments; every status is OR-ed into clStatus so that a
+     single check after the last argument catches any failure */
+  i = 0;
+  clStatus=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+
+  GetMagickPixelPacket(inputImage,&bias);
+  biasPixel.s[0] = bias.red;
+  biasPixel.s[1] = bias.green;
+  biasPixel.s[2] = bias.blue;
+  biasPixel.s[3] = bias.opacity;
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_float4), &biasPixel);
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(ChannelType), &channel);
+
+  matte = (inputImage->matte == MagickTrue)?1:0;
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(unsigned int), &matte);
+
+  /* must accumulate with |= here: a plain assignment would discard any
+     failure reported by the five calls above */
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_float2), &blurCenter);
+
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_mem),(void *)&cosThetaBuffer);
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(cl_mem),(void *)&sinThetaBuffer);
+  clStatus|=clSetKernelArg(radialBlurKernel,i++,sizeof(unsigned int), &cossin_theta_size);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+
+  /* one work-item per pixel; the work-group size is left to the runtime */
+  global_work_size[0] = inputImage->columns;
+  global_work_size[1] = inputImage->rows;
+  /* launch the kernel */
+  clStatus = clEnqueueNDRangeKernel(queue, radialBlurKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+  /* get result: blocking map when the output buffer wraps the host pixels,
+     blocking read into the host pixels otherwise */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    length = inputImage->columns * inputImage->rows;
+    /* NOTE(review): the mapped pointer is discarded and never unmapped;
+       presumably the map is only used to synchronize the host copy - confirm */
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  outputReady = MagickTrue;
+
+cleanup:
+  /* release everything that was acquired; reached on success and failure */
+  if (filteredImageBuffer!=NULL)  clReleaseMemObject(filteredImageBuffer);
+  if (inputImageBuffer!=NULL)     clReleaseMemObject(inputImageBuffer);
+  if (sinThetaBuffer!=NULL)       clReleaseMemObject(sinThetaBuffer);
+  if (cosThetaBuffer!=NULL)       clReleaseMemObject(cosThetaBuffer);
+  if (radialBlurKernel!=NULL)     RelinquishOpenCLKernel(clEnv, radialBlurKernel);
+  if (queue != NULL)              RelinquishOpenCLCommandQueue(clEnv, queue);
+  /* on failure the half-built output image must not reach the caller */
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     R a d i a l B l u r I m a g e  w i t h  O p e n C L                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  RadialBlurImage() applies a radial blur to the image.
+%
+%  Andrew Protano contributed this effect.
+%
+%  The format of the RadialBlurImage method is:
+%
+%    Image *RadialBlurImage(const Image *image,const double angle,
+%      ExceptionInfo *exception)
+%    Image *RadialBlurImageChannel(const Image *image,const ChannelType channel,
+%      const double angle,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o channel: the channel type.
+%
+%    o angle: the angle of the radial blur.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+MagickExport
+Image* AccelerateRadialBlurImage(const Image *image, const ChannelType channel, const double angle, ExceptionInfo *exception)
+{
+  Image* blurredImage;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  /* Give up (returning NULL) unless the OpenCL environment check and the
+     per-image acceleration check both succeed, in that order. */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+  if (checkAccelerateCondition(image, channel, exception) == MagickFalse)
+    return NULL;
+
+  /* Run the OpenCL implementation, then log whatever ended up in exception. */
+  blurredImage = ComputeRadialBlurImage(image, channel, angle, exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return blurredImage;
+}
+
+
+
+/*
+  ComputeUnsharpMaskImage() runs the unsharp-mask filter on the default OpenCL
+  environment with two kernel launches over the whole image: "BlurRow" writes
+  into a temporary buffer, then "UnsharpMaskBlurColumn" writes the final
+  pixels into the pixel cache of a clone of the input image.
+
+  A description of each parameter follows:
+
+    o inputImage: the source image; its pixel cache is handed to OpenCL.
+
+    o channel: the channel type, passed unchanged to both kernels.
+
+    o radius, sigma: formatted into the geometry string
+      "blur:<radius>x<sigma>;blur:<radius>x<sigma>+90" from which the blur
+      kernel is acquired.
+
+    o gain, threshold: converted to float and passed to the column kernel.
+
+    o exception: return any errors or warnings in this structure.
+
+  Returns the filtered image on success.  On any failure every OpenCL object
+  acquired so far is released, the partially built image is destroyed and
+  NULL is returned.
+*/
+static Image* ComputeUnsharpMaskImage(const Image *inputImage, const ChannelType channel,const double radius,const double sigma,
+          const double gain,const double threshold,ExceptionInfo *exception)
+{
+  MagickBooleanType outputReady = MagickFalse;
+  Image* filteredImage = NULL;
+  MagickCLEnv clEnv = NULL;
+
+  cl_int clStatus;
+
+  const void *inputPixels;
+  void *filteredPixels;
+  cl_mem_flags mem_flags;
+
+  KernelInfo *kernel = NULL;
+  char geometry[MaxTextExtent];
+
+  cl_context context = NULL;
+  cl_mem inputImageBuffer = NULL;
+  cl_mem filteredImageBuffer = NULL;
+  cl_mem tempImageBuffer = NULL;
+  cl_mem imageKernelBuffer = NULL;
+  cl_kernel blurRowKernel = NULL;
+  cl_kernel unsharpMaskBlurColumnKernel = NULL;
+  cl_command_queue queue = NULL;
+
+  void* hostPtr;
+  float* kernelBufferPtr;
+  MagickSizeType length;
+  unsigned int kernelWidth;
+  float fGain;
+  float fThreshold;
+  unsigned int imageColumns, imageRows;
+  int chunkSize;
+  unsigned int i;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+      goto cleanup;
+    }
+
+    /* If the host pointer is aligned to the size of CLPixelPacket,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket))
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+    if (filteredImage == (Image *) NULL)
+    {
+      /* Handle the failure explicitly: an assert() here would vanish when
+         NDEBUG is defined and let a NULL image reach the calls below. */
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    /* Aligned output pixels back the CL buffer directly; otherwise a
+       device-side buffer is created and read back at the end. */
+    if (ALIGNED(filteredPixels,CLPixelPacket))
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create the blur kernel */
+  {
+    (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
+
+    imageKernelBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+    /* Map the buffer, copy the first kernel->width weights as floats, unmap. */
+    kernelBufferPtr = (float*)clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+    clStatus = clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  {
+    /* create temp buffer: four floats per pixel */
+    {
+      length = inputImage->columns * inputImage->rows;
+      tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
+
+    /* get the opencl kernels */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRow");
+      if (blurRowKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+
+      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumn");
+      if (unsharpMaskBlurColumnKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* Launch parameters shared by both passes. */
+    chunkSize = 256;
+    imageColumns = inputImage->columns;
+    imageRows = inputImage->rows;
+    kernelWidth = kernel->width;
+    fGain = (float)gain;
+    fThreshold = (float)threshold;
+
+    /* set the arguments of the row kernel */
+    {
+      i = 0;
+      clStatus=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      /* NULL argument value: this only reserves local memory of the given size */
+      clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *)NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* launch the row kernel: work-groups of chunkSize x 1, with the column
+       count rounded up to a multiple of chunkSize */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
+
+      gsize[0] = chunkSize*((inputImage->columns+chunkSize-1)/chunkSize);
+      gsize[1] = inputImage->rows;
+      wsize[0] = chunkSize;
+      wsize[1] = 1;
+
+      clStatus = clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clFlush(queue);
+    }
+
+    /* set the arguments of the column kernel */
+    {
+      i = 0;
+      clStatus=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+      /* two local-memory reservations (NULL argument values) */
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
+      clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
+
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* launch the column kernel: work-groups of 1 x chunkSize, with the row
+       count rounded up to a multiple of chunkSize */
+    {
+      size_t gsize[2];
+      size_t wsize[2];
+
+      gsize[0] = inputImage->columns;
+      gsize[1] = chunkSize*((inputImage->rows+chunkSize-1)/chunkSize);
+      wsize[0] = 1;
+      wsize[1] = chunkSize;
+
+      clStatus = clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+      clFlush(queue);
+    }
+
+  }
+
+  /* get result: blocking map when the buffer wraps the host pixels, blocking
+     read into the pixel cache otherwise */
+  if (ALIGNED(filteredPixels,CLPixelPacket))
+  {
+    length = inputImage->columns * inputImage->rows;
+    /* NOTE(review): the mapped pointer is discarded and no matching
+       clEnqueueUnmapMemObject is issued before the buffer is released --
+       confirm this is intended. */
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+  /* Release whatever was acquired; drop the output image unless it is complete. */
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (inputImageBuffer!=NULL)                clReleaseMemObject(inputImageBuffer);
+  if (filteredImageBuffer!=NULL)              clReleaseMemObject(filteredImageBuffer);
+  if (tempImageBuffer!=NULL)                  clReleaseMemObject(tempImageBuffer);
+  if (imageKernelBuffer!=NULL)                clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+
+/*
+  ComputeUnsharpMaskImageSection() is the sectioned variant of the OpenCL
+  unsharp mask: the image is processed in two passes (sec = 0 and sec = 1),
+  each launching "BlurRowSection" followed by "UnsharpMaskBlurColumnSection".
+  The temporary buffer is sized for
+  columns * (rows/2 + 1 + (kernel->width-1)/2) pixels rather than for the
+  whole image.
+
+  A description of each parameter follows:
+
+    o inputImage: the source image; its pixel cache is handed to OpenCL.
+
+    o channel: the channel type, passed unchanged to both kernels.
+
+    o radius, sigma: formatted into the geometry string
+      "blur:<radius>x<sigma>;blur:<radius>x<sigma>+90" from which the blur
+      kernel is acquired.
+
+    o gain, threshold: converted to float and passed to the column kernel.
+
+    o exception: return any errors or warnings in this structure.
+
+  Returns the filtered image on success.  On any failure every OpenCL object
+  acquired so far is released, the partially built image is destroyed and
+  NULL is returned.
+*/
+static Image* ComputeUnsharpMaskImageSection(const Image *inputImage, const ChannelType channel,const double radius,const double sigma,
+          const double gain,const double threshold,ExceptionInfo *exception)
+{
+  MagickBooleanType outputReady = MagickFalse;
+  Image* filteredImage = NULL;
+  MagickCLEnv clEnv = NULL;
+
+  cl_int clStatus;
+
+  const void *inputPixels;
+  void *filteredPixels;
+  cl_mem_flags mem_flags;
+
+  KernelInfo *kernel = NULL;
+  char geometry[MaxTextExtent];
+
+  cl_context context = NULL;
+  cl_mem inputImageBuffer = NULL;
+  cl_mem filteredImageBuffer = NULL;
+  cl_mem tempImageBuffer = NULL;
+  cl_mem imageKernelBuffer = NULL;
+  cl_kernel blurRowKernel = NULL;
+  cl_kernel unsharpMaskBlurColumnKernel = NULL;
+  cl_command_queue queue = NULL;
+
+  void* hostPtr;
+  float* kernelBufferPtr;
+  MagickSizeType length;
+  unsigned int kernelWidth;
+  float fGain;
+  float fThreshold;
+  unsigned int imageColumns, imageRows;
+  int chunkSize;
+  unsigned int i;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  {
+    inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+    if (inputPixels == (const void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+      goto cleanup;
+    }
+
+    /* If the host pointer is aligned to the size of CLPixelPacket,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+    if (ALIGNED(inputPixels,CLPixelPacket))
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    }
+    else
+    {
+      mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+    }
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create output */
+  {
+    filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+    if (filteredImage == (Image *) NULL)
+    {
+      /* Handle the failure explicitly: an assert() here would vanish when
+         NDEBUG is defined and let a NULL image reach the calls below. */
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+    if (filteredPixels == (void *) NULL)
+    {
+      (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+      goto cleanup;
+    }
+
+    /* Aligned output pixels back the CL buffer directly; otherwise a
+       device-side buffer is created and read back at the end. */
+    if (ALIGNED(filteredPixels,CLPixelPacket))
+    {
+      mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+      hostPtr = filteredPixels;
+    }
+    else
+    {
+      mem_flags = CL_MEM_WRITE_ONLY;
+      hostPtr = NULL;
+    }
+
+    /* create a CL buffer from image pixel buffer */
+    length = inputImage->columns * inputImage->rows;
+    filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  /* create the blur kernel */
+  {
+    (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma);
+    kernel=AcquireKernelInfo(geometry);
+    if (kernel == (KernelInfo *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.",".");
+      goto cleanup;
+    }
+
+    imageKernelBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+    /* Map the buffer, copy the first kernel->width weights as floats, unmap. */
+    kernelBufferPtr = (float*)clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+      goto cleanup;
+    }
+    for (i = 0; i < kernel->width; i++)
+    {
+      kernelBufferPtr[i] = (float) kernel->values[i];
+    }
+    clStatus = clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  {
+    unsigned int offsetRows;
+    unsigned int sec;
+
+    /* create temp buffer: four floats per pixel for half the rows plus one
+       row and (kernel->width-1)/2 extra rows */
+    {
+      length = inputImage->columns * (inputImage->rows / 2 + 1 + (kernel->width-1) / 2);
+      tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length * 4 * sizeof(float), NULL, &clStatus);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+        goto cleanup;
+      }
+    }
+
+    /* get the opencl kernels */
+    {
+      blurRowKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "BlurRowSection");
+      if (blurRowKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+
+      unsharpMaskBlurColumnKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMaskBlurColumnSection");
+      if (unsharpMaskBlurColumnKernel == NULL)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* Values that do not change between sections or passes. */
+    chunkSize = 256;
+    imageColumns = inputImage->columns;
+    kernelWidth = kernel->width;
+    fGain = (float)gain;
+    fThreshold = (float)threshold;
+
+    for (sec = 0; sec < 2; sec++)
+    {
+      /* first row of this section within the full image */
+      offsetRows = sec * inputImage->rows / 2;
+
+      /* row pass: both sections cover their share of the rows plus
+         (kernel->width-1)/2 extra rows */
+      {
+        if (sec == 0)
+          imageRows = inputImage->rows / 2 + (kernel->width-1) / 2;
+        else
+          imageRows = (inputImage->rows - inputImage->rows / 2) + (kernel->width-1) / 2;
+
+        /* set the kernel arguments */
+        i = 0;
+        clStatus=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        /* NULL argument value: this only reserves local memory of the given size */
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(CLPixelPacket)*(chunkSize+kernel->width),(void *)NULL);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+        clStatus|=clSetKernelArg(blurRowKernel,i++,sizeof(unsigned int),(void *)&sec);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+      /* launch the row kernel: work-groups of chunkSize x 1 */
+      {
+        size_t gsize[2];
+        size_t wsize[2];
+
+        gsize[0] = chunkSize*((imageColumns+chunkSize-1)/chunkSize);
+        gsize[1] = imageRows;
+        wsize[0] = chunkSize;
+        wsize[1] = 1;
+
+        clStatus = clEnqueueNDRangeKernel(queue, blurRowKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clFlush(queue);
+      }
+
+      /* column pass: unlike the row pass, the second section gets no extra
+         (kernel->width-1)/2 rows here */
+      {
+        if (sec == 0)
+          imageRows = inputImage->rows / 2 + (kernel->width-1) / 2;
+        else
+          imageRows = (inputImage->rows - inputImage->rows / 2);
+
+        i = 0;
+        clStatus=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&tempImageBuffer);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageColumns);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&imageRows);
+        /* two local-memory reservations (NULL argument values) */
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++, (chunkSize+kernelWidth-1)*sizeof(cl_float4),NULL);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++, kernelWidth*sizeof(float),NULL);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(ChannelType),&channel);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&kernelWidth);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fGain);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(float),(void *)&fThreshold);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&offsetRows);
+        clStatus|=clSetKernelArg(unsharpMaskBlurColumnKernel,i++,sizeof(unsigned int),(void *)&sec);
+
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+          goto cleanup;
+        }
+      }
+
+      /* launch the column kernel: work-groups of 1 x chunkSize */
+      {
+        size_t gsize[2];
+        size_t wsize[2];
+
+        gsize[0] = imageColumns;
+        gsize[1] = chunkSize*((imageRows+chunkSize-1)/chunkSize);
+        wsize[0] = 1;
+        wsize[1] = chunkSize;
+
+        clStatus = clEnqueueNDRangeKernel(queue, unsharpMaskBlurColumnKernel, 2, NULL, gsize, wsize, 0, NULL, NULL);
+        if (clStatus != CL_SUCCESS)
+        {
+          (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+          goto cleanup;
+        }
+        clFlush(queue);
+      }
+    }
+  }
+
+  /* get result: blocking map when the buffer wraps the host pixels, blocking
+     read into the pixel cache otherwise */
+  if (ALIGNED(filteredPixels,CLPixelPacket))
+  {
+    length = inputImage->columns * inputImage->rows;
+    /* NOTE(review): the mapped pointer is discarded and no matching
+       clEnqueueUnmapMemObject is issued before the buffer is released --
+       confirm this is intended. */
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+  /* Release whatever was acquired; drop the output image unless it is complete. */
+  if (kernel != NULL)                        kernel=DestroyKernelInfo(kernel);
+  if (inputImageBuffer!=NULL)                clReleaseMemObject(inputImageBuffer);
+  if (filteredImageBuffer!=NULL)              clReleaseMemObject(filteredImageBuffer);
+  if (tempImageBuffer!=NULL)                  clReleaseMemObject(tempImageBuffer);
+  if (imageKernelBuffer!=NULL)                clReleaseMemObject(imageKernelBuffer);
+  if (blurRowKernel!=NULL)                    RelinquishOpenCLKernel(clEnv, blurRowKernel);
+  if (unsharpMaskBlurColumnKernel!=NULL)      RelinquishOpenCLKernel(clEnv, unsharpMaskBlurColumnKernel);
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+  return filteredImage;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     U n s h a r p M a s k I m a g e  w i t h  O p e n C L                   %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  UnsharpMaskImage() sharpens one or more image channels.  We convolve the
+%  image with a Gaussian operator of the given radius and standard deviation
+%  (sigma).  For reasonable results, radius should be larger than sigma.  Use a
+%  radius of 0 and UnsharpMaskImage() selects a suitable radius for you.
+%
+%  The format of the UnsharpMaskImage method is:
+%
+%    Image *UnsharpMaskImage(const Image *image,const double radius,
+%      const double sigma,const double amount,const double threshold,
+%      ExceptionInfo *exception)
+%    Image *UnsharpMaskImageChannel(const Image *image,
+%      const ChannelType channel,const double radius,const double sigma,
+%      const double gain,const double threshold,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o channel: the channel type.
+%
+%    o radius: the radius of the Gaussian, in pixels, not counting the center
+%      pixel.
+%
+%    o sigma: the standard deviation of the Gaussian, in pixels.
+%
+%    o gain: the percentage of the difference between the original and the
+%      blur image that is added back into the original.
+%
+%    o threshold: the threshold in pixels needed to apply the difference gain.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+
+MagickExport
+Image* AccelerateUnsharpMaskImage(const Image *image, const ChannelType channel,const double radius,const double sigma, 
+          const double gain,const double threshold,ExceptionInfo *exception)
+{
+  Image* sharpenedImage;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  /* Give up (returning NULL) unless the OpenCL environment check and the
+     per-image acceleration check both succeed, in that order. */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+  if (checkAccelerateCondition(image, channel, exception) == MagickFalse)
+    return NULL;
+
+  /* Use the sectioned implementation only when splitImage() asks for it and
+     half the image height exceeds the radius; otherwise process the image in
+     one piece. */
+  if (!(splitImage(image) && (image->rows / 2 > radius)))
+    sharpenedImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception);
+  else
+    sharpenedImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception);
+
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return sharpenedImage;
+}
+
+
+/* Returns x when x compares greater than y, and y in every other case. */
+static inline double MagickMax(const double x,const double y)
+{
+  return((x > y) ? x : y);
+}
+
+/* Returns x when x compares less than y, and y in every other case. */
+static inline double MagickMin(const double x,const double y)
+{
+  return((x < y) ? x : y);
+}
+
+
+static MagickBooleanType resizeHorizontalFilter(cl_mem inputImage
+                                 , const unsigned int inputImageColumns, const unsigned int inputImageRows, const unsigned int matte
+                                 , cl_mem resizedImage, const unsigned int resizedColumns, const unsigned int resizedRows
+                                 , const ResizeFilter* resizeFilter, cl_mem resizeFilterCubicCoefficients, const float xFactor
+                                 , MagickCLEnv clEnv, cl_command_queue queue, ExceptionInfo *exception)
+{
+  MagickBooleanType status = MagickFalse;
+
+  float scale, support;
+  unsigned int i;
+  cl_kernel horizontalKernel = NULL;
+  cl_int clStatus;
+  size_t global_work_size[2];
+  size_t local_work_size[2];
+  int resizeFilterType, resizeWindowType;
+  float resizeFilterScale, resizeFilterSupport, resizeFilterWindowSupport, resizeFilterBlur;
+  size_t totalLocalMemorySize;
+  size_t imageCacheLocalMemorySize, pixelAccumulatorLocalMemorySize
+        , weightAccumulatorLocalMemorySize, gammaAccumulatorLocalMemorySize;
+  size_t deviceLocalMemorySize;
+  int cacheRangeStart, cacheRangeEnd, numCachedPixels;
+  
+  const unsigned int workgroupSize = 256;
+  unsigned int pixelPerWorkgroup;
+  unsigned int chunkSize;
+
+  /*
+  Apply filter to resize vertically from image to resize image.
+  */
+  scale=MagickMax(1.0/xFactor+MagickEpsilon,1.0);
+  support=scale*GetResizeFilterSupport(resizeFilter);
+  if (support < 0.5)
+  {
+    /*
+    Support too small even for nearest neighbour: Reduce to point
+    sampling.
+    */
+    support=(MagickRealType) 0.5;
+    scale=1.0;
+  }
+  scale=PerceptibleReciprocal(scale);
+
+  if (resizedColumns < workgroupSize) 
+  {
+    chunkSize = 32;
+    pixelPerWorkgroup = 32;
+  }
+  else
+  {
+    chunkSize = workgroupSize;
+    pixelPerWorkgroup = workgroupSize;
+  }
+
+  /* get the local memory size supported by the device */
+  deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv);
+
+  while(1)
+  {
+    /* calculate the local memory size needed per workgroup */
+    cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5);
+    cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5);
+    numCachedPixels = cacheRangeEnd - cacheRangeStart + 1;
+    imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket);
+    totalLocalMemorySize = imageCacheLocalMemorySize;
+
+    /* local size for the pixel accumulator */
+    pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4);
+    totalLocalMemorySize+=pixelAccumulatorLocalMemorySize;
+
+    /* local memory size for the weight accumulator */
+    weightAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=weightAccumulatorLocalMemorySize;
+
+    /* local memory size for the gamma accumulator */
+    if (matte == 0)
+      gammaAccumulatorLocalMemorySize = sizeof(float);
+    else
+      gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float);
+    totalLocalMemorySize+=gammaAccumulatorLocalMemorySize;
+
+    if (totalLocalMemorySize <= deviceLocalMemorySize)
+      break;
+    else
+    {
+      pixelPerWorkgroup = pixelPerWorkgroup/2;
+      chunkSize = chunkSize/2;
+      if (pixelPerWorkgroup == 0
+          || chunkSize == 0)
+      {
+        /* quit, fallback to CPU */
+        goto cleanup;
+      }
+    }
+  }
+
+  resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter);
+  resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
+
+
+  if (resizeFilterType == SincFastWeightingFunction
+    && resizeWindowType == SincFastWeightingFunction)
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc");
+  }
+  else
+  {
+    horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter");
+  }
+  if (horizontalKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  i = 0;
+  clStatus = clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&inputImage);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&inputImageColumns);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&inputImageRows);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage);
+
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows);
+
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
+
+  resizeFilterScale = (float) GetResizeFilterScale(resizeFilter);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale);
+
+  resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport);
+
+  resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport);
+
+  resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur);
+
+
+  clStatus |= clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize);
+  
+
+  clStatus |= clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL);
+  clStatus |= clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
+  global_work_size[1] = resizedRows;
+
+  local_work_size[0] = workgroupSize;
+  local_work_size[1] = 1;
+  clStatus = clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+  status = MagickTrue;
+
+
+cleanup:
+  if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel);
+
+  return status;
+}
+
+
+/*
+  resizeVerticalFilter() configures and enqueues the OpenCL kernel for the
+  vertical resize pass: "ResizeVerticalFilterSinc" when both the filter and
+  its window use SincFastWeightingFunction, "ResizeVerticalFilter" otherwise.
+
+  The number of output pixels handled per workgroup (and the chunk size) is
+  halved until the image cache plus the pixel, weight and gamma accumulators
+  fit in the local memory size reported for the device.  If either value
+  reaches zero the function gives up without raising an exception so the
+  caller can fall back to the CPU.
+
+  Returns MagickTrue once the kernel has been enqueued and the queue flushed,
+  MagickFalse otherwise.
+*/
+static MagickBooleanType resizeVerticalFilter(cl_mem inputImage
+                                 , const unsigned int inputImageColumns, const unsigned int inputImageRows, const unsigned int matte
+                                 , cl_mem resizedImage, const unsigned int resizedColumns, const unsigned int resizedRows
+                                 , const ResizeFilter* resizeFilter, cl_mem resizeFilterCubicCoefficients, const float yFactor
+                                 , MagickCLEnv clEnv, cl_command_queue queue, ExceptionInfo *exception)
+{
+  const unsigned int workgroupSize = 256;
+
+  MagickBooleanType status = MagickFalse;
+  cl_kernel verticalKernel = NULL;
+  cl_int clStatus;
+  const char *kernelName;
+  unsigned int argIndex;
+  unsigned int pixelPerWorkgroup, chunkSize;
+  int filterType, windowType;
+  int cacheFirst, cacheLast, cachedPixelCount;
+  float scale, support;
+  float filterScale, filterSupport, windowSupport, filterBlur;
+  size_t imageCacheBytes, pixelAccumulatorBytes, weightAccumulatorBytes, gammaAccumulatorBytes;
+  size_t deviceLocalBytes;
+  size_t globalSize[2], localSize[2];
+
+  /*
+  Apply filter to resize vertically from image to resize image.
+  */
+  scale=MagickMax(1.0/yFactor+MagickEpsilon,1.0);
+  support=scale*GetResizeFilterSupport(resizeFilter);
+  if (support < 0.5)
+  {
+    /* Support too small even for nearest neighbour: reduce to point sampling. */
+    support=(MagickRealType) 0.5;
+    scale=1.0;
+  }
+  scale=PerceptibleReciprocal(scale);
+
+  /* short outputs start with a smaller tile than the full workgroup */
+  if (resizedRows < workgroupSize)
+    pixelPerWorkgroup = chunkSize = 32;
+  else
+    pixelPerWorkgroup = chunkSize = workgroupSize;
+
+  /* get the local memory size supported by the device */
+  deviceLocalBytes = GetOpenCLDeviceLocalMemorySize(clEnv);
+
+  /* the first cached row does not depend on the tile size */
+  cacheFirst = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5);
+
+  for ( ; ; )
+  {
+    size_t requiredLocalBytes;
+
+    /* image cache covering every input row touched by one workgroup */
+    cacheLast = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5);
+    cachedPixelCount = cacheLast - cacheFirst + 1;
+    imageCacheBytes = cachedPixelCount * sizeof(CLPixelPacket);
+
+    /* pixel, weight and gamma accumulators */
+    pixelAccumulatorBytes = chunkSize * sizeof(cl_float4);
+    weightAccumulatorBytes = chunkSize * sizeof(float);
+    gammaAccumulatorBytes = (matte == 0) ? sizeof(float) : chunkSize * sizeof(float);
+
+    requiredLocalBytes = imageCacheBytes + pixelAccumulatorBytes
+      + weightAccumulatorBytes + gammaAccumulatorBytes;
+    if (requiredLocalBytes <= deviceLocalBytes)
+      break;
+
+    /* does not fit: try again with half the tile */
+    pixelPerWorkgroup /= 2;
+    chunkSize /= 2;
+    if (pixelPerWorkgroup == 0 || chunkSize == 0)
+      goto cleanup;  /* quit, fallback to CPU */
+  }
+
+  filterType = (int)GetResizeFilterWeightingType(resizeFilter);
+  windowType = (int)GetResizeFilterWindowWeightingType(resizeFilter);
+
+  kernelName = "ResizeVerticalFilter";
+  if (filterType == SincFastWeightingFunction && windowType == SincFastWeightingFunction)
+    kernelName = "ResizeVerticalFilterSinc";
+
+  verticalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, kernelName);
+  if (verticalKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  filterScale = (float) GetResizeFilterScale(resizeFilter);
+  filterSupport = (float) GetResizeFilterSupport(resizeFilter);
+  windowSupport = (float) GetResizeFilterWindowSupport(resizeFilter);
+  filterBlur = (float) GetResizeFilterBlur(resizeFilter);
+
+  /* the argument order below must match the kernel's parameter list */
+  argIndex = 0;
+  clStatus = clSetKernelArg(verticalKernel, argIndex++, sizeof(cl_mem), (void*)&inputImage);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), (void*)&inputImageColumns);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), (void*)&inputImageRows);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), (void*)&matte);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(float), (void*)&yFactor);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(cl_mem), (void*)&resizedImage);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), (void*)&resizedColumns);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), (void*)&resizedRows);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(int), (void*)&filterType);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(int), (void*)&windowType);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(float), (void*)&filterScale);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(float), (void*)&filterSupport);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(float), (void*)&windowSupport);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(float), (void*)&filterBlur);
+  /* a NULL value with a size reserves that many bytes of local memory */
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, imageCacheBytes, NULL);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(int), &cachedPixelCount);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), &pixelPerWorkgroup);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, sizeof(unsigned int), &chunkSize);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, pixelAccumulatorBytes, NULL);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, weightAccumulatorBytes, NULL);
+  clStatus |= clSetKernelArg(verticalKernel, argIndex++, gammaAccumulatorBytes, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* one work-item column per output column; one workgroup per tile of rows */
+  globalSize[0] = resizedColumns;
+  globalSize[1] = (resizedRows+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize;
+  localSize[0] = 1;
+  localSize[1] = workgroupSize;
+
+  clStatus = clEnqueueNDRangeKernel(queue, verticalKernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+  status = MagickTrue;
+
+cleanup:
+  if (verticalKernel != NULL)
+    RelinquishOpenCLKernel(clEnv, verticalKernel);
+
+  return status;
+}
+
+
+
+/*
+  ComputeResizeImage() resizes inputImage to resizedColumns x resizedRows on
+  the OpenCL device using two separable passes (resizeHorizontalFilter() and
+  resizeVerticalFilter()) that go through an intermediate device buffer.
+  When xFactor > yFactor the horizontal pass runs first, otherwise the
+  vertical pass runs first.
+
+  Returns a new image cloned from inputImage on success.  On any failure the
+  partially built image is destroyed and NULL is returned; the failing step
+  records its reason in exception where it raises one.
+*/
+static Image* ComputeResizeImage(const Image* inputImage, const size_t resizedColumns, const size_t resizedRows
+        , const ResizeFilter* resizeFilter, ExceptionInfo *exception)
+{
+
+  MagickBooleanType outputReady = MagickFalse;
+  Image* filteredImage = NULL;
+  MagickCLEnv clEnv = NULL;
+
+  cl_int clStatus;
+  MagickBooleanType status;
+  const void *inputPixels;
+  void* filteredPixels;
+  void* hostPtr;
+  const MagickRealType* resizeFilterCoefficient;
+  float* mappedCoefficientBuffer;
+  float xFactor, yFactor;
+  MagickSizeType length;
+
+  cl_mem_flags mem_flags;
+  cl_context context = NULL;
+  cl_mem inputImageBuffer = NULL;
+  cl_mem tempImageBuffer = NULL;
+  cl_mem filteredImageBuffer = NULL;
+  cl_mem cubicCoefficientsBuffer = NULL;
+  cl_command_queue queue = NULL;
+
+  unsigned int i;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+  if (inputPixels == (const void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+     then use the host buffer directly from the GPU; otherwise, 
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* upload the 7 filter coefficients as floats through a mapped device buffer */
+  cubicCoefficientsBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, 7 * sizeof(float), NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  mappedCoefficientBuffer = (float*)clEnqueueMapBuffer(queue, cubicCoefficientsBuffer, CL_TRUE, CL_MAP_WRITE, 0, 7 * sizeof(float)
+          , 0, NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnqueueMapBuffer failed.",".");
+    goto cleanup;
+  }
+  resizeFilterCoefficient = GetResizeFilterCoefficient(resizeFilter);
+  for (i = 0; i < 7; i++)
+  {
+    mappedCoefficientBuffer[i] = (float) resizeFilterCoefficient[i];
+  }
+  clStatus = clEnqueueUnmapMemObject(queue, cubicCoefficientsBuffer, mappedCoefficientBuffer, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* allocate the destination image at the requested size */
+  filteredImage = CloneImage(inputImage,resizedColumns,resizedRows,MagickTrue,exception);
+  if (filteredImage == NULL)
+    goto cleanup;
+
+  if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+  {
+    /* report the call that actually failed (previously said "CloneImage failed.") */
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "SetImageStorageClass failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
+
+  /* write straight into the destination pixels when they are suitably
+     aligned; otherwise use a device-side buffer and read it back later */
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+
+  /* create a CL buffer from image pixel buffer */
+  length = filteredImage->columns * filteredImage->rows;
+  filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  xFactor=(float) resizedColumns/(float) inputImage->columns;
+  yFactor=(float) resizedRows/(float) inputImage->rows;
+  if (xFactor > yFactor)
+  {
+    /* horizontal pass first: intermediate is resizedColumns x input rows */
+    length = resizedColumns*inputImage->rows;
+    tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+    
+    status = resizeHorizontalFilter(inputImageBuffer, inputImage->columns, inputImage->rows, (inputImage->matte == MagickTrue)?1:0
+          , tempImageBuffer, resizedColumns, inputImage->rows
+          , resizeFilter, cubicCoefficientsBuffer
+          , xFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+    
+    status = resizeVerticalFilter(tempImageBuffer, resizedColumns, inputImage->rows, (inputImage->matte == MagickTrue)?1:0
+       , filteredImageBuffer, resizedColumns, resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , yFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+  }
+  else
+  {
+    /* vertical pass first: intermediate is input columns x resizedRows */
+    length = inputImage->columns*resizedRows;
+    tempImageBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+
+    status = resizeVerticalFilter(inputImageBuffer, inputImage->columns, inputImage->rows, (inputImage->matte == MagickTrue)?1:0
+       , tempImageBuffer, inputImage->columns, resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , yFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+
+    status = resizeHorizontalFilter(tempImageBuffer, inputImage->columns, resizedRows, (inputImage->matte == MagickTrue)?1:0
+       , filteredImageBuffer, resizedColumns, resizedRows
+       , resizeFilter, cubicCoefficientsBuffer
+       , xFactor, clEnv, queue, exception);
+    if (status != MagickTrue)
+      goto cleanup;
+  }
+
+  /* blocking map/read so the result is in filteredPixels before returning */
+  length = resizedColumns*resizedRows;
+  if (ALIGNED(filteredPixels,CLPixelPacket)) 
+  {
+    /* NOTE(review): the mapped pointer is discarded and the region is never
+       unmapped before the buffer is released - confirm this is intended */
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  outputReady = MagickTrue;
+
+cleanup:
+  if (inputImageBuffer!=NULL)            clReleaseMemObject(inputImageBuffer);
+  if (tempImageBuffer!=NULL)             clReleaseMemObject(tempImageBuffer);
+  if (filteredImageBuffer!=NULL)         clReleaseMemObject(filteredImageBuffer);
+  if (cubicCoefficientsBuffer!=NULL)      clReleaseMemObject(cubicCoefficientsBuffer);
+  if (queue != NULL)                     RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (outputReady == MagickFalse)
+  {
+    /* do not hand a half-written image back to the caller */
+    if (filteredImage != NULL)
+    {
+      DestroyImage(filteredImage);
+      filteredImage = NULL;
+    }
+  }
+
+  return filteredImage;
+}
+
+/*
+  Weighting functions accepted by gpuSupportedResizeWeighting().  The list is
+  terminated by LastWeightingFunction, which is a sentinel and not itself a
+  supported entry.
+*/
+const ResizeWeightingFunctionType supportedResizeWeighting[] = 
+{
+  BoxWeightingFunction,
+  TriangleWeightingFunction,
+  HanningWeightingFunction,
+  HammingWeightingFunction,
+  BlackmanWeightingFunction,
+  CubicBCWeightingFunction,
+  SincWeightingFunction,
+  SincFastWeightingFunction,
+  LastWeightingFunction
+};
+
+/*
+  gpuSupportedResizeWeighting() returns MagickTrue when f appears in
+  supportedResizeWeighting before the LastWeightingFunction terminator, and
+  MagickFalse otherwise (including when f is LastWeightingFunction itself).
+*/
+static MagickBooleanType gpuSupportedResizeWeighting(ResizeWeightingFunctionType f)
+{
+  const ResizeWeightingFunctionType *entry;
+
+  for (entry = supportedResizeWeighting; *entry != LastWeightingFunction; entry++)
+  {
+    if (*entry == f)
+      return MagickTrue;
+  }
+  return MagickFalse;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%   A c c e l e r a t e R e s i z e I m a g e                                 %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  AccelerateResizeImage() is an OpenCL implementation of ResizeImage()
+%
+%  AccelerateResizeImage() scales an image to the desired dimensions, using the given
+%  filter (see AcquireFilterInfo()).
+%
+%  If an undefined filter is given the filter defaults to Mitchell for a
+%  colormapped image, an image with a matte channel, or if the image is
+%  enlarged.  Otherwise the filter defaults to Lanczos.
+%
+%  AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program.
+%
+%  The format of the AccelerateResizeImage method is:
+%
+%      Image *AccelerateResizeImage(const Image *image,
+%        const size_t resizedColumns,const size_t resizedRows,
+%        const ResizeFilter *resizeFilter,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o resizedColumns: the number of columns in the scaled image.
+%
+%    o resizedRows: the number of rows in the scaled image.
+%
+%    o resizeFilter: Image filter to use.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+MagickExport
+Image* AccelerateResizeImage(const Image* image, const size_t resizedColumns, const size_t resizedRows
+          , const ResizeFilter* resizeFilter, ExceptionInfo *exception) 
+{
+  Image* resizedImage;
+
+  assert(image != NULL);
+  assert(resizeFilter != NULL);
+
+  /* each guard returns NULL so the caller can use its non-OpenCL path */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+
+  if (checkAccelerateCondition(image, AllChannels, exception) == MagickFalse)
+    return NULL;
+
+  /* both the filter and its window must use a supported weighting function */
+  if (gpuSupportedResizeWeighting(GetResizeFilterWeightingType(resizeFilter)) == MagickFalse)
+    return NULL;
+  if (gpuSupportedResizeWeighting(GetResizeFilterWindowWeightingType(resizeFilter)) == MagickFalse)
+    return NULL;
+
+  resizedImage = ComputeResizeImage(image,resizedColumns,resizedRows,resizeFilter,exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return resizedImage;
+}
+
+
+/*
+  ComputeContrastImage() runs the "Contrast" OpenCL kernel over every pixel of
+  inputImage and stores the result back into the image's pixel cache.
+
+  sharpen is passed to the kernel as 0 (MagickFalse) or 1 (anything else).
+
+  Returns MagickTrue when the processed pixels have been read back, and
+  MagickFalse if any step failed.
+*/
+static MagickBooleanType ComputeContrastImage(Image *inputImage, const MagickBooleanType sharpen, ExceptionInfo *exception)
+{
+  MagickBooleanType succeeded = MagickFalse;
+  MagickCLEnv clEnv = NULL;
+
+  cl_context context = NULL;
+  cl_command_queue queue = NULL;
+  cl_kernel contrastKernel = NULL;
+  cl_mem pixelBuffer = NULL;
+  cl_mem_flags bufferFlags;
+  cl_int clStatus;
+
+  void *pixels = NULL;
+  MagickSizeType length;
+  size_t globalSize[2];
+  unsigned int sharpenFlag;
+  unsigned int argIndex;
+  int hostAligned;
+
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  pixels = GetPixelCachePixels(inputImage, &length, exception);
+  if (pixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* Pixels aligned to the size of CLPixelPacket are used in place by the
+     device; otherwise a device-side copy is made and read back afterwards. */
+  hostAligned = ALIGNED(pixels,CLPixelPacket) ? 1 : 0;
+  bufferFlags = CL_MEM_READ_WRITE;
+  if (hostAligned != 0)
+    bufferFlags |= CL_MEM_USE_HOST_PTR;
+  else
+    bufferFlags |= CL_MEM_COPY_HOST_PTR;
+
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  pixelBuffer = clCreateBuffer(context, bufferFlags, length * sizeof(CLPixelPacket), (void*)pixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  contrastKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Contrast");
+  if (contrastKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  sharpenFlag = (sharpen == MagickFalse)?0:1;
+  argIndex = 0;
+  clStatus=clSetKernelArg(contrastKernel,argIndex++,sizeof(cl_mem),(void *)&pixelBuffer);
+  clStatus|=clSetKernelArg(contrastKernel,argIndex++,sizeof(cl_uint),&sharpenFlag);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* launch the kernel: one work-item per pixel */
+  globalSize[0] = inputImage->columns;
+  globalSize[1] = inputImage->rows;
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  clStatus = clEnqueueNDRangeKernel(queue, contrastKernel, 2, NULL, globalSize, NULL, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+  /* blocking map/read so the host pixels hold the result before returning */
+  if (hostAligned != 0)
+    clEnqueueMapBuffer(queue, pixelBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  else
+    clStatus = clEnqueueReadBuffer(queue, pixelBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), pixels, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  succeeded = MagickTrue;
+
+cleanup:
+
+  if (pixelBuffer != NULL)
+    clReleaseMemObject(pixelBuffer);
+  if (contrastKernel != NULL)
+    RelinquishOpenCLKernel(clEnv, contrastKernel);
+  if (queue != NULL)
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+  return succeeded;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     C o n t r a s t I m a g e  w i t h  O p e n C L                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  ContrastImage() enhances the intensity differences between the lighter and
+%  darker elements of the image.  Set sharpen to MagickTrue to increase the
+%  image contrast; otherwise the contrast is reduced.
+%
+%  The format of the AccelerateContrastImage method is:
+%
+%      MagickBooleanType AccelerateContrastImage(Image *image,
+%        const MagickBooleanType sharpen,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o sharpen: Increase or decrease image contrast.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+MagickExport
+MagickBooleanType AccelerateContrastImage(Image* image, const MagickBooleanType sharpen, ExceptionInfo* exception)
+{
+  MagickBooleanType contrastApplied;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  /* MagickFalse tells the caller the OpenCL path did not handle the image */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return MagickFalse;
+
+  if (checkAccelerateCondition(image, AllChannels, exception) == MagickFalse)
+    return MagickFalse;
+
+  contrastApplied = ComputeContrastImage(image,sharpen,exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return contrastApplied;
+}
+
+
+
+/*
+  ComputeModulateImage() runs the "Modulate" OpenCL kernel over every pixel of
+  image and stores the result back into the image's pixel cache.
+
+  percent_brightness, percent_hue and percent_saturation are narrowed to
+  cl_float and colorspace to cl_int before being handed to the kernel.
+
+  Returns MagickTrue when the processed pixels have been read back, and
+  MagickFalse if any step failed (the reason is recorded in exception).
+*/
+MagickBooleanType ComputeModulateImage(Image* image, double percent_brightness, double percent_hue, double percent_saturation, ColorspaceType colorspace, ExceptionInfo* exception)
+{
+  unsigned int
+    i;
+
+  cl_float
+    bright,
+    hue,
+    saturation;
+
+  cl_int color;
+
+  MagickBooleanType outputReady;
+
+  MagickCLEnv clEnv;
+
+  void *inputPixels;
+
+  MagickSizeType length;
+
+  cl_context context;
+  cl_command_queue queue;
+  cl_kernel modulateKernel; 
+
+  cl_mem inputImageBuffer;
+  cl_mem_flags mem_flags;
+
+  cl_int clStatus;
+
+  Image * inputImage = image;
+
+  inputImageBuffer = NULL;
+  modulateKernel = NULL; 
+
+  assert(inputImage != (Image *) NULL);
+  assert(inputImage->signature == MagickSignature);
+  if (inputImage->debug != MagickFalse)
+    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",inputImage->filename);
+
+  /*
+   * initialize opencl env
+   */
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  outputReady = MagickFalse;
+
+  /* Create and initialize OpenCL buffers.  The pixels are fetched with
+     GetPixelCachePixels() because the kernel result is written back into
+     them. */
+  inputPixels = GetPixelCachePixels(inputImage, &length, exception);
+  if (inputPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* If the host pointer is aligned to the size of CLPixelPacket, 
+   then use the host buffer directly from the GPU; otherwise, 
+   create a buffer on the GPU and copy the data over
+   */
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+  }
+  else 
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  modulateKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Modulate");
+  if (modulateKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* convert to the fixed-size OpenCL types that are passed to the kernel */
+  bright=percent_brightness;
+  hue=percent_hue;
+  saturation=percent_saturation;
+  color=colorspace;
+
+  i = 0;
+  clStatus=clSetKernelArg(modulateKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+  clStatus|=clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&bright);
+  clStatus|=clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&hue);
+  clStatus|=clSetKernelArg(modulateKernel,i++,sizeof(cl_float),&saturation);
+  /* color is a cl_int, so size it as one (was sizeof(cl_float)) */
+  clStatus|=clSetKernelArg(modulateKernel,i++,sizeof(cl_int),&color);
+  if (clStatus != CL_SUCCESS)
+  {
+    /* the failure is reported through exception only; nothing goes to stdout */
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  {
+    size_t global_work_size[2];
+    global_work_size[0] = inputImage->columns;
+    global_work_size[1] = inputImage->rows;
+    /* launch the kernel */
+    clStatus = clEnqueueNDRangeKernel(queue, modulateKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+      goto cleanup;
+    }
+    clFlush(queue);
+  }
+
+  /* blocking map/read so the host pixels hold the result before returning */
+  length = inputImage->columns * inputImage->rows;
+  if (ALIGNED(inputPixels,CLPixelPacket)) 
+  {
+    clEnqueueMapBuffer(queue, inputImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else 
+  {
+    clStatus = clEnqueueReadBuffer(queue, inputImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+
+  if (inputImageBuffer!=NULL)                
+    clReleaseMemObject(inputImageBuffer);
+  if (modulateKernel!=NULL)                     
+    RelinquishOpenCLKernel(clEnv, modulateKernel);
+  if (queue != NULL)                          
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  return outputReady;
+
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     M o d u l a t e I m a g e  w i t h  O p e n C L                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
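+%  AccelerateModulateImage() is the OpenCL-accelerated counterpart of
+%  ModulateImage(), which lets you control the brightness, saturation, and
+%  hue of an image.  ModulateImage() represents the brightness, saturation,
+%  and hue as one parameter (e.g. 90,150,100); this method takes them as
+%  separate percentages.  If the image colorspace is HSL, the modulation is
+%  lightness, saturation, and hue.  For HWB, use blackness, whiteness, and
+%  hue.  And for HCL, use chroma, luma, and hue.
+%
+%  The format of the AccelerateModulateImage method is:
+%
+%      MagickBooleanType AccelerateModulateImage(Image *image,
+%        double percent_brightness,double percent_hue,
+%        double percent_saturation,ColorspaceType colorspace,
+%        ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o percent_*: Define the percent change in brightness, saturation, and
+%      hue.
+%
+%    o colorspace: the modulation colorspace.  Only HSLColorspace and
+%      UndefinedColorspace are accelerated; for any other value the method
+%      returns MagickFalse.
+%
+%    o exception: return any errors or warnings in this structure.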
+%
+*/
+
+/*
+  Entry point for the OpenCL modulate path.  Returns MagickFalse when the
+  OpenCL environment check or the accelerate-condition check fails, or when
+  the requested colorspace is neither HSL nor undefined; otherwise returns
+  the result of ComputeModulateImage().
+*/
+MagickExport
+MagickBooleanType AccelerateModulateImage(Image* image, double percent_brightness, double percent_hue, double percent_saturation, ColorspaceType colorspace, ExceptionInfo* exception)
+{
+  MagickBooleanType result;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  /* the OpenCL runtime must be usable ... */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return MagickFalse;
+
+  /* ... and the image must satisfy the generic acceleration conditions */
+  if (checkAccelerateCondition(image, AllChannels, exception) == MagickFalse)
+    return MagickFalse;
+
+  /* only the HSL (or unspecified) modulation is handled here */
+  switch (colorspace)
+  {
+    case HSLColorspace:
+    case UndefinedColorspace:
+      break;
+    default:
+      return MagickFalse;
+  }
+
+  result = ComputeModulateImage(image,percent_brightness, percent_hue, percent_saturation, colorspace, exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return result;
+}
+
+
+/*
+  ComputeEqualizeImage() applies histogram equalization to inputImage using
+  OpenCL, in three steps:
+
+    1. the "Histogram" kernel accumulates a histogram of the pixels,
+    2. the host integrates the histogram into an equalization map (and remaps
+       the colormap of PseudoClass images),
+    3. the "Equalize" kernel applies the map to the pixels, which are then
+       read back into the buffer returned by GetPixelCachePixels().
+
+  Returns MagickTrue once the equalized pixels have been read back and
+  MagickFalse on any failure (an exception is recorded).  Every host
+  allocation, OpenCL buffer, kernel and the command queue acquired here is
+  released through the single cleanup path, whatever the outcome.
+*/
+MagickExport MagickBooleanType ComputeEqualizeImage(Image *inputImage, const ChannelType channel, ExceptionInfo * _exception)
+{
+#define EqualizeImageTag  "Equalize/Image"
+
+  ExceptionInfo
+    *exception=_exception;
+
+  FloatPixelPacket
+    white,
+    black,
+    intensity,
+    *map;
+
+  cl_uint4
+    *histogram;
+
+  PixelPacket
+    *equalize_map;
+
+  register ssize_t
+    i;
+
+  Image * image = inputImage;
+
+  MagickBooleanType outputReady;
+  MagickCLEnv clEnv;
+
+  cl_int clStatus;
+  size_t global_work_size[2];
+
+  void *inputPixels;
+  cl_mem_flags mem_flags;
+
+  cl_context context;
+  cl_mem inputImageBuffer;
+  cl_mem histogramBuffer;
+  cl_mem equalizeMapBuffer;
+  cl_kernel histogramKernel;
+  cl_kernel equalizeKernel;
+  cl_command_queue queue;
+  cl_int colorspace;
+
+  void* hostPtr;
+
+  MagickSizeType length;
+
+  /*
+    Everything that is released in the cleanup section starts out NULL so
+    that cleanup can run safely from any point of the function.
+  */
+  map = NULL;
+  histogram = NULL;
+  equalize_map = NULL;
+  clEnv = NULL;
+  inputPixels = NULL;
+  inputImageBuffer = NULL;
+  histogramBuffer = NULL;
+  equalizeMapBuffer = NULL;
+  histogramKernel = NULL;
+  equalizeKernel = NULL;
+  context = NULL;
+  queue = NULL;
+  /* sRGB (0) by default so the kernel never receives an indeterminate value */
+  colorspace = 0;
+  outputReady = MagickFalse;
+
+  assert(inputImage != (Image *) NULL);
+  assert(inputImage->signature == MagickSignature);
+  if (inputImage->debug != MagickFalse)
+    (void) LogMagickEvent(TraceEvent,GetMagickModule(),"%s",inputImage->filename);
+
+  /*
+    Allocate and initialize histogram arrays.
+  */
+  histogram=(cl_uint4 *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*histogram));
+  if (histogram == (cl_uint4 *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),ResourceLimitWarning,"MemoryAllocationFailed","`%s'",image->filename);
+    goto cleanup;
+  }
+
+  /* reset histogram */
+  (void) ResetMagickMemory(histogram,0,(MaxMap+1)*sizeof(*histogram));
+
+  /*
+   * initialize opencl env
+   */
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+
+  /* Create and initialize OpenCL buffers. */
+  /* assume this  will get a writable image */
+  inputPixels = GetPixelCachePixels(inputImage, &length, exception);
+
+  if (inputPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+  /* If the host pointer is aligned to the size of CLPixelPacket,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket))
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* If the host pointer is aligned to the size of cl_uint4,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(histogram,cl_uint4))
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+    hostPtr = histogram;
+  }
+  else
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = histogram;
+  }
+  /* create a CL buffer for histogram  */
+  length = (MaxMap+1);
+  histogramBuffer = clCreateBuffer(context, mem_flags, length * sizeof(cl_uint4), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  switch (inputImage->colorspace)
+  {
+  case RGBColorspace:
+    colorspace = 1;
+    break;
+  case sRGBColorspace:
+    colorspace = 0;
+    break;
+  default:
+    {
+    /* something is wrong, as we checked in checkAccelerateCondition;
+       colorspace keeps its initial value of 0 */
+    }
+  }
+
+  /* get the OpenCL kernel */
+  histogramKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Histogram");
+  if (histogramKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* set the kernel arguments */
+  i = 0;
+  clStatus=clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+  clStatus|=clSetKernelArg(histogramKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clSetKernelArg(histogramKernel,i++,sizeof(cl_int),&colorspace);
+  clStatus|=clSetKernelArg(histogramKernel,i++,sizeof(cl_mem),(void *)&histogramBuffer);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* launch the kernel */
+  global_work_size[0] = inputImage->columns;
+  global_work_size[1] = inputImage->rows;
+
+  clStatus = clEnqueueNDRangeKernel(queue, histogramKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+  /* read from the kernel output */
+  if (ALIGNED(histogram,cl_uint4))
+  {
+    length = (MaxMap+1);
+    clEnqueueMapBuffer(queue, histogramBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(cl_uint4), 0, NULL, NULL, &clStatus);
+  }
+  else
+  {
+    length = (MaxMap+1);
+    clStatus = clEnqueueReadBuffer(queue, histogramBuffer, CL_TRUE, 0, length * sizeof(cl_uint4), histogram, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* unmap, don't block gpu to use this buffer again.  */
+  if (ALIGNED(histogram,cl_uint4))
+  {
+    clStatus = clEnqueueUnmapMemObject(queue, histogramBuffer, histogram, 0, NULL, NULL);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueUnmapMemObject failed.", "'%s'", ".");
+      goto cleanup;
+    }
+  }
+
+  /* cpu stuff */
+  equalize_map=(PixelPacket *) AcquireQuantumMemory(MaxMap+1UL, sizeof(*equalize_map));
+  if (equalize_map == (PixelPacket *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),ResourceLimitWarning,"MemoryAllocationFailed","`%s'",image->filename);
+    goto cleanup;
+  }
+
+  map=(FloatPixelPacket *) AcquireQuantumMemory(MaxMap+1UL,sizeof(*map));
+  if (map == (FloatPixelPacket *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),ResourceLimitWarning,"MemoryAllocationFailed","`%s'",image->filename);
+    goto cleanup;
+  }
+
+  /*
+    Integrate the histogram to get the equalization map.
+    Histogram lanes: s[2] = red, s[1] = green, s[0] = blue, s[3] = opacity.
+  */
+  (void) ResetMagickMemory(&intensity,0,sizeof(intensity));
+  for (i=0; i <= (ssize_t) MaxMap; i++)
+  {
+    if ((channel & SyncChannels) != 0)
+      {
+        /* synchronized channels share one cumulative curve, kept in .red */
+        intensity.red+=histogram[i].s[2];
+        map[i]=intensity;
+        continue;
+      }
+    if ((channel & RedChannel) != 0)
+      intensity.red+=histogram[i].s[2];
+    if ((channel & GreenChannel) != 0)
+      intensity.green+=histogram[i].s[1];
+    if ((channel & BlueChannel) != 0)
+      intensity.blue+=histogram[i].s[0];
+    if ((channel & OpacityChannel) != 0)
+      intensity.opacity+=histogram[i].s[3];
+    /* NOTE: the CMYK index (black) channel is not accumulated by this path */
+    map[i]=intensity;
+  }
+  black=map[0];
+  white=map[(int) MaxMap];
+  (void) ResetMagickMemory(equalize_map,0,(MaxMap+1)*sizeof(*equalize_map));
+  for (i=0; i <= (ssize_t) MaxMap; i++)
+  {
+    if ((channel & SyncChannels) != 0)
+      {
+        if (white.red != black.red)
+          equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+            (map[i].red-black.red))/(white.red-black.red)));
+        continue;
+      }
+    if (((channel & RedChannel) != 0) && (white.red != black.red))
+      equalize_map[i].red=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+        (map[i].red-black.red))/(white.red-black.red)));
+    if (((channel & GreenChannel) != 0) && (white.green != black.green))
+      equalize_map[i].green=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+        (map[i].green-black.green))/(white.green-black.green)));
+    if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
+      equalize_map[i].blue=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+        (map[i].blue-black.blue))/(white.blue-black.blue)));
+    if (((channel & OpacityChannel) != 0) && (white.opacity != black.opacity))
+      equalize_map[i].opacity=ScaleMapToQuantum((MagickRealType) ((MaxMap*
+        (map[i].opacity-black.opacity))/(white.opacity-black.opacity)));
+    /* NOTE: the CMYK index (black) channel is not mapped by this path */
+  }
+
+  /*
+    histogram and map are no longer read below, but histogram may still back
+    histogramBuffer (CL_MEM_USE_HOST_PTR), so both are freed in the cleanup
+    section after the OpenCL buffers have been released.
+  */
+
+  if (image->storage_class == PseudoClass)
+  {
+      /*
+        Equalize colormap.
+      */
+      for (i=0; i < (ssize_t) image->colors; i++)
+      {
+        if ((channel & SyncChannels) != 0)
+          {
+            if (white.red != black.red)
+              {
+                image->colormap[i].red=equalize_map[
+                  ScaleQuantumToMap(image->colormap[i].red)].red;
+                image->colormap[i].green=equalize_map[
+                  ScaleQuantumToMap(image->colormap[i].green)].red;
+                image->colormap[i].blue=equalize_map[
+                  ScaleQuantumToMap(image->colormap[i].blue)].red;
+                image->colormap[i].opacity=equalize_map[
+                  ScaleQuantumToMap(image->colormap[i].opacity)].red;
+              }
+            continue;
+          }
+        if (((channel & RedChannel) != 0) && (white.red != black.red))
+          image->colormap[i].red=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].red)].red;
+        if (((channel & GreenChannel) != 0) && (white.green != black.green))
+          image->colormap[i].green=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].green)].green;
+        if (((channel & BlueChannel) != 0) && (white.blue != black.blue))
+          image->colormap[i].blue=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].blue)].blue;
+        if (((channel & OpacityChannel) != 0) &&
+            (white.opacity != black.opacity))
+          image->colormap[i].opacity=equalize_map[
+            ScaleQuantumToMap(image->colormap[i].opacity)].opacity;
+      }
+  }
+
+  /*
+    Equalize image.
+  */
+
+  /* GPU can work on this again, image and equalize map as input
+    image:        uchar4 (CLPixelPacket)
+    equalize_map: uchar4 (PixelPacket)
+    black, white: float4 (FloatPixelPacket) */
+
+  /* drop the read-only view of the pixels; it is re-created read/write below */
+  if (inputImageBuffer!=NULL)
+  {
+    clReleaseMemObject(inputImageBuffer);
+    inputImageBuffer = NULL;
+  }
+  /* If the host pointer is aligned to the size of CLPixelPacket,
+     then use the host buffer directly from the GPU; otherwise,
+     create a buffer on the GPU and copy the data over */
+  if (ALIGNED(inputPixels,CLPixelPacket))
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR;
+  }
+  else
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* Create and initialize OpenCL buffers. */
+  if (ALIGNED(equalize_map, PixelPacket))
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = equalize_map;
+  }
+  else
+  {
+    mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR;
+    hostPtr = equalize_map;
+  }
+  /* create a CL buffer for equalize_map  */
+  length = (MaxMap+1);
+  equalizeMapBuffer = clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* get the OpenCL kernel */
+  equalizeKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Equalize");
+  if (equalizeKernel == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* set the kernel arguments */
+  i = 0;
+  clStatus=clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&inputImageBuffer);
+  clStatus|=clSetKernelArg(equalizeKernel,i++,sizeof(ChannelType),&channel);
+  clStatus|=clSetKernelArg(equalizeKernel,i++,sizeof(cl_mem),(void *)&equalizeMapBuffer);
+  clStatus|=clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&white);
+  clStatus|=clSetKernelArg(equalizeKernel,i++,sizeof(FloatPixelPacket),&black);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* launch the kernel */
+  global_work_size[0] = inputImage->columns;
+  global_work_size[1] = inputImage->rows;
+
+  clStatus = clEnqueueNDRangeKernel(queue, equalizeKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  clFlush(queue);
+
+  /* read the data back */
+  if (ALIGNED(inputPixels,CLPixelPacket))
+  {
+    length = inputImage->columns * inputImage->rows;
+    clEnqueueMapBuffer(queue, inputImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, inputImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+
+  if (inputPixels) {
+    /*ReleasePixelCachePixels();*/
+    inputPixels = NULL;
+  }
+
+  /* release the OpenCL objects first: some of them may wrap host memory */
+  if (inputImageBuffer!=NULL)
+    clReleaseMemObject(inputImageBuffer);
+  if (histogramBuffer!=NULL)
+    clReleaseMemObject(histogramBuffer);
+  if (equalizeMapBuffer!=NULL)
+    clReleaseMemObject(equalizeMapBuffer);
+  if (histogramKernel!=NULL)
+    RelinquishOpenCLKernel(clEnv, histogramKernel);
+  if (equalizeKernel!=NULL)
+    RelinquishOpenCLKernel(clEnv, equalizeKernel);
+  if (queue != NULL)
+    RelinquishOpenCLCommandQueue(clEnv, queue);
+
+  /* now the host-side work arrays can go */
+  if (histogram != (cl_uint4 *) NULL)
+    histogram=(cl_uint4 *) RelinquishMagickMemory(histogram);
+  if (map != (FloatPixelPacket *) NULL)
+    map=(FloatPixelPacket *) RelinquishMagickMemory(map);
+  if (equalize_map != (PixelPacket *) NULL)
+    equalize_map=(PixelPacket *) RelinquishMagickMemory(equalize_map);
+
+  return outputReady;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     E q u a l i z e I m a g e  w i t h  O p e n C L                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
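+%  AccelerateEqualizeImage() applies a histogram equalization to the image
+%  using OpenCL.  It returns MagickFalse when the OpenCL environment or the
+%  accelerate conditions are not met, when the channel does not include
+%  SyncChannels, or when the image colorspace is not sRGB.
+%
+%  The format of the AccelerateEqualizeImage method is:
+%
+%      MagickBooleanType AccelerateEqualizeImage(Image *image,
+%        const ChannelType channel,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o channel: the channel.
+%
+%    o exception: return any errors or warnings in this structure.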
+%
+*/
+
+
+/*
+  Entry point for the OpenCL equalize path.  Returns MagickFalse when the
+  OpenCL environment check or the accelerate-condition check fails, when
+  the channel mask lacks SyncChannels, or when the image is not sRGB;
+  otherwise returns the result of ComputeEqualizeImage().
+*/
+MagickExport
+MagickBooleanType AccelerateEqualizeImage(Image* image, const ChannelType channel, ExceptionInfo* exception)
+{
+  MagickBooleanType result;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return MagickFalse;
+
+  if (checkAccelerateCondition(image, channel, exception) == MagickFalse)
+    return MagickFalse;
+
+  /* for now only synchronized channels on sRGB images take this path */
+  if (((channel & SyncChannels) == 0) ||
+      (image->colorspace != sRGBColorspace))
+    return MagickFalse;
+
+  result = ComputeEqualizeImage(image,channel,exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return result;
+}
+
+
+/*
+  EnqueueHullPasses() programs the offset (kernel argument 4) and polarity
+  (kernel argument 5) on both hull kernels, then enqueues HullPass1 followed
+  by HullPass2 over the given 2-D global work size.  On any OpenCL failure
+  an exception is recorded and MagickFalse is returned.
+*/
+static MagickBooleanType EnqueueHullPasses(cl_command_queue queue,
+  cl_kernel hullPass1, cl_kernel hullPass2, const size_t *global_work_size,
+  const int x, const int y, int polarity, ExceptionInfo *exception)
+{
+  cl_int clStatus;
+  cl_int2 offset;
+
+  offset.s[0] = x;
+  offset.s[1] = y;
+  clStatus = clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset);
+  clStatus |= clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity);
+  clStatus |= clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset);
+  clStatus |= clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    return MagickFalse;
+  }
+
+  /* launch the first pass */
+  clStatus = clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    return MagickFalse;
+  }
+
+  /* launch the second pass */
+  clStatus = clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clEnqueueNDRangeKernel failed.", "'%s'", ".");
+    return MagickFalse;
+  }
+
+  return MagickTrue;
+}
+
+/*
+  ComputeDespeckleImage() runs the "HullPass1"/"HullPass2" OpenCL kernels
+  over inputImage for each of four directions, with both offset signs and
+  both polarities, and reads the result back into a clone of the input.
+
+  Returns the newly created image on success, or NULL on any failure (an
+  exception is recorded and the partially built clone is destroyed).
+*/
+static Image* ComputeDespeckleImage(const Image* inputImage, ExceptionInfo* exception)
+{
+
+  MagickBooleanType outputReady = MagickFalse;
+  MagickCLEnv clEnv = NULL;
+
+  cl_int clStatus;
+  size_t global_work_size[2];
+
+  const void *inputPixels = NULL;
+  Image* filteredImage = NULL;
+  void *filteredPixels = NULL;
+  void *hostPtr;
+  MagickSizeType length;
+
+  cl_mem_flags mem_flags;
+  cl_context context = NULL;
+  cl_mem inputImageBuffer = NULL;
+  cl_mem tempImageBuffer[2];
+  cl_mem filteredImageBuffer = NULL;
+  cl_command_queue queue = NULL;
+  cl_kernel hullPass1 = NULL;
+  cl_kernel hullPass2 = NULL;
+
+  unsigned int imageWidth, imageHeight;
+  int matte;
+  int k;
+
+  /* the four hull directions */
+  static const int
+    X[4] = {0, 1, 1,-1},
+    Y[4] = {1, 0, 1, 1};
+
+  tempImageBuffer[0] = tempImageBuffer[1] = NULL;
+  clEnv = GetDefaultOpenCLEnv();
+  context = GetOpenCLContext(clEnv);
+  queue = AcquireOpenCLCommandQueue(clEnv);
+  inputPixels = AcquirePixelCachePixels(inputImage, &length, exception);
+  if (inputPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",inputImage->filename);
+    goto cleanup;
+  }
+
+  /* use the host pixels directly when suitably aligned, otherwise copy them */
+  if (ALIGNED(inputPixels,CLPixelPacket))
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR;
+  }
+  else
+  {
+    mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  inputImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  /* two device-only scratch buffers the passes ping-pong between */
+  mem_flags = CL_MEM_READ_WRITE;
+  length = inputImage->columns * inputImage->rows;
+  for (k = 0; k < 2; k++)
+  {
+    tempImageBuffer[k] = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus);
+    if (clStatus != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+      goto cleanup;
+    }
+  }
+
+  filteredImage = CloneImage(inputImage,inputImage->columns,inputImage->rows,MagickTrue,exception);
+  /* a failed clone is a runtime error, not a programming error: bail out
+     instead of asserting (which vanishes under NDEBUG) */
+  if (filteredImage == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  if (SetImageStorageClass(filteredImage,DirectClass) != MagickTrue)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  filteredPixels = GetPixelCachePixels(filteredImage, &length, exception);
+  if (filteredPixels == (void *) NULL)
+  {
+    (void) ThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename);
+    goto cleanup;
+  }
+
+  if (ALIGNED(filteredPixels,CLPixelPacket))
+  {
+    mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR;
+    hostPtr = filteredPixels;
+  }
+  else
+  {
+    mem_flags = CL_MEM_WRITE_ONLY;
+    hostPtr = NULL;
+  }
+  /* create a CL buffer from image pixel buffer */
+  length = inputImage->columns * inputImage->rows;
+  filteredImageBuffer = clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clCreateBuffer failed.",".");
+    goto cleanup;
+  }
+
+  hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1");
+  hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2");
+  if ((hullPass1 == NULL) || (hullPass2 == NULL))
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "AcquireOpenCLKernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  imageWidth = inputImage->columns;
+  imageHeight = inputImage->rows;
+  matte = (inputImage->matte==MagickFalse)?0:1;
+
+  /* HullPass1 initially reads the input image and writes tempImageBuffer[1] */
+  clStatus =clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&inputImageBuffer);
+  clStatus |=clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  clStatus |=clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth);
+  clStatus |=clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight);
+  clStatus |=clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* HullPass2 reads tempImageBuffer[1] and writes tempImageBuffer[0] */
+  clStatus = clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1));
+  clStatus |=clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer);
+  clStatus |=clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth);
+  clStatus |=clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight);
+  clStatus |=clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  global_work_size[0] = inputImage->columns;
+  global_work_size[1] = inputImage->rows;
+
+  for (k = 0; k < 4; k++)
+  {
+    /* (+X,+Y), polarity +1 */
+    if (EnqueueHullPasses(queue,hullPass1,hullPass2,global_work_size,X[k],Y[k],1,exception) == MagickFalse)
+      goto cleanup;
+
+    if (k == 0)
+    {
+      /* from now on HullPass1 consumes the previous result in
+         tempImageBuffer[0] instead of the original input */
+      clStatus =clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer));
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* (-X,-Y), polarity +1 */
+    if (EnqueueHullPasses(queue,hullPass1,hullPass2,global_work_size,-X[k],-Y[k],1,exception) == MagickFalse)
+      goto cleanup;
+
+    /* (-X,-Y), polarity -1 */
+    if (EnqueueHullPasses(queue,hullPass1,hullPass2,global_work_size,-X[k],-Y[k],-1,exception) == MagickFalse)
+      goto cleanup;
+
+    if (k == 3)
+    {
+      /* the very last HullPass2 writes into the output image buffer */
+      clStatus =clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer);
+      if (clStatus != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clSetKernelArg failed.", "'%s'", ".");
+        goto cleanup;
+      }
+    }
+
+    /* (+X,+Y), polarity -1 */
+    if (EnqueueHullPasses(queue,hullPass1,hullPass2,global_work_size,X[k],Y[k],-1,exception) == MagickFalse)
+      goto cleanup;
+  }
+
+  /* blocking map/read: waits for the result and makes it visible on the host */
+  if (ALIGNED(filteredPixels,CLPixelPacket))
+  {
+    length = inputImage->columns * inputImage->rows;
+    clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
+  }
+  else
+  {
+    length = inputImage->columns * inputImage->rows;
+    clStatus = clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
+  }
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Reading output image from CL buffer failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  outputReady = MagickTrue;
+
+cleanup:
+  if (queue != NULL)                          RelinquishOpenCLCommandQueue(clEnv, queue);
+  if (inputImageBuffer!=NULL)                clReleaseMemObject(inputImageBuffer);
+  for (k = 0; k < 2; k++)
+  {
+    if (tempImageBuffer[k]!=NULL)            clReleaseMemObject(tempImageBuffer[k]);
+  }
+  if (filteredImageBuffer!=NULL)             clReleaseMemObject(filteredImageBuffer);
+  if (hullPass1!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass1);
+  if (hullPass2!=NULL)                       RelinquishOpenCLKernel(clEnv, hullPass2);
+  if (outputReady == MagickFalse)
+  {
+    /* never hand a half-processed clone back to the caller */
+    if (filteredImage != NULL)
+      filteredImage = DestroyImage(filteredImage);
+  }
+  return filteredImage;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%     D e s p e c k l e I m a g e  w i t h  O p e n C L                       %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  DespeckleImage() reduces the speckle noise in an image while preserving the
+%  edges of the original image.  A speckle removing filter uses a complementary 
+%  hulling technique (raising pixels that are darker than their surrounding
+%  neighbors, then complementarily lowering pixels that are brighter than their
+%  surrounding neighbors) to reduce the speckle index of that image (reference
+%  Crimmins speckle removal).
+%
+%  The format of the AccelerateDespeckleImage method is:
+%
+%      Image *AccelerateDespeckleImage(const Image *image,ExceptionInfo *exception)
+%
+%  A description of each parameter follows:
+%
+%    o image: the image.
+%
+%    o exception: return any errors or warnings in this structure.
+%
+*/
+
+MagickExport
+Image* AccelerateDespeckleImage(const Image* image, ExceptionInfo* exception)
+{
+  Image* despeckled;
+
+  assert(image != NULL);
+  assert(exception != NULL);
+
+  /*
+    Two gates are evaluated in order; if either one reports MagickFalse the
+    function returns NULL without calling ComputeDespeckleImage().
+  */
+  if (checkOpenCLEnvironment(exception) == MagickFalse)
+    return NULL;
+  if (checkAccelerateCondition(image, AllChannels, exception) == MagickFalse)
+    return NULL;
+
+  /* the result of ComputeDespeckleImage() is returned as-is (may be NULL) */
+  despeckled = ComputeDespeckleImage(image,exception);
+  OpenCLLogException(__FUNCTION__,__LINE__,exception);
+  return despeckled;
+}
+
+#else  /* MAGICKCORE_OPENCL_SUPPORT  */
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateConvolveImageChannel(
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const KernelInfo *magick_unused(kernel),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(kernel);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and MagickFalse is returned.
+*/
+MagickExport MagickBooleanType AccelerateFunctionImage(
+  Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const MagickFunction magick_unused(function),
+  const size_t magick_unused(number_parameters),
+  const double *magick_unused(parameters),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(parameters);
+  magick_unreferenced(number_parameters);
+  magick_unreferenced(function);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return(MagickFalse);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateBlurImage(const Image *magick_unused(image),
+  const ChannelType magick_unused(channel),const double magick_unused(radius),
+  const double magick_unused(sigma),ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(sigma);
+  magick_unreferenced(radius);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateRadialBlurImage(
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double magick_unused(angle),ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(angle);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateUnsharpMaskImage(
+  const Image *magick_unused(image),const ChannelType magick_unused(channel),
+  const double magick_unused(radius),const double magick_unused(sigma),
+  const double magick_unused(gain),const double magick_unused(threshold),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(threshold);
+  magick_unreferenced(gain);
+  magick_unreferenced(sigma);
+  magick_unreferenced(radius);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and MagickFalse is returned.
+*/
+MagickExport MagickBooleanType AccelerateContrastImage(
+  Image* magick_unused(image),const MagickBooleanType magick_unused(sharpen),
+  ExceptionInfo* magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(sharpen);
+  magick_unreferenced(image);
+
+  return(MagickFalse);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and MagickFalse is returned.
+*/
+MagickExport MagickBooleanType AccelerateEqualizeImage(
+  Image* magick_unused(image), const ChannelType magick_unused(channel),
+  ExceptionInfo* magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(channel);
+  magick_unreferenced(image);
+
+  return(MagickFalse);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  both arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateDespeckleImage(const Image* magick_unused(image),
+  ExceptionInfo* magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and NULL is returned.
+*/
+MagickExport Image *AccelerateResizeImage(const Image* magick_unused(image),
+  const size_t magick_unused(resizedColumns),
+  const size_t magick_unused(resizedRows),
+  const ResizeFilter* magick_unused(resizeFilter),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(resizeFilter);
+  magick_unreferenced(resizedRows);
+  magick_unreferenced(resizedColumns);
+  magick_unreferenced(image);
+
+  return((Image *) NULL);
+}
+
+
+/*
+  Stub from the #else branch of the MAGICKCORE_OPENCL_SUPPORT conditional:
+  all arguments are ignored and MagickFalse is returned.
+
+  The parameters are wrapped in magick_unused() like the other stubs in this
+  branch.  magick_unreferenced() expands to nothing on GCC, so without the
+  wrapper every parameter would be reported as unused there.
+*/
+MagickExport MagickBooleanType AccelerateModulateImage(
+  Image* magick_unused(image),double magick_unused(percent_brightness),
+  double magick_unused(percent_hue),double magick_unused(percent_saturation),
+  ColorspaceType magick_unused(colorspace),
+  ExceptionInfo* magick_unused(exception))
+{
+  magick_unreferenced(image);
+  magick_unreferenced(percent_brightness);
+  magick_unreferenced(percent_hue);
+  magick_unreferenced(percent_saturation);
+  magick_unreferenced(colorspace);
+  magick_unreferenced(exception);
+
+  return(MagickFalse);
+}
+
+
+#endif /* MAGICKCORE_OPENCL_SUPPORT */
+
+/*
+  Legacy entry point, do not use: it is defined outside the
+  MAGICKCORE_OPENCL_SUPPORT conditional, ignores every argument and always
+  returns MagickFalse.
+*/
+MagickExport MagickBooleanType AccelerateConvolveImage(
+  const Image *magick_unused(image),const KernelInfo *magick_unused(kernel),
+  Image *magick_unused(convolve_image),ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+  magick_unreferenced(convolve_image);
+  magick_unreferenced(kernel);
+  magick_unreferenced(image);
+
+  return MagickFalse;
+}
+
index 3624adb0c21622aa1f9d7c4b0eea2704c8459c13..a6daae68fc43d4b80f75a853c8a14439b5bffe67 100644 (file)
@@ -23,10 +23,34 @@ extern "C" {
 #endif
 
 #include "MagickCore/morphology.h"
+#include "MagickCore/resample.h"
+#include "MagickCore/resize.h"
+#include "MagickCore/statistic.h"
 
 extern MagickExport MagickBooleanType
+  AccelerateContrastImage(Image *,const MagickBooleanType,ExceptionInfo *),
   AccelerateConvolveImage(const Image *,const KernelInfo *,Image *,
-    ExceptionInfo *);
+    ExceptionInfo *),
+  AccelerateEqualizeImage(Image *,const ChannelType,ExceptionInfo *),
+  AccelerateFunctionImage(Image *,const ChannelType,const MagickFunction,
+    const size_t,const double *,ExceptionInfo *),
+  AccelerateModulateImage(Image* image, double percent_brightness, 
+    double percent_hue, double percent_saturation, 
+    ColorspaceType colorspace, ExceptionInfo* exception);
+
+
+extern MagickExport Image
+  *AccelerateBlurImage(const Image *,const ChannelType,const double,
+    const double,ExceptionInfo *),
+  *AccelerateConvolveImageChannel(const Image *,const ChannelType,
+    const KernelInfo *,ExceptionInfo *),
+  *AccelerateDespeckleImage(const Image *,ExceptionInfo *),
+  *AccelerateRadialBlurImage(const Image *,const ChannelType,const double,
+    ExceptionInfo *),
+  *AccelerateResizeImage(const Image *,const size_t,const size_t,
+    const ResizeFilter *,ExceptionInfo *),
+  *AccelerateUnsharpMaskImage(const Image *,const ChannelType,const double,
+    const double,const double,const double,ExceptionInfo *);
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }
index 1d980d41cc7ae2c6c3e90914775a3f1243c5f2c3..e8e30fca69d5cbc70ada532bdb34369e43d18067 100644 (file)
@@ -112,14 +112,17 @@ extern "C" {
 #  define magick_aligned(x,y)  x __attribute__((aligned(y)))
 #  define magick_attribute  __attribute__
 #  define magick_unused(x)  magick_unused_ ## x __attribute__((unused))
+#  define magick_unreferenced(x)  /* nothing */
 #elif defined(MAGICKCORE_WINDOWS_SUPPORT) && !defined(__CYGWIN__)
 #  define magick_aligned(x,y)  __declspec(align(y)) x
 #  define magick_attribute(x)  /* nothing */
 #  define magick_unused(x) x
+#  define magick_unreferenced(x) (x)
 #else
 #  define magick_aligned(x,y)  /* nothing */
 #  define magick_attribute(x)  /* nothing */
 #  define magick_unused(x) x
+#  define magick_unreferenced(x)  /* nothing */
 #endif
 
 #if (((__GNUC__) > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
index 2f4043104f2718f6e6c3359bb89a7a33e4695525..03ff401a59a32cb52bda0f845094004db67b5bc3 100644 (file)
 /*
-  Copyright 1999-2014 ImageMagick Studio LLC, a non-profit organization
-  dedicated to making software imaging solutions freely available.
-  
-  You may not use this file except in compliance with the License.
-  obtain a copy of the License at
-  
-    http://www.imagemagick.org/script/license.php
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-
-  MagickCore OpenCL private methods.
+Copyright 1999-2014 ImageMagick Studio LLC, a non-profit organization
+dedicated to making software imaging solutions freely available.
+
+You may not use this file except in compliance with the License.
+obtain a copy of the License at
+
+http://www.imagemagick.org/script/license.php
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+MagickCore OpenCL private methods.
 */
 #ifndef _MAGICKCORE_OPENCL_PRIVATE_H
 #define _MAGICKCORE_OPENCL_PRIVATE_H
 
+/*
+Include declarations.
+*/
+#include "MagickCore/studio.h"
+
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
+/*
+  With OpenCL support the real types come from <CL/cl.h>.  Otherwise the
+  handle types are declared as plain void pointers and cl_device_type as an
+  8-byte struct -- presumably so that the declarations in this header that
+  name cl_* types still compile without the OpenCL headers (TODO confirm).
+*/
+#if defined(MAGICKCORE_OPENCL_SUPPORT)
+#include <CL/cl.h>
+#else
+  typedef void* cl_platform_id;
+  typedef void* cl_device_id;
+  typedef void* cl_context;
+  typedef void* cl_command_queue;
+  typedef void* cl_kernel;
+  typedef struct { unsigned char t[8]; } cl_device_type; /* 64-bit */
+#endif
+
+/*
+  Per-quantum-configuration definitions:
+
+    CLOptions           printf-style template for option text.  In every
+                        branch it has seven conversions, in this order:
+                        QuantumRange (%f), QuantumScale (%f),
+                        CharQuantumScale (%f), MagickEpsilon (%f),
+                        MagickPI (%f), MaxMap (%u),
+                        MAGICKCORE_QUANTUM_DEPTH (%u).
+    CLPixelPacket       four-component cl_* vector type for the configuration.
+    CLCharQuantumScale  float constant for the configuration.
+
+  There is no #else branch: if HDRI is off and MAGICKCORE_QUANTUM_DEPTH is
+  not 8, 16, 32 or 64, none of the three macros is defined.
+*/
+#if defined(MAGICKCORE_HDRI_SUPPORT)
+#define CLOptions "-cl-single-precision-constant -cl-mad-enable -DMAGICKCORE_HDRI_SUPPORT=1 "\
+  "-DCLQuantum=float -DCLSignedQuantum=float -DCLPixelType=float4 -DQuantumRange=%f " \
+  "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\
+  " -DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u"
+#define CLPixelPacket  cl_float4
+#define CLCharQuantumScale 1.0f
+#elif (MAGICKCORE_QUANTUM_DEPTH == 8)
+#define CLOptions "-cl-single-precision-constant -cl-mad-enable " \
+  "-DCLQuantum=uchar -DCLSignedQuantum=char -DCLPixelType=uchar4 -DQuantumRange=%f " \
+  "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\
+  "-DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u"
+#define CLPixelPacket  cl_uchar4
+#define CLCharQuantumScale 1.0f
+#elif (MAGICKCORE_QUANTUM_DEPTH == 16)
+#define CLOptions "-cl-single-precision-constant -cl-mad-enable " \
+  "-DCLQuantum=ushort -DCLSignedQuantum=short -DCLPixelType=ushort4 -DQuantumRange=%f "\
+  "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\
+  "-DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u"
+#define CLPixelPacket  cl_ushort4
+#define CLCharQuantumScale 257.0f
+#elif (MAGICKCORE_QUANTUM_DEPTH == 32)
+#define CLOptions "-cl-single-precision-constant -cl-mad-enable " \
+  "-DCLQuantum=uint -DCLSignedQuantum=int -DCLPixelType=uint4 -DQuantumRange=%f "\
+  "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\
+  "-DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u"
+#define CLPixelPacket  cl_uint4
+#define CLCharQuantumScale 16843009.0f
+#elif (MAGICKCORE_QUANTUM_DEPTH == 64)
+#define CLOptions "-cl-single-precision-constant -cl-mad-enable " \
+  "-DCLQuantum=ulong -DCLSignedQuantum=long -DCLPixelType=ulong4 -DQuantumRange=%f "\
+  "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\
+  "-DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u"
+#define CLPixelPacket  cl_ulong4
+#define CLCharQuantumScale 72340172838076673.0f
+#endif
+
+extern MagickExport cl_context 
+  GetOpenCLContext(MagickCLEnv);
+
+extern MagickExport cl_kernel 
+  AcquireOpenCLKernel(MagickCLEnv, MagickOpenCLProgram, const char*);
+
+extern MagickExport cl_command_queue 
+  AcquireOpenCLCommandQueue(MagickCLEnv);
+
+extern MagickExport MagickBooleanType 
+  RelinquishOpenCLCommandQueue(MagickCLEnv, cl_command_queue),
+  RelinquishOpenCLKernel(MagickCLEnv, cl_kernel);
+
+extern MagickExport unsigned long 
+  GetOpenCLDeviceLocalMemorySize(MagickCLEnv),
+  GetOpenCLDeviceMaxMemAllocSize(MagickCLEnv);
+
+extern MagickExport const char* 
+  GetOpenCLCachedFilesDirectory();
+
+extern MagickExport void 
+  OpenCLLog(const char*);
+
+/* #define ACCELERATE_LOG_EXCEPTION 1 */
+/*
+  OpenCLLogException() passes a one-line "function:line Exception(severity)"
+  message to OpenCLLog() when exception->severity is non-zero.
+
+  The body is only compiled when ACCELERATE_LOG_EXCEPTION is defined;
+  otherwise the function does nothing.
+
+    o function: name of the calling function, printed with %s.
+    o line: source line of the call, printed with %u.
+    o exception: the exception whose severity is inspected.
+*/
+static inline void OpenCLLogException(const char* function, 
+                        const unsigned int line, 
+                        ExceptionInfo* exception) {
+#ifdef ACCELERATE_LOG_EXCEPTION
+  if (exception->severity!=0) {
+    char message[MaxTextExtent];
+    /* line is unsigned (%u); severity is cast so it matches %d exactly */
+    (void) FormatLocaleString(message,MaxTextExtent,"%s:%u Exception(%d)"
+      ,function,line,(int) exception->severity);
+    OpenCLLog(message);
+  }
+#else
+  /* logging is compiled out: mark the parameters as intentionally unused */
+  (void) function;
+  (void) line;
+  (void) exception;
+#endif
+}
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
index 0b2fd5e83ac9e3d440a1e4c9b85f81b6bf3f5a70..0c73a29df23962719a1eb30f29b112fa47cecf46 100644 (file)
 %
 %
 */
-\f
 /*
-  Include declarations.
+Include declarations.
 */
+#include <string.h>
 #include "MagickCore/studio.h"
 #include "MagickCore/artifact.h"
 #include "MagickCore/cache.h"
@@ -62,6 +63,8 @@
 #include "MagickCore/monitor.h"
 #include "MagickCore/montage.h"
 #include "MagickCore/morphology.h"
+#include "MagickCore/opencl.h"
+#include "MagickCore/opencl-private.h"
 #include "MagickCore/option.h"
 #include "MagickCore/policy.h"
 #include "MagickCore/property.h"
 #include "MagickCore/resample.h"
 #include "MagickCore/resource_.h"
 #include "MagickCore/splay-tree.h"
+#include "MagickCore/semaphore.h"
 #include "MagickCore/statistic.h"
 #include "MagickCore/string_.h"
 #include "MagickCore/token.h"
 #include "MagickCore/utility.h"
+
+#include <time.h>
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#include <sys/time.h>
+#include <linux/limits.h>
+#endif
+
+
+#ifdef MAGICKCORE_CLPERFMARKER
+#include "CLPerfMarker.h"
+#endif
+
+
+#if defined(MAGICKCORE_OPENCL_SUPPORT)
+
+/*
+  State of one OpenCL environment.  AcquireMagickOpenCLEnv() zero-fills the
+  whole structure before creating the lock.
+*/
+struct _MagickCLEnv {
+  MagickBooleanType OpenCLInitialized;  /* whether OpenCL environment is initialized. */
+  MagickBooleanType OpenCLDisabled;    /* whether OpenCL has been explicitly disabled. */
+
+  /* OpenCL objects */
+  cl_platform_id platform;
+  cl_device_type deviceType;
+  cl_device_id device;
+  cl_context context;
+
+  cl_program programs[MAGICK_OPENCL_NUM_PROGRAMS]; /* one program object maps one kernel source file */
+
+  SemaphoreInfo* lock; /* held by SetMagickOpenCLEnvParam() while it updates the fields */
+};
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   A c q u i r e M a g i c k O p e n C L E n v                               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% AcquireMagickOpenCLEnv() allocates the MagickCLEnv structure 
+%
+*/
+
+MagickExport MagickCLEnv AcquireMagickOpenCLEnv()
+{
+  MagickCLEnv env = (MagickCLEnv) AcquireMagickMemory(sizeof(struct _MagickCLEnv));
+
+  /* allocation failure is reported to the caller as NULL */
+  if (env == NULL)
+    return env;
+
+  /* start from an all-zero structure, then create its semaphore */
+  memset(env, 0, sizeof(struct _MagickCLEnv));
+  AcquireSemaphoreInfo(&env->lock);
+  return env;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   R e l i n q u i s h M a g i c k O p e n C L E n v                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  RelinquishMagickOpenCLEnv() destroys the MagickCLEnv structure
+%
+%  The format of the RelinquishMagickOpenCLEnv method is:
+%
+%      MagickBooleanType RelinquishMagickOpenCLEnv(MagickCLEnv clEnv)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: MagickCLEnv structure to destroy
+%
+*/
+
+MagickExport MagickBooleanType RelinquishMagickOpenCLEnv(MagickCLEnv clEnv)
+{
+  /* nothing to release for a NULL environment */
+  if (clEnv == (MagickCLEnv)NULL)
+    return MagickFalse;
+
+  /* the semaphore goes first, then the structure that holds it */
+  RelinquishSemaphoreInfo(clEnv->lock);
+  RelinquishMagickMemory(clEnv);
+  return MagickTrue;
+}
+
+
+/*
+* Default OpenCL environment
+*
+* Both objects are defined at file scope without an initializer, so they
+* start out as null pointers.  defaultCLEnvLock is the semaphore this file
+* locks around assignments to defaultCLEnv; it is created on first use.
+*/
+MagickCLEnv defaultCLEnv;
+SemaphoreInfo* defaultCLEnvLock;
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   G e t D e f a u l t O p e n C L E n v                                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  GetDefaultOpenCLEnv() returns the default OpenCL env
+%
+%  The format of the GetDefaultOpenCLEnv method is:
+%
+%      MagickCLEnv GetDefaultOpenCLEnv()
+%
+%  This method takes no parameters.  It returns the default environment,
+%  allocating it on first use.
+%
+%
+*/
+
+/*
+  Returns defaultCLEnv, allocating it with AcquireMagickOpenCLEnv() on first
+  use.  The returned value is NULL if that allocation failed.
+*/
+MagickExport MagickCLEnv GetDefaultOpenCLEnv()
+{ 
+  if (defaultCLEnv == NULL)
+  {
+    if (defaultCLEnvLock == NULL)
+    {
+      AcquireSemaphoreInfo(&defaultCLEnvLock);
+    }
+    LockSemaphoreInfo(defaultCLEnvLock);
+    /*
+      Test again now that the lock is held: another caller may have installed
+      an environment between the first test and the lock.  Without this
+      second test both callers would allocate and one environment would be
+      overwritten and leaked.
+    */
+    if (defaultCLEnv == NULL)
+      defaultCLEnv = AcquireMagickOpenCLEnv();
+    UnlockSemaphoreInfo(defaultCLEnvLock); 
+  }
+  return defaultCLEnv; 
+}
+
+/* Locks defaultCLEnvLock, creating the semaphore first if it is still NULL. */
+static void LockDefaultOpenCLEnv() {
+  if (!defaultCLEnvLock)
+    AcquireSemaphoreInfo(&defaultCLEnvLock);
+
+  LockSemaphoreInfo(defaultCLEnvLock);
+}
+
+/*
+  Unlocks defaultCLEnvLock when it exists.  When it is still NULL nothing is
+  unlocked; the semaphore is created instead.
+*/
+static void UnlockDefaultOpenCLEnv() {
+  if (defaultCLEnvLock != NULL)
+  {
+    UnlockSemaphoreInfo(defaultCLEnvLock);
+    return;
+  }
+
+  AcquireSemaphoreInfo(&defaultCLEnvLock);
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   S e t D e f a u l t O p e n C L E n v                                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  SetDefaultOpenCLEnv() sets the new OpenCL environment as default 
+%  and returns the old OpenCL environment
+%  
+%  The format of the SetDefaultOpenCLEnv() method is:
+%
+%      MagickCLEnv SetDefaultOpenCLEnv(MagickCLEnv clEnv)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the new default OpenCL environment.
+%
+*/
+MagickExport MagickCLEnv SetDefaultOpenCLEnv(MagickCLEnv clEnv)     
+{
+  MagickCLEnv previous;
+
+  /* swap the default under the default-environment lock */
+  LockDefaultOpenCLEnv();
+  previous = defaultCLEnv;
+  defaultCLEnv = clEnv;
+  UnlockDefaultOpenCLEnv();
+
+  /* the replaced environment is handed back; it is not released here */
+  return previous;
+} 
+
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   S e t M a g i c k O p e n C L E n v P a r a m                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  SetMagickOpenCLEnvParam() sets the parameters in the OpenCL environment  
+%  
+%  The format of the SetMagickOpenCLEnvParam() method is:
+%
+%      MagickBooleanType SetMagickOpenCLEnvParam(MagickCLEnv clEnv, 
+%        MagickOpenCLEnvParam param, size_t dataSize, void* data, 
+%        ExceptionInfo* exception)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%    
+%    o param: the parameter to be set.
+%
+%    o dataSize: the data size of the parameter value.
+%
+%    o data:  the pointer to the new parameter value
+%
+%    o exception: return any errors or warnings
+%
+*/
+
+/*
+  Unlocked worker for SetMagickOpenCLEnvParam().  Returns MagickTrue only
+  when a DEVICE or OPENCL_DISABLED value of the exact expected size was
+  stored; storing either one also clears OpenCLInitialized.
+*/
+static MagickBooleanType SetMagickOpenCLEnvParamInternal(MagickCLEnv clEnv, MagickOpenCLEnvParam param
+                                          , size_t dataSize, void* data, ExceptionInfo* exception)
+{
+  if ((clEnv == NULL) || (data == NULL))
+    return MagickFalse;
+
+  if (param == MAGICK_OPENCL_ENV_PARAM_DEVICE)
+  {
+    if (dataSize != sizeof(clEnv->device))
+      return MagickFalse;
+    clEnv->device = *((cl_device_id*)data);
+    clEnv->OpenCLInitialized = MagickFalse;
+    return MagickTrue;
+  }
+
+  if (param == MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED)
+  {
+    if (dataSize != sizeof(clEnv->OpenCLDisabled))
+      return MagickFalse;
+    clEnv->OpenCLDisabled =  *((MagickBooleanType*)data);
+    clEnv->OpenCLInitialized = MagickFalse;
+    return MagickTrue;
+  }
+
+  /* the initialization state is read-only: warn and report failure */
+  if (param == MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED)
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleWarning, "SetMagickOpenCLEnvParm cannot modify the OpenCL initialization state.", "'%s'", ".");
+
+  /* unknown parameters fall through to the same failure result */
+  return MagickFalse;
+}
+
+MagickExport
+  MagickBooleanType SetMagickOpenCLEnvParam(MagickCLEnv clEnv, MagickOpenCLEnvParam param
+                                          , size_t dataSize, void* data, ExceptionInfo* exception) {
+  MagickBooleanType result;
+
+  /* a NULL environment has no lock to take */
+  if (clEnv == NULL)
+    return MagickFalse;
+
+  /* the update itself runs with the environment's own lock held */
+  LockSemaphoreInfo(clEnv->lock);
+  result = SetMagickOpenCLEnvParamInternal(clEnv,param,dataSize,data,exception);
+  UnlockSemaphoreInfo(clEnv->lock);
+  return result;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   G e t M a g i c k O p e n C L E n v P a r a m                             %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  GetMagickOpenCLEnvParam() gets the parameters in the OpenCL environment  
+%  
+%  The format of the GetMagickOpenCLEnvParam() method is:
+%
+%      MagickBooleanType GetMagickOpenCLEnvParam(MagickCLEnv clEnv, 
+%        MagickOpenCLEnvParam param, size_t dataSize, void* data, 
+%        ExceptionInfo* exception)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%    
+%    o param: the parameter to be returned.
+%
+%    o dataSize: the data size of the parameter value.
+%
+%    o data:  the location where the returned parameter value will be stored 
+%
+%    o exception: return any errors or warnings
+%
+*/
+
+MagickExport
+  MagickBooleanType GetMagickOpenCLEnvParam(MagickCLEnv clEnv, MagickOpenCLEnvParam param
+                                          , size_t dataSize, void* data, ExceptionInfo* exception)
+{
+  MagickBooleanType status;
+
+  /* this function never throws, so the exception argument is not used */
+  (void) exception;
+
+  status = MagickFalse;
+
+  if (clEnv == NULL
+    || data == NULL)
+    goto cleanup;
+
+  /*
+    Each case copies one field to *data, but only when dataSize matches the
+    size of that field exactly; otherwise MagickFalse is returned and *data
+    is left untouched.
+  */
+  switch(param)
+  {
+  case MAGICK_OPENCL_ENV_PARAM_DEVICE:
+    if (dataSize != sizeof(cl_device_id))
+      goto cleanup;
+    *((cl_device_id*)data) = clEnv->device;
+    status = MagickTrue;
+    break;
+
+  case MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED:
+    if (dataSize != sizeof(clEnv->OpenCLDisabled))
+      goto cleanup;
+    *((MagickBooleanType*)data) = clEnv->OpenCLDisabled;
+    status = MagickTrue;
+    break;
+
+  case MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED:
+    /* size is checked against the field that is actually copied */
+    if (dataSize != sizeof(clEnv->OpenCLInitialized))
+      goto cleanup;
+    *((MagickBooleanType*)data) = clEnv->OpenCLInitialized;
+    status = MagickTrue;
+    break;
+
+  default:
+    goto cleanup;
+  }
+
+cleanup:
+  return status;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   G e t O p e n C L C o n t e x t                                           %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  GetOpenCLContext() returns the OpenCL context  
+%  
+%  The format of the GetOpenCLContext() method is:
+%
+%      cl_context GetOpenCLContext(MagickCLEnv clEnv) 
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: OpenCL environment
+%
+*/
+
+MagickExport
+cl_context GetOpenCLContext(MagickCLEnv clEnv) {
+  /* the context field is returned as stored; NULL in gives NULL out */
+  if (clEnv != NULL)
+    return clEnv->context;
+
+  return NULL;
+}
+
+/*
+  getBinaryCLProgramName() builds the path of the cached binary for one
+  program:
+
+    <cache dir><separator>magick_opencl_<device name>_<prog>_<signature>.bin
+
+  The path is formatted into a MaxTextExtent buffer and then copied into a
+  buffer obtained from AcquireMagickMemory().  The caller owns the result and
+  must release it with RelinquishMagickMemory().  NULL is returned when the
+  allocation fails.
+
+    o clEnv: environment whose device name goes into the file name.
+    o prog: program index, printed as two decimal digits.
+    o signature: printed as eight hexadecimal digits.
+*/
+static char* getBinaryCLProgramName(MagickCLEnv clEnv, MagickOpenCLProgram prog, unsigned int signature)
+{
+  char* name;
+  size_t nameSize;
+  char path[MaxTextExtent];
+  char deviceName[MaxTextExtent];
+  const char* prefix = "magick_opencl";
+
+  /* never format an uninitialized buffer if the device query fails */
+  deviceName[0] = '\0';
+  if (clGetDeviceInfo(clEnv->device, CL_DEVICE_NAME, MaxTextExtent, deviceName, NULL) != CL_SUCCESS)
+    deviceName[0] = '\0';
+
+  /* prog is passed as unsigned int, so it is printed with %u */
+  (void) FormatLocaleString(path,MaxTextExtent,"%s%s%s_%s_%02u_%08x.bin"
+         ,GetOpenCLCachedFilesDirectory()
+         ,DirectorySeparator,prefix,deviceName, (unsigned int)prog, signature);
+
+  nameSize = strlen(path)+1;
+  name = (char*)AcquireMagickMemory(nameSize);
+  if (name != NULL)
+    (void) CopyMagickString(name,path,nameSize);
+  return name;
+}
+
+/*
+  saveBinaryCLProgram() queries the binary of clEnv->programs[prog] and
+  writes it to the file named by getBinaryCLProgramName().
+
+  Returns MagickTrue only when the whole binary was written and the file was
+  closed without error.  A file that could not be written completely is
+  removed so that a truncated binary is not left behind.
+
+    o clEnv: environment holding the program.
+    o prog: index into clEnv->programs.
+    o signature: forwarded to getBinaryCLProgramName().
+    o exception: receives a ModuleFatalError when clGetProgramInfo() fails
+      and a warning for allocation or file failures.
+*/
+static MagickBooleanType saveBinaryCLProgram(MagickCLEnv clEnv, MagickOpenCLProgram prog, unsigned int signature, ExceptionInfo* exception)
+{
+  MagickBooleanType saveSuccessful;
+  cl_int clStatus;
+  size_t binaryProgramSize;
+  size_t bytesWritten;
+  int closeStatus;
+  unsigned char* binaryProgram;
+  char* binaryFileName;
+  FILE* fileHandle;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  binaryProgram = NULL;
+  binaryFileName = NULL;
+  fileHandle = NULL;
+  saveSuccessful = MagickFalse;
+
+  clStatus = clGetProgramInfo(clEnv->programs[prog], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binaryProgramSize, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clGetProgramInfo failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  binaryProgram = (unsigned char*) AcquireMagickMemory(binaryProgramSize);
+  if (binaryProgram == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "MemoryAllocationFailed", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* CL_PROGRAM_BINARIES takes an array of destination pointers; here one */
+  clStatus = clGetProgramInfo(clEnv->programs[prog], CL_PROGRAM_BINARIES, sizeof(char*), &binaryProgram, NULL);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "clGetProgramInfo failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  binaryFileName = getBinaryCLProgramName(clEnv, prog, signature);
+  if (binaryFileName != NULL)
+    fileHandle = fopen(binaryFileName, "wb");
+  if (fileHandle == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+      "Saving binary kernel failed.", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* close before judging the result so buffered write errors are seen too */
+  bytesWritten = fwrite(binaryProgram, sizeof(char), binaryProgramSize, fileHandle);
+  closeStatus = fclose(fileHandle);
+  fileHandle = NULL;
+  if (bytesWritten != binaryProgramSize || closeStatus != 0)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+      "Saving binary kernel failed.", "'%s'", ".");
+    (void) remove(binaryFileName);
+    goto cleanup;
+  }
+  saveSuccessful = MagickTrue;
+
+cleanup:
+  if (fileHandle != NULL)
+    fclose(fileHandle);
+  if (binaryProgram != NULL)
+    RelinquishMagickMemory(binaryProgram);
+  /* the name came from AcquireMagickMemory(), so it is not released with free() */
+  if (binaryFileName != NULL)
+    RelinquishMagickMemory(binaryFileName);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+  return saveSuccessful;
+}
+
+/*
+  loadBinaryCLProgram() tries to create clEnv->programs[prog] from the cached
+  binary file named by getBinaryCLProgramName().
+
+    o clEnv: OpenCL environment (context and device are used).
+    o prog: index of the program in clEnv->programs.
+    o signature: value passed on to getBinaryCLProgramName().
+    o exception: unused here; a missing or unreadable cache file is not an
+      error, the function simply returns MagickFalse.
+
+  Returns MagickTrue when the file was read completely and
+  clCreateProgramWithBinary() accepted it.
+*/
+static MagickBooleanType loadBinaryCLProgram(MagickCLEnv clEnv, MagickOpenCLProgram prog, unsigned int signature, ExceptionInfo* exception)
+{
+  MagickBooleanType loadSuccessful;
+  unsigned char* binaryProgram;
+  char* binaryFileName;
+  FILE* fileHandle;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  binaryProgram = NULL;
+  binaryFileName = NULL;
+  fileHandle = NULL;
+  loadSuccessful = MagickFalse;
+
+  binaryFileName = getBinaryCLProgramName(clEnv, prog, signature);
+  if (binaryFileName == NULL)
+    goto cleanup;
+
+  fileHandle = fopen(binaryFileName, "rb");
+  if (fileHandle != NULL)
+  {
+    long fileSize;
+    size_t length;
+    cl_int clStatus;
+    cl_int clBinaryStatus;
+
+    /* ftell() reports errors as -1L, so keep the result signed until it has
+       been validated. */
+    if (fseek(fileHandle, 0, SEEK_END) != 0)
+      goto cleanup;
+    fileSize = ftell(fileHandle);
+    if (fileSize <= 0)
+      goto cleanup;
+    if (fseek(fileHandle, 0, SEEK_SET) != 0)
+      goto cleanup;
+    length = (size_t) fileSize;
+
+    binaryProgram = (unsigned char*)AcquireMagickMemory(length);
+    if (binaryProgram == NULL)
+      goto cleanup;
+
+    /* A short read means the cache file is unusable. */
+    if (fread(binaryProgram, 1, length, fileHandle) != length)
+      goto cleanup;
+
+    clEnv->programs[prog] = clCreateProgramWithBinary(clEnv->context, 1, &clEnv->device, &length, (const unsigned char**)&binaryProgram, &clBinaryStatus, &clStatus);
+    if (clStatus != CL_SUCCESS
+        || clBinaryStatus != CL_SUCCESS)
+      goto cleanup;
+
+    loadSuccessful = MagickTrue;
+  }
+
+cleanup:
+  if (fileHandle != NULL)
+    fclose(fileHandle);
+  /* The name comes from AcquireMagickMemory(), so release it the same way. */
+  if (binaryFileName != NULL)
+    RelinquishMagickMemory(binaryFileName);
+  if (binaryProgram != NULL)
+    RelinquishMagickMemory(binaryProgram);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+  return loadSuccessful;
+}
+
+/*
+  stringSignature() computes a simple XOR checksum of a NUL-terminated string.
+
+  The result starts as the string length (truncated to unsigned int) and is
+  XORed with every sizeof(unsigned int)-byte group of the string, read in the
+  machine's native byte order; a trailing partial group is zero padded.
+
+  Groups are copied with memcpy() instead of being read through a casted
+  pointer, so the string needs no particular alignment.
+*/
+static unsigned int stringSignature(const char* string)
+{
+  unsigned int stringLength;
+  unsigned int n,i;
+  unsigned int signature;
+  unsigned int word;
+  unsigned int remainder;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  stringLength = (unsigned int) strlen(string);
+  signature = stringLength;
+  n = stringLength/(unsigned int) sizeof(unsigned int);
+  for (i = 0; i < n; i++)
+  {
+    memcpy(&word, string + (size_t) i * sizeof(unsigned int), sizeof(word));
+    signature^=word;
+  }
+
+  /* fold in the last, incomplete group padded with zero bytes */
+  remainder = stringLength - n * (unsigned int) sizeof(unsigned int);
+  if (remainder != 0)
+  {
+    word = 0;
+    memcpy(&word, string + (size_t) n * sizeof(unsigned int), remainder);
+    signature^=word;
+  }
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+  return signature;
+}
+
+/* OpenCL kernels for accelerate.c */
+extern const char *accelerateKernels, *accelerateKernels2;
+
+/*
+  CompileOpenCLKernels() creates and builds every program in clEnv->programs.
+
+  For each program a cached binary is tried first (unless the environment
+  variable MAGICK_OCL_REC is set); otherwise the program is created from
+  source.  After a successful build from source the binary is saved to the
+  cache.  When a build from source fails, the source and the build log are
+  dumped into the OpenCL cache directory.
+
+    o clEnv: OpenCL environment (context, device and programs are used).
+    o exception: receives any errors or warnings.
+
+  Returns MagickTrue when all programs were built.
+*/
+static MagickBooleanType CompileOpenCLKernels(MagickCLEnv clEnv, ExceptionInfo* exception) 
+{
+  MagickBooleanType status = MagickFalse;
+  cl_int clStatus;
+  unsigned int i;
+  char* accelerateKernelsBuffer = NULL;
+  size_t kernelsLength;
+  size_t kernels2Length;
+
+  /* The index of the program strings in this array has to match the value of the enum MagickOpenCLProgram */
+  const char* MagickOpenCLProgramStrings[MAGICK_OPENCL_NUM_PROGRAMS]; 
+
+  char options[MaxTextExtent];
+  unsigned int optionsSignature;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  /* Get additional options */
+  (void) FormatLocaleString(options, MaxTextExtent, CLOptions, (float)QuantumRange,
+    (float)QuantumScale, (float)CLCharQuantumScale, (float)MagickEpsilon, (float)MagickPI, (unsigned int)MaxMap, (unsigned int)MAGICKCORE_QUANTUM_DEPTH);
+
+  /*
+  if (getenv("MAGICK_OCL_DEF"))
+  {
+    strcat(options," ");
+    strcat(options,getenv("MAGICK_OCL_DEF"));
+  }
+  */
+
+  /*
+  if (getenv("MAGICK_OCL_BUILD"))
+    printf("options: %s\n", options);
+  */
+
+  /* the options are part of the cache key: other options, other binary */
+  optionsSignature = stringSignature(options);
+
+  /* get all the OpenCL program strings here */
+  kernelsLength = strlen(accelerateKernels);
+  kernels2Length = strlen(accelerateKernels2);
+  accelerateKernelsBuffer = (char*) AcquireMagickMemory(kernelsLength+kernels2Length+1);
+  if (accelerateKernelsBuffer == NULL)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+      "AcquireMagickMemory failed.", "'%s'", ".");
+    goto cleanup;
+  }
+  /* concatenate both halves; the second copy includes the terminating NUL */
+  memcpy(accelerateKernelsBuffer, accelerateKernels, kernelsLength);
+  memcpy(accelerateKernelsBuffer+kernelsLength, accelerateKernels2, kernels2Length+1);
+  MagickOpenCLProgramStrings[MAGICK_OPENCL_ACCELERATE] = accelerateKernelsBuffer;
+
+  for (i = 0; i < MAGICK_OPENCL_NUM_PROGRAMS; i++) 
+  {
+    MagickBooleanType loadSuccessful = MagickFalse;
+    unsigned int programSignature = stringSignature(MagickOpenCLProgramStrings[i]) ^ optionsSignature;
+
+    /* try to load the binary first */
+    if (!getenv("MAGICK_OCL_REC"))
+      loadSuccessful = loadBinaryCLProgram(clEnv, (MagickOpenCLProgram)i, programSignature, exception);
+
+    if (loadSuccessful == MagickFalse)
+    {
+      /* Binary CL program unavailable, compile the program from source */
+      size_t programLength = strlen(MagickOpenCLProgramStrings[i]);
+      clEnv->programs[i] = clCreateProgramWithSource(clEnv->context, 1, &(MagickOpenCLProgramStrings[i]), &programLength, &clStatus);
+      if (clStatus!=CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+          "clCreateProgramWithSource failed.", "(%d)", (int)clStatus);
+
+        goto cleanup;
+      }
+    }
+
+    clStatus = clBuildProgram(clEnv->programs[i], 1, &clEnv->device, options, NULL, NULL);
+    if (clStatus!=CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+        "clBuildProgram failed.", "(%d)", (int)clStatus);
+
+      if (loadSuccessful == MagickFalse)
+      {
+        char path[MaxTextExtent];
+        FILE* fileHandle;
+
+        /*  dump the source into a file */
+        (void) FormatLocaleString(path,MaxTextExtent,"%s%s%s"
+         ,GetOpenCLCachedFilesDirectory()
+         ,DirectorySeparator,"magick_badcl.cl");
+        fileHandle = fopen(path, "wb");
+        if (fileHandle != NULL)
+        {
+          fwrite(MagickOpenCLProgramStrings[i], sizeof(char), strlen(MagickOpenCLProgramStrings[i]), fileHandle);
+          fclose(fileHandle);
+        }
+
+        /* dump the build log */
+        {
+          char* log;
+          size_t logSize;
+
+          /* The log is optional diagnostics: if its size or content cannot be
+             obtained, only the build options are written. */
+          log = NULL;
+          logSize = 0;
+          if (clGetProgramBuildInfo(clEnv->programs[i], clEnv->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize) != CL_SUCCESS)
+            logSize = 0;
+          if (logSize > 0)
+            log = (char*)AcquireMagickMemory(logSize);
+          if (log != NULL
+              && clGetProgramBuildInfo(clEnv->programs[i], clEnv->device, CL_PROGRAM_BUILD_LOG, logSize, log, &logSize) != CL_SUCCESS)
+            logSize = 0;
+
+          (void) FormatLocaleString(path,MaxTextExtent,"%s%s%s"
+           ,GetOpenCLCachedFilesDirectory()
+           ,DirectorySeparator,"magick_badcl_build.log");
+          fileHandle = fopen(path, "wb");
+          if (fileHandle != NULL)
+          {
+            const char* buildOptionsTitle = "build options: ";
+            fwrite(buildOptionsTitle, sizeof(char), strlen(buildOptionsTitle), fileHandle);
+            fwrite(options, sizeof(char), strlen(options), fileHandle);
+            fwrite("\n",sizeof(char), 1, fileHandle);
+            if (log != NULL && logSize > 0)
+              fwrite(log, sizeof(char), logSize, fileHandle);
+            fclose(fileHandle);
+          }
+          if (log != NULL)
+            RelinquishMagickMemory(log);
+        }
+      }
+      goto cleanup;
+    }
+
+    if (loadSuccessful == MagickFalse)
+    {
+      /* Save the binary to a file to avoid re-compilation of the kernels in the future */
+      saveBinaryCLProgram(clEnv, (MagickOpenCLProgram)i, programSignature, exception);
+    }
+
+  }
+  status = MagickTrue;
+
+cleanup:
+
+  if (accelerateKernelsBuffer!=NULL) RelinquishMagickMemory(accelerateKernelsBuffer);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+  return status;
+}
+
+/*
+  InitOpenCLPlatformDevice() selects the OpenCL platform and device stored in
+  clEnv->platform and clEnv->device.
+
+  The environment variable MAGICK_OCL_DEVICE may force the device type
+  ("CPU", "GPU") or disable OpenCL ("OFF").  A device already set in clEnv is
+  kept and only its platform is looked up; a platform already set restricts
+  the search to that platform.  With CL_DEVICE_TYPE_ALL, GPUs are searched on
+  every platform first and CPUs second.
+
+    o clEnv: OpenCL environment to fill in.
+    o exception: receives any errors or warnings.
+
+  Returns MagickTrue when both a platform and a device are set on exit.
+*/
+static MagickBooleanType InitOpenCLPlatformDevice(MagickCLEnv clEnv, ExceptionInfo* exception) {
+  int j;
+  cl_uint i;
+  cl_int status;
+  cl_uint numPlatforms = 0;
+  cl_platform_id *platforms = NULL;
+  char* MAGICK_OCL_DEVICE = NULL;
+  MagickBooleanType OpenCLAvailable = MagickFalse;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  /* check if there's an environment variable overriding the device selection */
+  MAGICK_OCL_DEVICE = getenv("MAGICK_OCL_DEVICE");
+  if (MAGICK_OCL_DEVICE != NULL)
+  {
+    if (strcmp(MAGICK_OCL_DEVICE, "CPU") == 0)
+    {
+      clEnv->deviceType = CL_DEVICE_TYPE_CPU;
+    }
+    else if (strcmp(MAGICK_OCL_DEVICE, "GPU") == 0)
+    {
+      clEnv->deviceType = CL_DEVICE_TYPE_GPU;
+    }
+    else if (strcmp(MAGICK_OCL_DEVICE, "OFF") == 0)
+    {
+      /* OpenCL disabled */
+      goto cleanup;
+    }
+  }
+  else if (clEnv->deviceType == 0) {
+    clEnv->deviceType = CL_DEVICE_TYPE_ALL;
+  }
+
+  if (clEnv->device != NULL)
+  {
+    /* the device is already chosen, only its platform is needed */
+    status = clGetDeviceInfo(clEnv->device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &clEnv->platform, NULL);
+    if (status != CL_SUCCESS) {
+      (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+          "Failed to get OpenCL platform from the selected device.", "(%d)", status);
+    }
+    goto cleanup;
+  }
+  else if (clEnv->platform != NULL)
+  {
+    /* search the preselected platform only */
+    numPlatforms = 1;
+    platforms = (cl_platform_id *) AcquireMagickMemory(numPlatforms * sizeof(cl_platform_id));
+    if (platforms == (cl_platform_id *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitError,
+        "AcquireMagickMemory failed.",".");
+      goto cleanup;
+    }
+    platforms[0] = clEnv->platform;
+  }
+  else
+  {
+    clEnv->device = NULL;
+
+    /* Get the number of OpenCL platforms available */
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if (status != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning, 
+        "clGetplatformIDs failed.", "(%d)", status);
+      goto cleanup;
+    }
+
+    /* No OpenCL available, just leave */
+    if (numPlatforms == 0) {
+      goto cleanup;
+    }
+
+    platforms = (cl_platform_id *) AcquireMagickMemory(numPlatforms * sizeof(cl_platform_id));
+    if (platforms == (cl_platform_id *) NULL)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), ResourceLimitError,
+        "AcquireMagickMemory failed.",".");
+      goto cleanup;
+    }
+
+    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (status != CL_SUCCESS)
+    {
+      (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+        "clGetPlatformIDs failed.", "(%d)", status);
+      goto cleanup;
+    }
+  }
+
+  /* Device selection */
+  clEnv->device = NULL;
+  for (j = 0; j < 2; j++) 
+  {
+
+    cl_device_type deviceType;
+    if (clEnv->deviceType == CL_DEVICE_TYPE_ALL)
+    {
+      /* first pass looks for a GPU, second pass falls back to a CPU */
+      if (j == 0)
+        deviceType = CL_DEVICE_TYPE_GPU;
+      else
+        deviceType = CL_DEVICE_TYPE_CPU;
+    }
+    else if (j == 1)
+    {
+      break;
+    }
+    else
+      deviceType = clEnv->deviceType;
+
+    for (i = 0; i < numPlatforms; i++)
+    {
+      cl_uint numDevices;
+      status = clGetDeviceIDs(platforms[i], deviceType, 1, &(clEnv->device), &numDevices);
+      /* A platform without a device of this type is not an error: keep
+         looking on the other platforms / the other device type. */
+      if (status == CL_DEVICE_NOT_FOUND)
+        continue;
+      if (status != CL_SUCCESS)
+      {
+        (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+          "clGetDeviceIDs failed.", "(%d)", status);
+        goto cleanup;
+      }
+      if (clEnv->device != NULL)
+      {
+        clEnv->platform = platforms[i];
+        goto cleanup;
+      }
+    }
+  }
+
+cleanup:
+  if (platforms!=NULL)
+    RelinquishMagickMemory(platforms);
+
+  OpenCLAvailable = (clEnv->platform!=NULL
+          && clEnv->device!=NULL)?MagickTrue:MagickFalse;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+  return OpenCLAvailable;
+}
+
+/*
+  EnableOpenCLInternal() enables OpenCL on an environment that has been
+  initialized and has both a platform and a device; otherwise it marks the
+  environment as disabled.  Returns MagickTrue when OpenCL was enabled.
+*/
+static MagickBooleanType EnableOpenCLInternal(MagickCLEnv clEnv) {
+  MagickBooleanType usable = MagickFalse;
+
+  if (clEnv->OpenCLInitialized == MagickTrue) {
+    if (clEnv->platform != NULL && clEnv->device != NULL)
+      usable = MagickTrue;
+  }
+
+  /* OpenCLDisabled is always the opposite of the returned value */
+  if (usable == MagickTrue)
+    clEnv->OpenCLDisabled = MagickFalse;
+  else
+    clEnv->OpenCLDisabled = MagickTrue;
+  return usable;
+}
+
+
+static MagickBooleanType autoSelectDevice(MagickCLEnv clEnv, ExceptionInfo* exception);  /* forward declaration; called by InitOpenCLEnv() below */
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   I n i t O p e n C L E n v                                                 %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  InitOpenCLEnv() initializes the OpenCL environment.
+%
+%  The format of the InitOpenCLEnv method is:
+%
+%      MagickBooleanType InitOpenCLEnv(MagickCLEnv clEnv, ExceptionInfo* exception)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: OpenCL environment structure
+%
+%    o exception: return any errors or warnings.
+%
+*/
+
+MagickExport
+MagickBooleanType InitOpenCLEnvInternal(MagickCLEnv clEnv, ExceptionInfo* exception) {
+  /*
+    Performs the actual initialization: selects platform and device, creates
+    the context and compiles the kernels.  The environment is marked as
+    initialized in every case.  Returns MagickTrue without doing anything
+    else when OpenCL is already disabled on entry; otherwise OpenCL stays
+    disabled until every step has succeeded.
+  */
+  MagickBooleanType status = MagickTrue;
+  cl_int clStatus;
+  cl_context_properties cps[3];
+
+
+  clEnv->OpenCLInitialized = MagickTrue;
+  if (clEnv->OpenCLDisabled == MagickTrue)
+    goto cleanup;
+
+  /* stay disabled until the whole setup has succeeded */
+  clEnv->OpenCLDisabled = MagickTrue;
+  /* setup the OpenCL platform and device */
+  status = InitOpenCLPlatformDevice(clEnv, exception);
+  if (status == MagickFalse) {
+    /* No OpenCL device available */
+    goto cleanup;
+  }
+
+  /* create an OpenCL context */
+  cps[0] = CL_CONTEXT_PLATFORM;
+  cps[1] = (cl_context_properties)clEnv->platform;
+  cps[2] = 0;
+  clEnv->context = clCreateContext(cps, 1, &(clEnv->device), NULL, NULL, &clStatus);
+  if (clStatus != CL_SUCCESS)
+  {
+    (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+        "clCreateContext failed.", "(%d)", clStatus);
+    status = MagickFalse;
+    goto cleanup;
+  }
+
+  status = CompileOpenCLKernels(clEnv, exception);
+  if (status == MagickFalse) {
+    /* report the step that actually failed (no command queue is created here) */
+    (void) ThrowMagickException(exception, GetMagickModule(), DelegateWarning,
+        "CompileOpenCLKernels failed.", "(%d)", (int) status);
+    goto cleanup;
+  }
+
+  status = EnableOpenCLInternal(clEnv);
+cleanup:
+  return status;
+}
+
+
+MagickExport
+MagickBooleanType InitOpenCLEnv(MagickCLEnv clEnv, ExceptionInfo* exception) {
+  /*
+    Initializes clEnv once, under its lock.  Returns MagickFalse for a NULL
+    environment and also when the environment was already initialized.
+  */
+  MagickBooleanType status;
+
+  if (clEnv == NULL)
+    return MagickFalse;
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD(__FUNCTION__,"");
+#endif
+
+  status = MagickFalse;
+  LockSemaphoreInfo(clEnv->lock);
+  if (clEnv->OpenCLInitialized == MagickFalse) {
+    /* a preset device or a disabled environment skips the auto selection */
+    if (clEnv->device != NULL
+        || clEnv->OpenCLDisabled != MagickFalse)
+      status = InitOpenCLEnvInternal(clEnv, exception);
+    else
+      status = autoSelectDevice(clEnv, exception);
+  }
+  UnlockSemaphoreInfo(clEnv->lock);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+  return status;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   A c q u i r e O p e n C L C o m m a n d Q u e u e                         %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  AcquireOpenCLCommandQueue() acquires an OpenCL command queue
+%
+%  The format of the AcquireOpenCLCommandQueue method is:
+%
+%      cl_command_queue AcquireOpenCLCommandQueue(MagickCLEnv clEnv)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%
+*/
+
+MagickExport
+cl_command_queue AcquireOpenCLCommandQueue(MagickCLEnv clEnv)
+{
+  /* without an environment there is no context/device to create a queue on */
+  if (clEnv == NULL)
+    return NULL;
+
+  /* no queue properties (0); the error code is not requested (NULL) */
+  return clCreateCommandQueue(clEnv->context, clEnv->device, 0, NULL);
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   R e l i n q u i s h O p e n C L C o m m a n d Q u e u e                   %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  RelinquishOpenCLCommandQueue() releases the OpenCL command queue
+%
+%  The format of the RelinquishOpenCLCommandQueue method is:
+%
+%      MagickBooleanType RelinquishOpenCLCommandQueue(MagickCLEnv clEnv,
+%        cl_command_queue queue)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%
+%    o queue: the OpenCL queue to be released.
+%
+%
+*/
+MagickExport
+MagickBooleanType RelinquishOpenCLCommandQueue(MagickCLEnv clEnv, cl_command_queue queue)
+{
+  cl_int releaseStatus;
+
+  /* the queue is only released when an environment is supplied */
+  if (clEnv == NULL)
+    return MagickFalse;
+
+  releaseStatus = clReleaseCommandQueue(queue);
+  if (releaseStatus != CL_SUCCESS)
+    return MagickFalse;
+  return MagickTrue;
+}
+
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   A c q u i r e O p e n C L K e r n e l                                     %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  AcquireOpenCLKernel() acquires an OpenCL kernel
+%
+%  The format of the AcquireOpenCLKernel method is:
+%
+%      cl_kernel AcquireOpenCLKernel(MagickCLEnv clEnv, 
+%        MagickOpenCLProgram program, const char* kernelName)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%
+%    o program: the OpenCL program module that the kernel belongs to.
+%
+%    o kernelName:  the name of the kernel
+%
+*/
+
+MagickExport
+  cl_kernel AcquireOpenCLKernel(MagickCLEnv clEnv, MagickOpenCLProgram program, const char* kernelName)
+{
+  cl_int clStatus;
+
+  /* both the environment and a kernel name are required */
+  if (clEnv == NULL || kernelName == NULL)
+    return NULL;
+
+  /* the status is collected but not inspected; the kernel is returned as is */
+  return clCreateKernel(clEnv->programs[program], kernelName, &clStatus);
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   R e l i n q u i s h O p e n C L K e r n e l                               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  RelinquishOpenCLKernel() releases an OpenCL kernel
+%
+%  The format of the RelinquishOpenCLKernel method is:
+%
+%    MagickBooleanType RelinquishOpenCLKernel(MagickCLEnv clEnv,
+%      cl_kernel kernel)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%
+%    o kernel: the OpenCL kernel object to be released.
+%
+%
+*/
+
+MagickExport
+  MagickBooleanType RelinquishOpenCLKernel(MagickCLEnv clEnv, cl_kernel kernel)
+{
+  /* nothing to release without an environment or a kernel */
+  if (clEnv == NULL || kernel == NULL)
+    return MagickFalse;
+
+  if (clReleaseKernel(kernel) != CL_SUCCESS)
+    return MagickFalse;
+  return MagickTrue;
+}
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   G e t O p e n C L D e v i c e L o c a l M e m o r y S i z e               %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  GetOpenCLDeviceLocalMemorySize() returns local memory size of the device
+%
+%  The format of the GetOpenCLDeviceLocalMemorySize method is:
+%
+%    unsigned long GetOpenCLDeviceLocalMemorySize(MagickCLEnv clEnv)
+%
+%  A description of each parameter follows:
+%
+%    o clEnv: the OpenCL environment.
+%
+%
+*/
+
+MagickExport
+ unsigned long GetOpenCLDeviceLocalMemorySize(MagickCLEnv clEnv)
+{
+  cl_ulong localMemorySize = 0;
+
+  /* report 0 instead of an indeterminate value when the size is unavailable */
+  if (clEnv == NULL)
+    return 0;
+  if (clGetDeviceInfo(clEnv->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, NULL) != CL_SUCCESS)
+    return 0;
+  return (unsigned long)localMemorySize;
+}
+
+/*
+  GetOpenCLDeviceMaxMemAllocSize() returns the CL_DEVICE_MAX_MEM_ALLOC_SIZE
+  value of the environment's device, or 0 when clEnv is NULL or the query
+  fails.
+*/
+MagickExport
+  unsigned long GetOpenCLDeviceMaxMemAllocSize(MagickCLEnv clEnv)
+{
+  cl_ulong maxMemAllocSize = 0;
+
+  if (clEnv == NULL)
+    return 0;
+  if (clGetDeviceInfo(clEnv->device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAllocSize, NULL) != CL_SUCCESS)
+    return 0;
+  return (unsigned long)maxMemAllocSize;
+}
+
+
+/*
+ Beginning of the OpenCL device selection infrastructure
+*/
+
+
+#define DS_DEVICE_NAME_LENGTH 256   /* buffer size used when reading device name and driver version strings */
+/* status codes returned by the device selection (ds_*) helpers */
+typedef enum {
+  DS_SUCCESS = 0
+ ,DS_INVALID_PROFILE = 1000          /* a required profile pointer was NULL */
+ ,DS_MEMORY_ERROR                    /* malloc() failed */
+ ,DS_INVALID_PERF_EVALUATOR_TYPE     /* ds_evaluation_type value not handled by profileDevices() */
+ ,DS_INVALID_PERF_EVALUATOR          /* evaluator callback was NULL */
+ ,DS_PERF_EVALUATOR_ERROR
+ ,DS_FILE_ERROR                      /* the profile file could not be opened */
+ ,DS_UNKNOWN_DEVICE_TYPE             /* device type is not one of ds_device_type */
+ ,DS_PROFILE_FILE_ERROR
+ ,DS_SCORE_SERIALIZER_ERROR
+ ,DS_SCORE_DESERIALIZER_ERROR
+} ds_status;
+
+/* device type: the host CPU running native code, or a device enumerated through OpenCL */
+typedef enum {
+  DS_DEVICE_NATIVE_CPU = 0
+ ,DS_DEVICE_OPENCL_DEVICE 
+} ds_device_type;
+
+/* one profiled device; the ocl* fields are only filled in for DS_DEVICE_OPENCL_DEVICE entries */
+typedef struct {
+  ds_device_type  type;
+  cl_device_id    oclDeviceID;
+  char*           oclDeviceName;          /* malloc'ed copy of CL_DEVICE_NAME, released with free() */
+  char*           oclDriverVersion;       /* malloc'ed copy of CL_DRIVER_VERSION, released with free() */
+  cl_uint         oclMaxClockFrequency;   /* CL_DEVICE_MAX_CLOCK_FREQUENCY */
+  cl_uint         oclMaxComputeUnits;     /* CL_DEVICE_MAX_COMPUTE_UNITS */
+  void*           score;            /* a pointer to the score data, the content/format is application defined */
+} ds_device;
+/* a set of devices; initDSProfile() stores the native CPU entry after the OpenCL devices */
+typedef struct {
+  unsigned int  numDevices;   /* number of entries in devices */
+  ds_device*    devices;      /* malloc'ed array */
+  const char*   version;      /* stored as passed to initDSProfile(), not copied */
+} ds_profile;
+
+/* deallocate memory used by score; expected to return DS_SUCCESS on success */
+typedef ds_status (*ds_score_release)(void* score);
+
+/*
+  releaseDeviceResource() frees the strings owned by a device entry and
+  releases its score through the callback sr.
+
+    o device: entry to clean up; NULL is accepted and ignored.
+    o sr: score release callback; when it is NULL the score is left untouched.
+
+  Returns the status of sr, or DS_SUCCESS when sr was not called.  Released
+  pointers are reset to NULL so a second call is harmless.
+*/
+static ds_status releaseDeviceResource(ds_device* device, ds_score_release sr) {
+  ds_status status = DS_SUCCESS;
+  if (device) {
+    free(device->oclDeviceName);
+    device->oclDeviceName = NULL;
+    free(device->oclDriverVersion);
+    device->oclDriverVersion = NULL;
+    if (device->score && sr) {
+      status = sr(device->score);
+      /* keep the pointer if the callback reported a failure */
+      if (status == DS_SUCCESS)
+        device->score = NULL;
+    }
+  }
+  return status;
+}
+
+/*
+  releaseDSProfile() releases a profile created by initDSProfile().
+
+  The device array is only released when a score release callback sr is
+  supplied; releasing the per-device resources stops at the first device for
+  which releaseDeviceResource() fails, and that status is returned.  The
+  profile structure itself is always freed.  A NULL profile is ignored.
+*/
+static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
+  ds_status result = DS_SUCCESS;
+  unsigned int index = 0;
+
+  if (profile == NULL)
+    return result;
+
+  if (sr != NULL && profile->devices != NULL) {
+    while (index < profile->numDevices) {
+      result = releaseDeviceResource(&profile->devices[index], sr);
+      if (result != DS_SUCCESS)
+        break;
+      index++;
+    }
+    free(profile->devices);
+  }
+  free(profile);
+  return result;
+}
+
+
+/*
+  dsQueryDeviceString() returns a malloc'ed copy of a string-valued device
+  property, an empty string when the query fails, or NULL when out of memory.
+*/
+static char* dsQueryDeviceString(cl_device_id device, cl_device_info param) {
+  char buffer[DS_DEVICE_NAME_LENGTH];
+  size_t length;
+  char* copy;
+
+  memset(buffer, 0, sizeof(buffer));
+  if (clGetDeviceInfo(device, param, sizeof(buffer), buffer, NULL) != CL_SUCCESS)
+    buffer[0] = '\0';
+  buffer[sizeof(buffer)-1] = '\0';
+  length = strlen(buffer);
+  copy = (char*)malloc(length+1);
+  if (copy != NULL)
+    memcpy(copy, buffer, length+1);
+  return copy;
+}
+
+/*
+  initDSProfile() allocates a profile listing the GPU and CPU devices of every
+  OpenCL platform, followed by one entry for the native CPU.
+
+    o p: receives the new profile on success; must not be NULL.
+    o version: version string; the pointer is stored, the string is not copied.
+
+  Returns DS_SUCCESS, DS_INVALID_PROFILE when p is NULL, or DS_MEMORY_ERROR;
+  on failure nothing is stored in *p and all partial allocations are freed.
+  A failing OpenCL enumeration call is treated as "no devices found".
+*/
+static ds_status initDSProfile(ds_profile** p, const char* version) {
+  int numDevices = 0;
+  cl_uint numPlatforms = 0;
+  cl_platform_id* platforms = NULL;
+  cl_device_id*   devices = NULL;
+  ds_status status = DS_SUCCESS;
+  ds_profile* profile = NULL;
+  unsigned int capacity = 0;
+  unsigned int next = 0;
+  unsigned int i;
+
+  if (p == NULL)
+    return DS_INVALID_PROFILE;
+
+  profile = (ds_profile*)malloc(sizeof(ds_profile));
+  if (profile == NULL)
+    return DS_MEMORY_ERROR;
+  
+  memset(profile, 0, sizeof(ds_profile));
+
+  if (clGetPlatformIDs(0, NULL, &numPlatforms) != CL_SUCCESS)
+    numPlatforms = 0;
+  if (numPlatforms > 0) {
+    platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
+    if (platforms == NULL) {
+      status = DS_MEMORY_ERROR;
+      goto cleanup;
+    }
+    if (clGetPlatformIDs(numPlatforms, platforms, NULL) != CL_SUCCESS)
+      numPlatforms = 0;
+    for (i = 0; i < (unsigned int)numPlatforms; i++) {
+      /* num is only meaningful when the call succeeds */
+      cl_uint num = 0;
+      if (clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num) == CL_SUCCESS)
+        numDevices+=(int)num;
+    }
+  }
+
+  capacity = (unsigned int)numDevices+1;     /* +1 to numDevices to include the native CPU */
+  profile->numDevices = capacity;
+
+  profile->devices = (ds_device*)malloc(capacity*sizeof(ds_device));
+  if (profile->devices == NULL) {
+    profile->numDevices = 0;
+    status = DS_MEMORY_ERROR;
+    goto cleanup;
+  }
+  memset(profile->devices, 0, capacity*sizeof(ds_device));
+
+  if (numDevices > 0) {
+    devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
+    if (devices == NULL) {
+      status = DS_MEMORY_ERROR;
+      goto cleanup;
+    }
+    for (i = 0; i < (unsigned int)numPlatforms; i++) {
+      int d;
+      /* GPUs of a platform are listed before its CPUs */
+      for (d = 0; d < 2; d++) { 
+        unsigned int j;
+        cl_uint num = 0;
+        cl_device_type deviceType = (d == 0) ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
+
+        /* e.g. CL_DEVICE_NOT_FOUND: this platform has no device of this type */
+        if (clGetDeviceIDs(platforms[i], deviceType, (cl_uint)numDevices, devices, &num) != CL_SUCCESS)
+          continue;
+        if (num > (cl_uint)numDevices)
+          num = (cl_uint)numDevices;
+
+        /* never write past the entries reserved for OpenCL devices */
+        for (j = 0; j < num && next < (unsigned int)numDevices; j++, next++) {
+          ds_device* entry = profile->devices+next;
+
+          entry->type = DS_DEVICE_OPENCL_DEVICE;
+          entry->oclDeviceID = devices[j];
+
+          entry->oclDeviceName = dsQueryDeviceString(devices[j], CL_DEVICE_NAME);
+          entry->oclDriverVersion = dsQueryDeviceString(devices[j], CL_DRIVER_VERSION);
+          if (entry->oclDeviceName == NULL || entry->oclDriverVersion == NULL) {
+            status = DS_MEMORY_ERROR;
+            goto cleanup;
+          }
+
+          clGetDeviceInfo(entry->oclDeviceID, CL_DEVICE_MAX_CLOCK_FREQUENCY
+            , sizeof(cl_uint), &entry->oclMaxClockFrequency, NULL);
+
+          clGetDeviceInfo(entry->oclDeviceID, CL_DEVICE_MAX_COMPUTE_UNITS
+            , sizeof(cl_uint), &entry->oclMaxComputeUnits, NULL);
+        }
+      }
+    }
+  }
+
+  /* Devices that are neither GPU nor CPU were counted but not listed; shrink
+     the count so no unused zeroed entries remain after the native CPU. */
+  profile->numDevices = next+1;
+  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
+  profile->version = version;
+
+cleanup:
+  if (platforms)  free(platforms);
+  if (devices)    free(devices);
+  if (status == DS_SUCCESS) {
+    *p = profile;
+  }
+  else {
+    if (profile) {
+      if (profile->devices) {
+        /* unused entries are zeroed, so free() sees NULL for them */
+        for (i = 0; i < capacity; i++) {
+          free(profile->devices[i].oclDeviceName);
+          free(profile->devices[i].oclDriverVersion);
+        }
+        free(profile->devices);
+      }
+      free(profile);
+    }
+  }
+  return status;
+}
+
+/* Pointer to a function that calculates the score of a device (e.g. stores
+ it in device->score). The encoding and the format of the score data are
+ implementation defined. `data` is the evaluatorData value that profileDevices() passes through. The function should return DS_SUCCESS if there's no error to be reported; profileDevices() stops at the first other status.
+ */
+typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data);
+
+typedef enum {
+  DS_EVALUATE_ALL            /* evaluate every device in the profile */
+  ,DS_EVALUATE_NEW_ONLY      /* evaluate only devices whose score is still NULL */
+} ds_evaluation_type;
+
+/*
+  profileDevices() runs the evaluator callback on the devices of a profile.
+
+    o profile: profile to evaluate; NULL yields DS_INVALID_PROFILE.
+    o type: DS_EVALUATE_ALL evaluates every device, DS_EVALUATE_NEW_ONLY skips
+      devices that already have a score.
+    o evaluator: callback; NULL yields DS_INVALID_PERF_EVALUATOR.
+    o evaluatorData: passed unchanged to the callback.
+    o numUpdates: optional; receives the number of evaluated devices when the
+      whole profile was processed.
+
+  The first failing evaluation, or an unknown type met while iterating, ends
+  the function immediately with the corresponding status.
+*/
+static ds_status profileDevices(ds_profile* profile, const ds_evaluation_type type
+                         ,ds_perf_evaluator evaluator, void* evaluatorData, unsigned int* numUpdates) {
+  unsigned int evaluated = 0;
+  unsigned int index;
+
+  if (profile == NULL)
+    return DS_INVALID_PROFILE;
+  if (evaluator == NULL)
+    return DS_INVALID_PERF_EVALUATOR;
+
+  for (index = 0; index < profile->numDevices; index++) {
+    ds_device* device = profile->devices+index;
+    ds_status evaluatorStatus;
+
+    /* the type is validated per device, so an empty profile never fails here */
+    if (type != DS_EVALUATE_ALL && type != DS_EVALUATE_NEW_ONLY)
+      return DS_INVALID_PERF_EVALUATOR_TYPE;
+
+    /* already scored devices are kept as they are in "new only" mode */
+    if (type == DS_EVALUATE_NEW_ONLY && device->score != NULL)
+      continue;
+
+    evaluatorStatus = evaluator(device, evaluatorData);
+    if (evaluatorStatus != DS_SUCCESS)
+      return evaluatorStatus;
+    evaluated++;
+  }
+
+  if (numUpdates)
+    *numUpdates = evaluated;
+  return DS_SUCCESS;
+}
+
+
+#define DS_TAG_VERSION                      "<version>"   /* opening/closing tag pairs that writeProfileToFile() puts around each field */
+#define DS_TAG_VERSION_END                  "</version>"
+#define DS_TAG_DEVICE                       "<device>"
+#define DS_TAG_DEVICE_END                   "</device>"
+#define DS_TAG_SCORE                        "<score>"
+#define DS_TAG_SCORE_END                    "</score>"
+#define DS_TAG_DEVICE_TYPE                  "<type>"
+#define DS_TAG_DEVICE_TYPE_END              "</type>"
+#define DS_TAG_DEVICE_NAME                  "<name>"
+#define DS_TAG_DEVICE_NAME_END              "</name>"
+#define DS_TAG_DEVICE_DRIVER_VERSION        "<driver>"
+#define DS_TAG_DEVICE_DRIVER_VERSION_END    "</driver>"
+#define DS_TAG_DEVICE_MAX_COMPUTE_UNITS     "<max cu>"
+#define DS_TAG_DEVICE_MAX_COMPUTE_UNITS_END "</max cu>"
+#define DS_TAG_DEVICE_MAX_CLOCK_FREQ        "<max clock>"
+#define DS_TAG_DEVICE_MAX_CLOCK_FREQ_END    "</max clock>"
+
+#define DS_DEVICE_NATIVE_CPU_STRING  "native_cpu"
+
+/* Callback used by writeProfileToFile() to serialize a device's score: it
+   receives the device and returns a buffer and its size through the two
+   out-parameters. NOTE(review): buffer ownership is not visible here - confirm. */
+typedef ds_status (*ds_score_serializer)(ds_device* device, void** serializedScore, unsigned int* serializedScoreSize);
+static ds_status writeProfileToFile(ds_profile* profile, ds_score_serializer serializer, const char* file) {
+  ds_status status = DS_SUCCESS;
+  FILE* profileFile = NULL;
+
+
+  if (profile == NULL)
+    return DS_INVALID_PROFILE;
+
+  profileFile = fopen(file, "wb");
+  if (profileFile==NULL) {
+    status = DS_FILE_ERROR;
+  }
+  else {
+    unsigned int i;
+
+    /* write version string */
+    fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
+    fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
+    fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
+    fwrite("\n", sizeof(char), 1, profileFile);
+
+    for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
+      void* serializedScore;
+      unsigned int serializedScoreSize;
+
+      fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
+
+      fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
+      fwrite(&profile->devices[i].type,sizeof(ds_device_type),1, profileFile);
+      fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
+
+      switch(profile->devices[i].type) {
+      case DS_DEVICE_NATIVE_CPU:
+        { 
+          /* There's no need to emit a device name for the native CPU device. */
+          /*
+          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
+          fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
+          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
+          */
+        }
+        break;
+      case DS_DEVICE_OPENCL_DEVICE: 
+        {
+          char tmp[16];
+
+          fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
+          fwrite(profile->devices[i].oclDeviceName,sizeof(char),strlen(profile->devices[i].oclDeviceName), profileFile);
+          fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
+
+          fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
+          fwrite(profile->devices[i].oclDriverVersion,sizeof(char),strlen(profile->devices[i].oclDriverVersion), profileFile);
+          fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
+
+          fwrite(DS_TAG_DEVICE_MAX_COMPUTE_UNITS, sizeof(char), strlen(DS_TAG_DEVICE_MAX_COMPUTE_UNITS), profileFile);
+          sprintf(tmp,"%d",profile->devices[i].oclMaxComputeUnits);
+          fwrite(tmp,sizeof(char),strlen(tmp), profileFile);
+          fwrite(DS_TAG_DEVICE_MAX_COMPUTE_UNITS_END, sizeof(char), strlen(DS_TAG_DEVICE_MAX_COMPUTE_UNITS_END), profileFile);
+
+          fwrite(DS_TAG_DEVICE_MAX_CLOCK_FREQ, sizeof(char), strlen(DS_TAG_DEVICE_MAX_CLOCK_FREQ), profileFile);
+          sprintf(tmp,"%d",profile->devices[i].oclMaxClockFrequency);
+          fwrite(tmp,sizeof(char),strlen(tmp), profileFile);
+          fwrite(DS_TAG_DEVICE_MAX_CLOCK_FREQ_END, sizeof(char), strlen(DS_TAG_DEVICE_MAX_CLOCK_FREQ_END), profileFile);
+        }
+        break;
+      default:
+        status = DS_UNKNOWN_DEVICE_TYPE;
+        break;
+      };
+
+      fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
+      status = serializer(profile->devices+i, &serializedScore, &serializedScoreSize);
+      if (status == DS_SUCCESS && serializedScore!=NULL && serializedScoreSize > 0) {
+        fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
+        free(serializedScore);
+      }
+      fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
+      fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
+      fwrite("\n",sizeof(char),1,profileFile);
+    }
+    fclose(profileFile);
+  }
+  return status;
+}
+
+
+/*
+  readProFile() reads the whole of `fileName` into a buffer obtained from
+  malloc().  On success *content points to the buffer (not NUL-terminated),
+  *contentSize holds its length and the caller must free() the buffer.  On
+  any failure DS_FILE_ERROR is returned, *content is NULL and *contentSize
+  is 0.
+*/
+static ds_status readProFile(const char* fileName, char** content, size_t* contentSize) {
+  ds_status status = DS_SUCCESS;
+  FILE * input = NULL;
+  long fileLength;
+  size_t size = 0;
+  size_t rsize = 0;
+  char* binary = NULL;
+
+  *contentSize = 0;
+  *content = NULL;
+
+  input = fopen(fileName, "rb");
+  if(input == NULL) {
+    return DS_FILE_ERROR;
+  }
+
+  /* determine the file length; ftell() returns -1 on failure, which must
+     not be converted into a huge size_t */
+  if (fseek(input, 0L, SEEK_END) != 0) {
+    status = DS_FILE_ERROR;
+    goto cleanup;
+  }
+  fileLength = ftell(input);
+  if (fileLength < 0 || fseek(input, 0L, SEEK_SET) != 0) {
+    status = DS_FILE_ERROR;
+    goto cleanup;
+  }
+  size = (size_t) fileLength;
+
+  /* malloc(0) may return NULL, so always request at least one byte */
+  binary = (char*)malloc(size > 0 ? size : 1);
+  if(binary == NULL) {
+    status = DS_FILE_ERROR;
+    goto cleanup;
+  }
+  rsize = fread(binary, sizeof(char), size, input);
+  if (rsize!=size
+      || ferror(input)) {
+    status = DS_FILE_ERROR;
+    goto cleanup;
+  }
+  *contentSize = size;
+  *content = binary;
+
+cleanup:
+  fclose(input);
+  if (status != DS_SUCCESS) {
+      free(binary);
+      *content = NULL;
+      *contentSize = 0;
+  }
+  return status;
+}
+
+
+/*
+  findString() returns a pointer to the first occurrence of the
+  NUL-terminated `string` inside the byte range [contentStart, contentEnd),
+  or NULL when there is none.  The range may contain arbitrary bytes and
+  need not be NUL-terminated.  A match that ends exactly at contentEnd is
+  accepted.
+*/
+static const char* findString(const char* contentStart, const char* contentEnd, const char* string) {
+  size_t stringLength;
+  const char* currentPosition;
+
+  stringLength = strlen(string);
+  for(currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
+    if (*currentPosition != string[0])
+      continue;
+    /* too few bytes left for a match here, hence also at any later position */
+    if ((size_t)(contentEnd - currentPosition) < stringLength)
+      break;
+    if (memcmp(currentPosition, string, stringLength) == 0)
+      return currentPosition;
+  }
+  return NULL;
+}
+
+
+/*
+  Callback that rebuilds a device's score from the bytes stored between the
+  score tags of a profile file.  The bytes are not NUL-terminated.
+*/
+typedef ds_status (*ds_score_deserializer)(ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize); 
+
+/*
+  Locate the first openTag at or after searchStart and the first closeTag
+  that follows it.  On success *fieldStart and *fieldEnd delimit the bytes
+  between the two tags; otherwise DS_PROFILE_FILE_ERROR is returned and the
+  outputs are left untouched.
+*/
+static ds_status findTaggedField(const char* searchStart, const char* contentEnd, const char* openTag, const char* closeTag, const char** fieldStart, const char** fieldEnd) {
+  const char* start;
+  const char* end;
+
+  start = findString(searchStart, contentEnd, openTag);
+  if (start == NULL)
+    return DS_PROFILE_FILE_ERROR;
+  start += strlen(openTag);
+  end = findString(start, contentEnd, closeTag);
+  if (end == NULL)
+    return DS_PROFILE_FILE_ERROR;
+  *fieldStart = start;
+  *fieldEnd = end;
+  return DS_SUCCESS;
+}
+
+/*
+  Parse the decimal number stored between openTag and closeTag.  Fields too
+  long for the local conversion buffer are rejected instead of overflowing it.
+*/
+static ds_status findTaggedUInt(const char* searchStart, const char* contentEnd, const char* openTag, const char* closeTag, cl_uint* value) {
+  const char* fieldStart;
+  const char* fieldEnd;
+  char tmp[16];
+  size_t length;
+  ds_status status;
+
+  status = findTaggedField(searchStart, contentEnd, openTag, closeTag, &fieldStart, &fieldEnd);
+  if (status != DS_SUCCESS)
+    return status;
+  length = (size_t)(fieldEnd - fieldStart);
+  if (length >= sizeof(tmp))
+    return DS_PROFILE_FILE_ERROR;
+  memcpy(tmp, fieldStart, length);
+  tmp[length] = '\0';
+  *value = (cl_uint) strtoul(tmp, NULL, 10);
+  return DS_SUCCESS;
+}
+
+/*
+  readProfileFromFile() loads a profile file written by writeProfileToFile()
+  and, for every device entry that matches a device of `profile`, hands the
+  stored score to `deserializer`.  An OpenCL entry matches a profile device
+  when name, driver version, compute unit count and clock frequency are all
+  equal; a native CPU entry matches every native CPU device of the profile.
+
+  Returns DS_INVALID_PROFILE when profile is NULL, the readProFile() status
+  when the file cannot be read, DS_PROFILE_FILE_ERROR when the version
+  differs or the file is malformed, or the deserializer's error status.
+*/
+static ds_status readProfileFromFile(ds_profile* profile, ds_score_deserializer deserializer, const char* file) {
+
+  ds_status status = DS_SUCCESS;
+  char* contentStart = NULL;
+  const char* contentEnd = NULL;
+  size_t contentSize;
+
+  if (profile==NULL)
+    return DS_INVALID_PROFILE;
+
+  status = readProFile(file, &contentStart, &contentSize);
+  if (status == DS_SUCCESS) {
+    const char* currentPosition;
+    const char* dataStart;
+    const char* dataEnd;
+    size_t versionStringLength;
+
+    contentEnd = contentStart + contentSize;
+
+    /* parse the version string */
+    status = findTaggedField(contentStart, contentEnd, DS_TAG_VERSION, DS_TAG_VERSION_END, &dataStart, &dataEnd);
+    if (status != DS_SUCCESS)
+      goto cleanup;
+
+    versionStringLength = strlen(profile->version);
+    if (versionStringLength != (size_t)(dataEnd-dataStart)
+        || strncmp(profile->version, dataStart, versionStringLength) != 0) {
+      /* version mismatch */
+      status = DS_PROFILE_FILE_ERROR;
+      goto cleanup;
+    }
+    currentPosition = dataEnd+strlen(DS_TAG_VERSION_END);
+
+    /* parse the device information */
+    while (1) {
+      unsigned int i;
+
+      const char* deviceTypeStart;
+      const char* deviceTypeEnd;
+      ds_device_type deviceType;
+
+      const char* deviceScoreStart;
+      const char* deviceScoreEnd;
+
+      dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
+      if (dataStart==NULL) {
+        /* nothing useful remains, quit... */
+        break;
+      }
+      dataStart+=strlen(DS_TAG_DEVICE);
+      dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
+      if (dataEnd==NULL) {
+        status = DS_PROFILE_FILE_ERROR;
+        goto cleanup;
+      }
+
+      /* parse the device type, stored as the raw bytes of the enum value */
+      status = findTaggedField(dataStart, contentEnd, DS_TAG_DEVICE_TYPE, DS_TAG_DEVICE_TYPE_END, &deviceTypeStart, &deviceTypeEnd);
+      if (status != DS_SUCCESS)
+        goto cleanup;
+      if ((size_t)(contentEnd - deviceTypeStart) < sizeof(ds_device_type)) {
+        status = DS_PROFILE_FILE_ERROR;
+        goto cleanup;
+      }
+      memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
+
+      if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
+        const char* deviceNameStart;
+        const char* deviceNameEnd;
+        const char* deviceDriverStart;
+        const char* deviceDriverEnd;
+        cl_uint maxClockFrequency;
+        cl_uint maxComputeUnits;
+
+        /* parse the fields that identify an OpenCL device */
+        status = findTaggedField(dataStart, contentEnd, DS_TAG_DEVICE_NAME, DS_TAG_DEVICE_NAME_END, &deviceNameStart, &deviceNameEnd);
+        if (status != DS_SUCCESS)
+          goto cleanup;
+
+        status = findTaggedField(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION, DS_TAG_DEVICE_DRIVER_VERSION_END, &deviceDriverStart, &deviceDriverEnd);
+        if (status != DS_SUCCESS)
+          goto cleanup;
+
+        status = findTaggedUInt(dataStart, contentEnd, DS_TAG_DEVICE_MAX_COMPUTE_UNITS, DS_TAG_DEVICE_MAX_COMPUTE_UNITS_END, &maxComputeUnits);
+        if (status != DS_SUCCESS)
+          goto cleanup;
+
+        status = findTaggedUInt(dataStart, contentEnd, DS_TAG_DEVICE_MAX_CLOCK_FREQ, DS_TAG_DEVICE_MAX_CLOCK_FREQ_END, &maxClockFrequency);
+        if (status != DS_SUCCESS)
+          goto cleanup;
+
+        /* check if this device is on the system */
+        for (i = 0; i < profile->numDevices; i++) {
+          size_t actualDeviceNameLength;
+          size_t driverVersionLength;
+
+          if (profile->devices[i].type != DS_DEVICE_OPENCL_DEVICE)
+            continue;
+
+          actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
+          driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
+          if (actualDeviceNameLength != (size_t)(deviceNameEnd - deviceNameStart)
+             || driverVersionLength != (size_t)(deviceDriverEnd - deviceDriverStart)
+             || maxComputeUnits != profile->devices[i].oclMaxComputeUnits
+             || maxClockFrequency != profile->devices[i].oclMaxClockFrequency
+             || strncmp(profile->devices[i].oclDeviceName, deviceNameStart, actualDeviceNameLength) != 0
+             || strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart, driverVersionLength) != 0)
+            continue;
+
+          /* both score tags must be present before the score is used */
+          status = findTaggedField(dataStart, contentEnd, DS_TAG_SCORE, DS_TAG_SCORE_END, &deviceScoreStart, &deviceScoreEnd);
+          if (status != DS_SUCCESS)
+            goto cleanup;
+          status = deserializer(profile->devices+i, (const unsigned char*)deviceScoreStart, (unsigned int)(deviceScoreEnd-deviceScoreStart));
+          if (status != DS_SUCCESS)
+            goto cleanup;
+        }
+      }
+      else if (deviceType == DS_DEVICE_NATIVE_CPU) {
+        for (i = 0; i < profile->numDevices; i++) {
+          if (profile->devices[i].type != DS_DEVICE_NATIVE_CPU)
+            continue;
+
+          status = findTaggedField(dataStart, contentEnd, DS_TAG_SCORE, DS_TAG_SCORE_END, &deviceScoreStart, &deviceScoreEnd);
+          if (status != DS_SUCCESS)
+            goto cleanup;
+          status = deserializer(profile->devices+i, (const unsigned char*)deviceScoreStart, (unsigned int)(deviceScoreEnd-deviceScoreStart));
+          if (status != DS_SUCCESS)
+            goto cleanup;
+        }
+      }
+
+      /* skip over the current one to find the next device */
+      currentPosition = dataEnd+strlen(DS_TAG_DEVICE_END);
+    }
+  }
+cleanup:
+  free(contentStart);
+  return status;
+}
+
+/*
+  getNumDeviceWithEmptyScore() stores in *num the number of devices of
+  `profile` whose score pointer is NULL.  Returns DS_MEMORY_ERROR when
+  either argument is NULL and DS_SUCCESS otherwise.
+*/
+static ds_status getNumDeviceWithEmptyScore(ds_profile* profile, unsigned int* num) {
+  unsigned int i;
+  unsigned int count = 0;
+
+  if (profile == NULL || num==NULL)
+    return DS_MEMORY_ERROR;
+  for (i = 0; i < profile->numDevices; i++) {
+    if (profile->devices[i].score == NULL) {
+      /* count in a local: `*num++` would advance the pointer instead of
+         incrementing the value it points to */
+      count++;
+    }
+  }
+  *num = count;
+  return DS_SUCCESS;
+}
+
+/*
+ End of the OpenCL device selection infrastructure
+*/
+
+
+
+/* Accumulating stopwatch used by the device benchmark. */
+typedef struct _AccelerateTimer {
+  long long _freq;     /* ticks per second: QueryPerformanceFrequency() on _WIN32, 1000 otherwise */
+  long long _clocks;   /* ticks accumulated over all completed start/stop intervals */
+  long long _start;    /* tick count recorded by the last start; reset to 0 by stop/reset */
+} AccelerateTimer;
+
+/*
+  Record the current tick count in timer->_start: the performance counter
+  on _WIN32, the wall-clock time in milliseconds elsewhere.
+*/
+static void startAccelerateTimer(AccelerateTimer* timer) {
+#ifdef _WIN32
+  QueryPerformanceCounter((LARGE_INTEGER*)&timer->_start);
+#else
+  struct timeval now;
+  long long milliseconds;
+
+  gettimeofday(&now, 0);
+  milliseconds = (long long)now.tv_sec * 1000LL;
+  milliseconds += (long long)now.tv_usec / 1000LL;
+  timer->_start = milliseconds;
+#endif
+}
+
+/*
+  Add the ticks elapsed since the matching startAccelerateTimer() call to
+  timer->_clocks and clear timer->_start.
+*/
+static void stopAccelerateTimer(AccelerateTimer* timer) {
+  long long elapsed;
+#ifdef _WIN32
+  long long now = 0;
+
+  QueryPerformanceCounter((LARGE_INTEGER*)&now);
+#else
+  struct timeval tv;
+  long long now;
+
+  gettimeofday(&tv, 0);
+  now = (long long)tv.tv_sec * 1000LL + (long long)tv.tv_usec / 1000LL;
+#endif
+  elapsed = now - timer->_start;
+  timer->_clocks += elapsed;
+  timer->_start = 0;
+}
+
+/* Discard the accumulated ticks and any pending start mark. */
+static void resetAccelerateTimer(AccelerateTimer* timer) {
+  timer->_start = 0;
+  timer->_clocks = 0;
+}
+
+
+/*
+  Set the timer's tick frequency (the performance counter frequency on
+  _WIN32, 1000 ticks per second elsewhere) and zero its counters.
+*/
+static void initAccelerateTimer(AccelerateTimer* timer) {
+#ifdef _WIN32
+  QueryPerformanceFrequency((LARGE_INTEGER*)&timer->_freq);
+#else
+  timer->_freq = 1000LL;
+#endif
+  resetAccelerateTimer(timer);
+}
+
+/* Return the accumulated time in seconds (_clocks divided by _freq). */
+double readAccelerateTimer(AccelerateTimer* timer) {
+  return (double)timer->_clocks/(double)timer->_freq;
+}
+
+
+typedef double AccelerateScoreType;
+
+static ds_status AcceleratePerfEvaluator(ds_device* device, void* data) {
+
+  ds_status status = DS_SUCCESS;
+  MagickCLEnv clEnv = NULL;
+  MagickCLEnv oldClEnv = NULL;
+  ExceptionInfo* exception = NULL;
+  AccelerateTimer timer;
+
+  if (device == NULL) {
+    status = DS_PERF_EVALUATOR_ERROR;
+    goto cleanup;
+  }
+
+  clEnv = AcquireMagickOpenCLEnv();
+  exception = AcquireExceptionInfo();
+
+  if (device->type == DS_DEVICE_NATIVE_CPU) {
+    /* CPU device */
+    MagickBooleanType flag = MagickTrue;
+    SetMagickOpenCLEnvParamInternal(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+                                  , sizeof(MagickBooleanType), &flag, exception);
+  }
+  else if (device->type == DS_DEVICE_OPENCL_DEVICE) {
+    /* OpenCL device */
+    SetMagickOpenCLEnvParamInternal(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+      , sizeof(cl_device_id), &device->oclDeviceID,exception);
+  }
+  else {
+    status = DS_PERF_EVALUATOR_ERROR;
+    goto cleanup;
+  }
+  InitOpenCLEnvInternal(clEnv, exception);
+  oldClEnv = defaultCLEnv;
+  defaultCLEnv = clEnv;
+
+  /* microbenchmark */
+  {
+#define ACCELERATE_PERF_DIMEN       "2048x1536"
+#define NUM_ITER                      2
+
+    Image* inputImage;
+    ImageInfo* imageInfo;
+    int i;
+
+    imageInfo = AcquireImageInfo();
+    CloneString(&imageInfo->size,ACCELERATE_PERF_DIMEN);
+    CopyMagickString(imageInfo->filename,"xc:none",MaxTextExtent);
+    inputImage = ReadImage(imageInfo,exception);
+
+    initAccelerateTimer(&timer);
+
+    for (i = 0; i <=NUM_ITER; i++) {
+
+      Image* bluredImage;
+      Image* unsharpedImage;
+      Image* resizedImage;
+
+      if (i > 0)
+        startAccelerateTimer(&timer);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clBeginPerfMarkerAMD("PerfEvaluatorRegion","");
+#endif
+
+      bluredImage = BlurImage(inputImage, 10.0f, 3.5f, exception);
+      unsharpedImage = UnsharpMaskImage(bluredImage, 2.0f,2.0f,50.0f,10.0f,exception);
+      resizedImage = ResizeImage(unsharpedImage,640,480,LanczosFilter,1.0,exception);
+
+#ifdef MAGICKCORE_CLPERFMARKER
+  clEndPerfMarkerAMD();
+#endif
+
+      if (i > 0)
+        stopAccelerateTimer(&timer);
+
+      if (bluredImage) DestroyImage(bluredImage);
+      if (unsharpedImage) DestroyImage(unsharpedImage);
+      if (resizedImage) DestroyImage(resizedImage);
+    }
+    DestroyImage(inputImage);
+  }
+  /* end of microbenchmark */
+  
+  if (device->score == NULL) {
+    device->score = malloc(sizeof(AccelerateScoreType));
+  }
+  *(AccelerateScoreType*)device->score = readAccelerateTimer(&timer);
+
+cleanup:
+  if (clEnv!=NULL)
+    RelinquishMagickOpenCLEnv(clEnv);
+  if (oldClEnv!=NULL)
+    defaultCLEnv = oldClEnv;
+  return status;
+}
+
+
+
+/*
+  AccelerateScoreSerializer() formats the device's score as a decimal string
+  with four fractional digits.  On success *serializedScore points to a
+  buffer obtained from malloc(), which the caller must free(), and
+  *serializedScoreSize holds the string length without the terminator.
+  Returns DS_SCORE_SERIALIZER_ERROR when an argument or the score is NULL
+  or when the buffer cannot be allocated.
+*/
+ds_status AccelerateScoreSerializer(ds_device* device, void** serializedScore, unsigned int* serializedScoreSize) {
+  char* s;
+
+  if (device == NULL || device->score == NULL
+     || serializedScore == NULL || serializedScoreSize == NULL)
+    return DS_SCORE_SERIALIZER_ERROR;
+
+  /* generate a string from the score */
+  s = (char*)malloc(sizeof(char)*256);
+  if (s == NULL)
+    return DS_SCORE_SERIALIZER_ERROR;
+  sprintf(s,"%.4f",*((AccelerateScoreType*)device->score));
+  *serializedScore = (void*)s;
+  *serializedScoreSize = (unsigned int)strlen(s);
+  return DS_SUCCESS;
+}
+
+/*
+  AccelerateScoreDeserializer() converts the serializedScoreSize bytes at
+  serializedScore (a decimal number, not NUL-terminated) back to an
+  AccelerateScoreType and stores it in device->score.  An existing score
+  buffer is reused; otherwise one is allocated with malloc().  Returns
+  DS_SCORE_DESERIALIZER_ERROR when device is NULL, when serializedScore is
+  NULL with a non-zero size, or when memory cannot be allocated.
+*/
+ds_status AccelerateScoreDeserializer(ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize) {
+  char* s;
+  AccelerateScoreType* score;
+
+  if (device == NULL || (serializedScore == NULL && serializedScoreSize > 0))
+    return DS_SCORE_DESERIALIZER_ERROR;
+
+  /* make a NUL-terminated copy so that the text can be converted to a double */
+  s = (char*)malloc((size_t)serializedScoreSize+1);
+  if (s == NULL)
+    return DS_SCORE_DESERIALIZER_ERROR;
+  if (serializedScoreSize > 0)
+    memcpy(s, serializedScore, serializedScoreSize);
+  s[serializedScoreSize] = (char)'\0';
+
+  /* reuse the existing buffer rather than leaking it */
+  score = (AccelerateScoreType*)device->score;
+  if (score == NULL) {
+    score = (AccelerateScoreType*)malloc(sizeof(AccelerateScoreType));
+    if (score == NULL) {
+      free(s);
+      return DS_SCORE_DESERIALIZER_ERROR;
+    }
+    device->score = score;
+  }
+  *score = (AccelerateScoreType)atof(s);
+  free(s);
+  return DS_SUCCESS;
+}
+
+/* Free a score buffer; a NULL score is accepted.  Always returns DS_SUCCESS. */
+ds_status AccelerateScoreRelease(void* score) {
+  /* free() ignores a NULL pointer, so no separate check is needed */
+  free(score);
+  return DS_SUCCESS;
+}
+
+
+/* Version string stored in the profile file and the file's name inside the
+   directory returned by GetOpenCLCachedFilesDirectory(). */
+#define IMAGEMAGICK_PROFILE_VERSION "ImageMagick Device Selection v0.9"
+#define IMAGEMAGICK_PROFILE_FILE    "ImagemagickOpenCLDeviceProfile"
+
+/*
+  autoSelectDevice() chooses the fastest device for clEnv.  Scores cached in
+  the profile file are loaded, devices without a score are benchmarked with
+  AcceleratePerfEvaluator(), the profile file is rewritten when anything was
+  benchmarked, and clEnv is initialized with the lowest-scoring device (or
+  with OpenCL disabled when that device is the native CPU).
+
+  The default OpenCL environment lock is held for the whole call.  Returns
+  MagickTrue on success; on failure an exception is recorded where noted
+  and MagickFalse is returned.
+*/
+static MagickBooleanType autoSelectDevice(MagickCLEnv clEnv, ExceptionInfo* exception) {
+
+  MagickBooleanType mStatus = MagickFalse;
+  ds_status status;
+  ds_profile* profile = NULL;
+  unsigned int numDeviceProfiled = 0;
+  unsigned int i;
+  unsigned int bestDeviceIndex;
+  AccelerateScoreType bestScore = 0;
+  char path[MaxTextExtent];
+
+
+  LockDefaultOpenCLEnv();
+
+  status = initDSProfile(&profile, IMAGEMAGICK_PROFILE_VERSION);
+  if (status!=DS_SUCCESS) {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Error when initializing the profile", "'%s'", ".");
+    /* NOTE(review): assumes initDSProfile() leaves nothing to release when
+       it fails - confirm */
+    profile = NULL;
+    goto cleanup;
+  }
+
+  (void) FormatLocaleString(path,MaxTextExtent,"%s%s%s"
+         ,GetOpenCLCachedFilesDirectory()
+         ,DirectorySeparator,IMAGEMAGICK_PROFILE_FILE);
+
+  /* a missing or unreadable cache is not an error: the devices without a
+     score are simply benchmarked below */
+  readProfileFromFile(profile, AccelerateScoreDeserializer, path);
+  status = profileDevices(profile, DS_EVALUATE_NEW_ONLY, AcceleratePerfEvaluator, NULL, &numDeviceProfiled);
+  if (status!=DS_SUCCESS) {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "Error when profiling the devices", "'%s'", ".");
+    goto cleanup;
+  }
+  if (numDeviceProfiled > 0) {
+    status = writeProfileToFile(profile, AccelerateScoreSerializer, path);
+    if (status!=DS_SUCCESS) {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleWarning, "Error when saving the profile into a file", "'%s'", ".");
+    }
+  }
+
+  /* pick the best (lowest score) device, ignoring devices without a score;
+     numDevices serves as the "nothing found yet" marker */
+  bestDeviceIndex = profile->numDevices;
+  for (i = 0; i < profile->numDevices; i++) {
+    AccelerateScoreType score;
+
+    if (profile->devices[i].score == NULL)
+      continue;
+    score = *(AccelerateScoreType*)profile->devices[i].score;
+    if (bestDeviceIndex == profile->numDevices || score < bestScore) {
+      bestDeviceIndex = i;
+      bestScore = score;
+    }
+  }
+  if (bestDeviceIndex == profile->numDevices) {
+    (void) ThrowMagickException(exception, GetMagickModule(), ModuleFatalError, "No device could be profiled", "'%s'", ".");
+    goto cleanup;
+  }
+
+  /* set up clEnv with the best device */
+  if (profile->devices[bestDeviceIndex].type == DS_DEVICE_NATIVE_CPU) {
+    /* CPU device */
+    MagickBooleanType flag = MagickTrue;
+    SetMagickOpenCLEnvParamInternal(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+                                  , sizeof(MagickBooleanType), &flag, exception);
+  }
+  else if (profile->devices[bestDeviceIndex].type == DS_DEVICE_OPENCL_DEVICE) {
+    /* OpenCL device */
+    SetMagickOpenCLEnvParamInternal(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+      , sizeof(cl_device_id), &profile->devices[bestDeviceIndex].oclDeviceID,exception);
+  }
+  else {
+    status = DS_PERF_EVALUATOR_ERROR;
+    goto cleanup;
+  }
+  InitOpenCLEnvInternal(clEnv, exception);
+  mStatus = MagickTrue;
+
+cleanup:
+
+  /* release the profile on every path, not only on success */
+  if (profile != NULL) {
+    status = releaseDSProfile(profile, AccelerateScoreRelease);
+    if (status!=DS_SUCCESS) {
+      (void) ThrowMagickException(exception, GetMagickModule(), ModuleWarning, "Error when releasing the profile", "'%s'", ".");
+    }
+  }
+
+  UnlockDefaultOpenCLEnv();
+  return mStatus;
+}
+
+
+/*
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                                                             %
+%                                                                             %
+%                                                                             %
++   I n i t I m a g e M a g i c k O p e n C L                                 %
+%                                                                             %
+%                                                                             %
+%                                                                             %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+%  InitImageMagickOpenCL() provides a simplified interface to initialize
+%  the OpenCL environment in ImageMagick.
+%  
+%  The format of the InitImageMagickOpenCL() method is:
+%
+%      MagickBooleanType InitImageMagickOpenCL(ImageMagickOpenCLMode mode, 
+%                                        void* userSelectedDevice, 
+%                                        void* selectedDevice,
+%                                        ExceptionInfo* exception) 
+%
+%  A description of each parameter follows:
+%
+%    o mode: OpenCL mode in ImageMagick, could be off,auto,user
+%
+%    o userSelectedDevice:  when in user mode, a pointer to the selected
+%                           cl_device_id
+%
+%    o selectedDevice: a pointer to cl_device_id where the selected
+%                      cl_device_id by ImageMagick could be returned
+%
+%    o exception: exception
+%
+*/
+/*
+  Configures and initializes the default OpenCL environment for `mode`.
+  Errors are recorded in the caller's `exception`; a temporary one is used,
+  and destroyed before returning, only when the caller passes NULL.
+  Returns MagickFalse in user mode when userSelectedDevice is NULL,
+  otherwise the status of InitOpenCLEnv() (MagickTrue when no default
+  environment is available).
+*/
+MagickBooleanType InitImageMagickOpenCL(ImageMagickOpenCLMode mode, 
+                                        void* userSelectedDevice, 
+                                        void* selectedDevice,
+                                        ExceptionInfo* exception) {
+  MagickBooleanType status = MagickTrue;
+  MagickCLEnv clEnv = NULL;
+  MagickBooleanType flag;
+  ExceptionInfo* localException = NULL;
+
+  /* never overwrite the caller's exception: doing so leaks the replacement
+     and hides every error from the caller */
+  if (exception == NULL) {
+    localException = AcquireExceptionInfo();
+    exception = localException;
+  }
+
+  clEnv = GetDefaultOpenCLEnv();
+  if (clEnv!=NULL) {
+    switch(mode) {
+
+    case MAGICK_OPENCL_OFF:
+      flag = MagickTrue;
+      SetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+        , sizeof(MagickBooleanType), &flag, exception);
+      status = InitOpenCLEnv(clEnv, exception);
+
+      if (selectedDevice)
+        *(cl_device_id*)selectedDevice = NULL;
+      break;
+
+    case MAGICK_OPENCL_DEVICE_SELECT_USER:
+
+      if (userSelectedDevice == NULL) {
+        status = MagickFalse;
+        break;
+      }
+
+      flag = MagickFalse;
+      SetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+        , sizeof(MagickBooleanType), &flag, exception);
+
+      SetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+        , sizeof(cl_device_id), userSelectedDevice,exception);
+
+      status = InitOpenCLEnv(clEnv, exception);
+      if (selectedDevice) {
+        GetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+          , sizeof(cl_device_id), selectedDevice, exception);
+      }
+      break;
+
+    case MAGICK_OPENCL_DEVICE_SELECT_AUTO:
+    default:
+      {
+        /* a NULL device id leaves the choice of device to InitOpenCLEnv() */
+        cl_device_id d = NULL;
+        flag = MagickFalse;
+        SetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED
+          , sizeof(MagickBooleanType), &flag, exception);
+        SetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+          , sizeof(cl_device_id), &d,exception);
+        status = InitOpenCLEnv(clEnv, exception);
+        if (selectedDevice) {
+          GetMagickOpenCLEnvParam(clEnv, MAGICK_OPENCL_ENV_PARAM_DEVICE
+            , sizeof(cl_device_id),  selectedDevice, exception);
+        }
+      }
+      break;
+    }
+  }
+
+  if (localException != NULL)
+    localException = DestroyExceptionInfo(localException);
+  return status;
+}
+
+
+#else
+
+/* Minimal environment structure for builds without MAGICKCORE_OPENCL_SUPPORT. */
+struct _MagickCLEnv {
+  MagickBooleanType OpenCLInitialized;  /* whether OpenCL environment is initialized. */
+};
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: no environment is
+   created and NULL is returned. */
+extern MagickExport MagickCLEnv AcquireMagickOpenCLEnv()
+{
+  return NULL;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: the argument is
+   ignored and MagickFalse is returned. */
+extern MagickExport MagickBooleanType RelinquishMagickOpenCLEnv(
+  MagickCLEnv magick_unused(clEnv))
+{
+  magick_unreferenced(clEnv);
+
+  return MagickFalse;
+}
+
+/*
+* Return the OpenCL environment.  This stub for builds without
+* MAGICKCORE_OPENCL_SUPPORT ignores its argument and always returns NULL.
+*/ 
+MagickExport MagickCLEnv GetDefaultOpenCLEnv(
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(exception);
+
+  return (MagickCLEnv) NULL;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: the argument is
+   ignored and NULL is returned. */
+MagickExport MagickCLEnv SetDefaultOpenCLEnv(
+  MagickCLEnv magick_unused(clEnv))
+{
+  magick_unreferenced(clEnv);
+
+  return (MagickCLEnv) NULL;
+} 
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: nothing is stored and
+   MagickFalse is returned. */
+MagickExport MagickBooleanType SetMagickOpenCLEnvParam(
+  MagickCLEnv magick_unused(clEnv),MagickOpenCLEnvParam magick_unused(param),
+  size_t magick_unused(dataSize),void *magick_unused(data),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(param);
+  magick_unreferenced(dataSize);
+  magick_unreferenced(data);
+  magick_unreferenced(exception);
+
+  return MagickFalse;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: `data` is left
+   untouched and MagickFalse is returned. */
+MagickExport MagickBooleanType GetMagickOpenCLEnvParam(
+  MagickCLEnv magick_unused(clEnv),MagickOpenCLEnvParam magick_unused(param),
+  size_t magick_unused(dataSize),void *magick_unused(data),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(param);
+  magick_unreferenced(dataSize);
+  magick_unreferenced(data);
+  magick_unreferenced(exception);
+
+  return MagickFalse;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: nothing is
+   initialized and MagickFalse is returned. */
+MagickExport MagickBooleanType InitOpenCLEnv(MagickCLEnv magick_unused(clEnv),
+  ExceptionInfo *magick_unused(exception))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(exception);
+
+  return MagickFalse;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: no queue is created
+   and NULL is returned. */
+MagickExport cl_command_queue AcquireOpenCLCommandQueue(
+  MagickCLEnv magick_unused(clEnv))
+{
+  magick_unreferenced(clEnv);
+
+  return (cl_command_queue) NULL;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: the arguments are
+   ignored and MagickFalse is returned. */
+MagickExport MagickBooleanType RelinquishCommandQueue(
+  MagickCLEnv magick_unused(clEnv),cl_command_queue magick_unused(queue))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(queue);
+
+  return MagickFalse;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: no kernel is looked
+   up and NULL is returned. */
+MagickExport cl_kernel AcquireOpenCLKernel(
+  MagickCLEnv magick_unused(clEnv),MagickOpenCLProgram magick_unused(program),
+  const char *magick_unused(kernelName))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(program);
+  magick_unreferenced(kernelName);
+
+  return (cl_kernel)NULL;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: the arguments are
+   ignored and MagickFalse is returned. */
+MagickExport MagickBooleanType RelinquishOpenCLKernel(
+  MagickCLEnv magick_unused(clEnv),cl_kernel magick_unused(kernel))
+{
+  magick_unreferenced(clEnv);
+  magick_unreferenced(kernel);
+
+  return MagickFalse;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: always reports a
+   size of 0. */
+MagickExport unsigned long GetOpenCLDeviceLocalMemorySize(
+  MagickCLEnv magick_unused(clEnv))
+{
+  magick_unreferenced(clEnv);
+
+  return 0;
+}
+
+/* Stub for builds without MAGICKCORE_OPENCL_SUPPORT: every argument is
+   ignored, selectedDevice is not written and MagickFalse is returned. */
+MagickBooleanType InitImageMagickOpenCL(ImageMagickOpenCLMode mode, 
+                                        void* userSelectedDevice, 
+                                        void* selectedDevice,
+                                        ExceptionInfo* exception) 
+{
+  magick_unreferenced(mode);
+  magick_unreferenced(userSelectedDevice);
+  magick_unreferenced(selectedDevice);
+  magick_unreferenced(exception);
+  return MagickFalse;
+}
+
+#endif /* MAGICKCORE_OPENCL_SUPPORT */
+
+char* openclCachedFilesDirectory;
+SemaphoreInfo* openclCachedFilesDirectoryLock;
+
+MagickExport
+const char* GetOpenCLCachedFilesDirectory() {
+  if (openclCachedFilesDirectory == NULL) {
+    if (openclCachedFilesDirectoryLock == NULL)
+    {
+      AcquireSemaphoreInfo(&openclCachedFilesDirectoryLock);
+    }
+    LockSemaphoreInfo(openclCachedFilesDirectoryLock);
+    if (openclCachedFilesDirectory == NULL) {
+      char path[MaxTextExtent];
+      char *home = NULL;
+      char *temp = NULL;
+      struct stat attributes;
+      MagickBooleanType status;
+
+#ifdef MAGICKCORE_WINDOWS_SUPPORT
+      home=GetEnvironmentValue("LOCALAPPDATA");
+      if (home == (char *) NULL)
+        home=GetEnvironmentValue("APPDATA");
+      if (home == (char *) NULL)
+        home=GetEnvironmentValue("USERPROFILE");
+#else
+      home=GetEnvironmentValue("HOME");
+#endif
+      if (home != (char *) NULL)
+      {
+        /*
+        Search $HOME/.magick.
+        */
+        (void) FormatLocaleString(path,MaxTextExtent,"%s%s.magick",home,
+          DirectorySeparator);
+        home=DestroyString(home);
+        temp = (char*)AcquireMagickMemory(strlen(path)+1);
+        CopyMagickString(temp,path,strlen(path)+1);
+        status=GetPathAttributes(path,&attributes);
+        if (status == MagickFalse) {
+#ifdef MAGICKCORE_WINDOWS_SUPPORT
+          mkdir(path);
+#else
+          mkdir(path, 0777);
+#endif
+        }
+      }
+      openclCachedFilesDirectory = temp;
+    }
+    UnlockSemaphoreInfo(openclCachedFilesDirectoryLock); 
+  }
+  return openclCachedFilesDirectory;
+}
+
+
+/*
+  OpenCLLog() appends `message` followed by a newline to the file
+  ImageMagickOpenCL.log in the directory returned by
+  GetOpenCLCachedFilesDirectory().  Logging is best effort: nothing happens
+  when message is NULL, when no directory is available or when the log file
+  cannot be opened.
+*/
+MagickExport
+void OpenCLLog(const char* message) {
+
+#define OPENCL_LOG_FILE "ImageMagickOpenCL.log"
+
+  FILE* logFile;
+  const char* directory;
+  char path[MaxTextExtent];
+
+  if (message == NULL)
+    return;
+
+  directory = GetOpenCLCachedFilesDirectory();
+  if (directory == NULL)
+    return;
+
+  (void) FormatLocaleString(path,MaxTextExtent,"%s%s%s"
+    ,directory
+    ,DirectorySeparator,OPENCL_LOG_FILE);
+
+  logFile = fopen(path, "ab");
+  if (logFile == NULL)
+    return;
+  fwrite(message, sizeof(char), strlen(message), logFile);
+  fwrite("\n", sizeof(char), 1, logFile);
+  fclose(logFile);
+}
+
index b343f81dc2d0ff51a9ac72f6b31298058978ad1f..f3838d89df312fdd2a33cfe82b40694034616967 100644 (file)
 #ifndef _MAGICKCORE_OPENCL_H
 #define _MAGICKCORE_OPENCL_H
 
+
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
+/*
+  OpenCL program modules.  MAGICK_OPENCL_NUM_PROGRAMS is not a module
+  itself: being the last enumerator, its value equals the number of modules
+  listed before it.
+*/
+typedef enum {
+  MAGICK_OPENCL_ACCELERATE = 0
+  ,MAGICK_OPENCL_NUM_PROGRAMS   /* !!! This has to be the last entry !!! */
+} MagickOpenCLProgram;
+
+
+/* Handle to an OpenCL environment: a pointer to struct _MagickCLEnv, whose
+   layout is not given in this header. */
+typedef struct _MagickCLEnv* MagickCLEnv;
+
+/* Create a new environment (NULL in builds without OpenCL support). */
+extern MagickExport
+  MagickCLEnv AcquireMagickOpenCLEnv();
+
+/* Release an environment obtained from AcquireMagickOpenCLEnv(). */
+extern MagickExport
+  MagickBooleanType RelinquishMagickOpenCLEnv(MagickCLEnv);
+
+/* Return the default environment.
+   NOTE(review): the non-OpenCL definition takes an ExceptionInfo* argument
+   that this declaration does not list - confirm which is intended. */
+extern MagickExport
+  MagickCLEnv GetDefaultOpenCLEnv();
+
+/* Replace the default environment. */
+extern MagickExport
+  MagickCLEnv SetDefaultOpenCLEnv(MagickCLEnv);
+
+
+/* Parameter type accepted by SetMagickOpenCLEnvParam and GetMagickOpenCLEnvParam;
+   the comment on each entry names the type of the data passed with it. */
+typedef enum {
+    MAGICK_OPENCL_ENV_PARAM_DEVICE                  /* cl_device_id (from OpenCL) */
+  , MAGICK_OPENCL_ENV_PARAM_OPENCL_DISABLED         /* MagickBooleanType */
+  , MAGICK_OPENCL_ENV_PARAM_OPENCL_INITIALIZED      /* MagickBooleanType */
+} MagickOpenCLEnvParam;
+
+/* Set an environment parameter; arguments are the environment, the
+   parameter, the size of the data in bytes, the data and an exception. */
+extern MagickExport
+  MagickBooleanType SetMagickOpenCLEnvParam(MagickCLEnv, MagickOpenCLEnvParam, size_t, void*, ExceptionInfo*);
+
+/* Read an environment parameter into the supplied buffer; same argument
+   order as SetMagickOpenCLEnvParam(). */
+extern MagickExport
+  MagickBooleanType GetMagickOpenCLEnvParam(MagickCLEnv, MagickOpenCLEnvParam, size_t, void*, ExceptionInfo*);
+
+
+/* Initialize an environment using the parameters set on it. */
+extern MagickExport
+  MagickBooleanType InitOpenCLEnv(MagickCLEnv, ExceptionInfo*);
+
+/*
+  Modes accepted by InitImageMagickOpenCL():
+    MAGICK_OPENCL_OFF                 disable OpenCL
+    MAGICK_OPENCL_DEVICE_SELECT_AUTO  let ImageMagick choose the device
+    MAGICK_OPENCL_DEVICE_SELECT_USER  use the device supplied by the caller
+*/
+typedef enum {
+  MAGICK_OPENCL_OFF = 0
+, MAGICK_OPENCL_DEVICE_SELECT_AUTO = 1
+, MAGICK_OPENCL_DEVICE_SELECT_USER = 2
+} ImageMagickOpenCLMode ;
+
+/* Arguments: mode, pointer to the user-selected cl_device_id (user mode),
+   pointer receiving the selected cl_device_id (may be NULL), exception. */
+extern MagickExport
+MagickBooleanType InitImageMagickOpenCL(ImageMagickOpenCLMode, void*, void*, ExceptionInfo*);
+
+
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
index ca052925e2ff94620fd2b964c4a5734c47df04c7..aa3e0f182cef27ff4b1cdc58899f0c32d2eb7662 100644 (file)
@@ -22,6 +22,9 @@
 extern "C" {
 #endif
 
+/* Type name for struct _ResizeFilter; the structure's members are not
+   declared in the lines visible here. */
+typedef struct _ResizeFilter
+  ResizeFilter;
+
 extern MagickExport Image
   *AdaptiveResizeImage(const Image *,const size_t,const size_t,ExceptionInfo *),
   *InterpolativeResizeImage(const Image *,const size_t,const size_t,