granicus.if.org Git - imagemagick/blob - MagickCore/accelerate-kernels-private.h

   1 /*
   2   Copyright 1999-2018 ImageMagick Studio LLC, a non-profit organization
   3   dedicated to making software imaging solutions freely available.
   4
   5   You may not use this file except in compliance with the License.
   6   obtain a copy of the License at
   7
   8     https://www.imagemagick.org/script/license.php
   9
  10   Unless required by applicable law or agreed to in writing, software
  11   distributed under the License is distributed on an "AS IS" BASIS,
  12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13   See the License for the specific language governing permissions and
  14   limitations under the License.
  15
  16   MagickCore private kernels for accelerated functions.
  17 */
  18
  19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
  20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
  21
  22 #if defined(__cplusplus) || defined(c_plusplus)
  23 extern "C" {
  24 #endif
  25
  26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
  27
  28 /*
  29   Define declarations.
  30 */
  31 #define OPENCL_DEFINE(VAR,...)  "\n #""define " #VAR " " #__VA_ARGS__ " \n"
  32 #define OPENCL_ELIF(...)        "\n #""elif " #__VA_ARGS__ " \n"
  33 #define OPENCL_ELSE()           "\n #""else " " \n"
  34 #define OPENCL_ENDIF()          "\n #""endif " " \n"
  35 #define OPENCL_IF(...)          "\n #""if " #__VA_ARGS__ " \n"
  36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
  37
  38 const char *accelerateKernels =
  39
  40 /*
  41   Define declarations.
  42 */
  43   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
  44   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
  45   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
  46   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
  47   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
  48   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
  49   OPENCL_DEFINE(SigmaRandom, (attenuate))
  50   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
  51   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
  52   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
  53
  54 /*
  55   Typedef declarations.
  56 */
  57   STRINGIFY(
  58     typedef enum
  59   {
  60     UndefinedColorspace,
  61     RGBColorspace,            /* Linear RGB colorspace */
  62     GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
  63     TransparentColorspace,
  64     OHTAColorspace,
  65     LabColorspace,
  66     XYZColorspace,
  67     YCbCrColorspace,
  68     YCCColorspace,
  69     YIQColorspace,
  70     YPbPrColorspace,
  71     YUVColorspace,
  72     CMYKColorspace,           /* negared linear RGB with black separated */
  73     sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
  74     HSBColorspace,
  75     HSLColorspace,
  76     HWBColorspace,
  77     Rec601LumaColorspace,
  78     Rec601YCbCrColorspace,
  79     Rec709LumaColorspace,
  80     Rec709YCbCrColorspace,
  81     LogColorspace,
  82     CMYColorspace,            /* negated linear RGB colorspace */
  83     LuvColorspace,
  84     HCLColorspace,
  85     LCHColorspace,            /* alias for LCHuv */
  86     LMSColorspace,
  87     LCHabColorspace,          /* Cylindrical (Polar) Lab */
  88     LCHuvColorspace,          /* Cylindrical (Polar) Luv */
  89     scRGBColorspace,
  90     HSIColorspace,
  91     HSVColorspace,            /* alias for HSB */
  92     HCLpColorspace,
  93     YDbDrColorspace
  94   } ColorspaceType;
  95   )
  96
  97   STRINGIFY(
  98     typedef enum
  99     {
 100       UndefinedCompositeOp,
 101       NoCompositeOp,
 102       ModulusAddCompositeOp,
 103       AtopCompositeOp,
 104       BlendCompositeOp,
 105       BumpmapCompositeOp,
 106       ChangeMaskCompositeOp,
 107       ClearCompositeOp,
 108       ColorBurnCompositeOp,
 109       ColorDodgeCompositeOp,
 110       ColorizeCompositeOp,
 111       CopyBlackCompositeOp,
 112       CopyBlueCompositeOp,
 113       CopyCompositeOp,
 114       CopyCyanCompositeOp,
 115       CopyGreenCompositeOp,
 116       CopyMagentaCompositeOp,
 117       CopyOpacityCompositeOp,
 118       CopyRedCompositeOp,
 119       CopyYellowCompositeOp,
 120       DarkenCompositeOp,
 121       DstAtopCompositeOp,
 122       DstCompositeOp,
 123       DstInCompositeOp,
 124       DstOutCompositeOp,
 125       DstOverCompositeOp,
 126       DifferenceCompositeOp,
 127       DisplaceCompositeOp,
 128       DissolveCompositeOp,
 129       ExclusionCompositeOp,
 130       HardLightCompositeOp,
 131       HueCompositeOp,
 132       InCompositeOp,
 133       LightenCompositeOp,
 134       LinearLightCompositeOp,
 135       LuminizeCompositeOp,
 136       MinusDstCompositeOp,
 137       ModulateCompositeOp,
 138       MultiplyCompositeOp,
 139       OutCompositeOp,
 140       OverCompositeOp,
 141       OverlayCompositeOp,
 142       PlusCompositeOp,
 143       ReplaceCompositeOp,
 144       SaturateCompositeOp,
 145       ScreenCompositeOp,
 146       SoftLightCompositeOp,
 147       SrcAtopCompositeOp,
 148       SrcCompositeOp,
 149       SrcInCompositeOp,
 150       SrcOutCompositeOp,
 151       SrcOverCompositeOp,
 152       ModulusSubtractCompositeOp,
 153       ThresholdCompositeOp,
 154       XorCompositeOp,
 155       /* These are new operators, added after the above was last sorted.
 156        * The list should be re-sorted only when a new library version is
 157        * created.
 158        */
 159       DivideDstCompositeOp,
 160       DistortCompositeOp,
 161       BlurCompositeOp,
 162       PegtopLightCompositeOp,
 163       VividLightCompositeOp,
 164       PinLightCompositeOp,
 165       LinearDodgeCompositeOp,
 166       LinearBurnCompositeOp,
 167       MathematicsCompositeOp,
 168       DivideSrcCompositeOp,
 169       MinusSrcCompositeOp,
 170       DarkenIntensityCompositeOp,
 171       LightenIntensityCompositeOp
 172     } CompositeOperator;
 173   )
 174
 175   STRINGIFY(
 176      typedef enum
 177      {
 178        UndefinedFunction,
 179        ArcsinFunction,
 180        ArctanFunction,
 181        PolynomialFunction,
 182        SinusoidFunction
 183      } MagickFunction;
 184   )
 185
 186   STRINGIFY(
 187     typedef enum
 188     {
 189       UndefinedNoise,
 190       UniformNoise,
 191       GaussianNoise,
 192       MultiplicativeGaussianNoise,
 193       ImpulseNoise,
 194       LaplacianNoise,
 195       PoissonNoise,
 196       RandomNoise
 197     } NoiseType;
 198   )
 199
 200   STRINGIFY(
 201     typedef enum
 202   {
 203     UndefinedPixelIntensityMethod = 0,
 204     AveragePixelIntensityMethod,
 205     BrightnessPixelIntensityMethod,
 206     LightnessPixelIntensityMethod,
 207     Rec601LumaPixelIntensityMethod,
 208     Rec601LuminancePixelIntensityMethod,
 209     Rec709LumaPixelIntensityMethod,
 210     Rec709LuminancePixelIntensityMethod,
 211     RMSPixelIntensityMethod,
 212     MSPixelIntensityMethod
 213   } PixelIntensityMethod;
 214   )
 215
 216   STRINGIFY(
 217   typedef enum {
 218     BoxWeightingFunction = 0,
 219     TriangleWeightingFunction,
 220     CubicBCWeightingFunction,
 221     HannWeightingFunction,
 222     HammingWeightingFunction,
 223     BlackmanWeightingFunction,
 224     GaussianWeightingFunction,
 225     QuadraticWeightingFunction,
 226     JincWeightingFunction,
 227     SincWeightingFunction,
 228     SincFastWeightingFunction,
 229     KaiserWeightingFunction,
 230     WelshWeightingFunction,
 231     BohmanWeightingFunction,
 232     LagrangeWeightingFunction,
 233     CosineWeightingFunction,
 234   } ResizeWeightingFunctionType;
 235   )
 236
 237   STRINGIFY(
 238      typedef enum
 239      {
 240        UndefinedChannel = 0x0000,
 241        RedChannel = 0x0001,
 242        GrayChannel = 0x0001,
 243        CyanChannel = 0x0001,
 244        GreenChannel = 0x0002,
 245        MagentaChannel = 0x0002,
 246        BlueChannel = 0x0004,
 247        YellowChannel = 0x0004,
 248        BlackChannel = 0x0008,
 249        AlphaChannel = 0x0010,
 250        OpacityChannel = 0x0010,
 251        IndexChannel = 0x0020,             /* Color Index Table? */
 252        ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
 253        WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
 254        MetaChannel = 0x0100,              /* ???? */
 255        CompositeChannels = 0x001F,
 256        AllChannels = 0x7ffffff,
 257        /*
 258          Special purpose channel types.
 259          FUTURE: are these needed any more - they are more like hacks
 260          SyncChannels for example is NOT a real channel but a 'flag'
 261          It really says -- "User has not defined channels"
 262          Though it does have extra meaning in the "-auto-level" operator
 263        */
 264        TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
 265        RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
 266        GrayChannels = 0x0400,
 267        SyncChannels = 0x20000,    /* channels modified as a single unit */
 268        DefaultChannels = AllChannels
 269      } ChannelType;  /* must correspond to PixelChannel */
 270   )
 271
 272 /*
 273   Helper functions.
 274 */
 275
 276 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
 277
 278   STRINGIFY(
 279     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 280     {
 281       return((CLQuantum) value);
 282     }
 283   )
 284
 285 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
 286
 287   STRINGIFY(
 288     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 289     {
 290       return((CLQuantum) (257.0f*value));
 291     }
 292   )
 293
 294 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
 295
 296   STRINGIFY(
 297     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 298     {
 299       return((CLQuantum) (16843009.0*value));
 300     }
 301   )
 302
 303 OPENCL_ENDIF()
 304
 305   STRINGIFY(
 306     inline int ClampToCanvas(const int offset,const int range)
 307       {
 308         return clamp(offset, (int)0, range-1);
 309       }
 310   )
 311
 312   STRINGIFY(
 313     inline CLQuantum ClampToQuantum(const float value)
 314       {
 315         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
 316       }
 317   )
 318
 319   STRINGIFY(
 320     inline uint ScaleQuantumToMap(CLQuantum value)
 321       {
 322         if (value >= (CLQuantum) MaxMap)
 323           return ((uint)MaxMap);
 324         else
 325           return ((uint)value);
 326       }
 327   )
 328
 329   STRINGIFY(
 330     inline float PerceptibleReciprocal(const float x)
 331     {
 332       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
 333       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
 334     }
 335   )
 336
 337   STRINGIFY(
 338   inline float RoundToUnity(const float value)
 339    {
 340      return clamp(value,0.0f,1.0f);
 341    }
 342   )
 343
 344   STRINGIFY(
 345
 346   inline unsigned int getPixelIndex(const unsigned int number_channels,
 347     const unsigned int columns, const unsigned int x, const unsigned int y)
 348   {
 349     return (x * number_channels) + (y * columns * number_channels);
 350   }
 351
 352   inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
 353   inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
 354   inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
 355   inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
 356
 357   inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
 358   inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
 359   inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
 360   inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
 361
 362   inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
 363   inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
 364   inline float getBlueF4(float4 p)                      { return p.x; }
 365   inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
 366
 367   inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
 368   inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
 369   inline float getGreenF4(float4 p)                     { return p.y; }
 370   inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
 371
 372   inline CLQuantum getRed(CLPixelType p)                { return p.z; }
 373   inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
 374   inline float getRedF4(float4 p)                       { return p.z; }
 375   inline void setRedF4(float4* p, float value)          { (*p).z = value; }
 376
 377   inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
 378   inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
 379   inline float getAlphaF4(float4 p)                     { return p.w; }
 380   inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
 381
 382   inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
 383     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
 384   {
 385     if ((channel & RedChannel) != 0)
 386       *red=getPixelRed(p);
 387
 388     if (number_channels > 2)
 389       {
 390         if ((channel & GreenChannel) != 0)
 391           *green=getPixelGreen(p);
 392
 393         if ((channel & BlueChannel) != 0)
 394           *blue=getPixelBlue(p);
 395       }
 396
 397     if (((number_channels == 4) || (number_channels == 2)) &&
 398         ((channel & AlphaChannel) != 0))
 399       *alpha=getPixelAlpha(p,number_channels);
 400   }
 401
 402   inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
 403     const unsigned int columns, const unsigned int x, const unsigned int y)
 404   {
 405     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 406
 407     float4 pixel;
 408
 409     pixel.x=getPixelRed(p);
 410
 411     if (number_channels > 2)
 412       {
 413         pixel.y=getPixelGreen(p);
 414         pixel.z=getPixelBlue(p);
 415       }
 416
 417     if ((number_channels == 4) || (number_channels == 2))
 418       pixel.w=getPixelAlpha(p,number_channels);
 419     return(pixel);
 420   }
 421
 422   inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
 423     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
 424   {
 425     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 426
 427     float red = 0.0f;
 428     float green = 0.0f;
 429     float blue = 0.0f;
 430     float alpha = 0.0f;
 431
 432     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
 433     return (float4)(red, green, blue, alpha);
 434   }
 435
 436   inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
 437     const ChannelType channel, float red, float green, float blue, float alpha)
 438   {
 439     if ((channel & RedChannel) != 0)
 440       setPixelRed(p,ClampToQuantum(red));
 441
 442     if (number_channels > 2)
 443       {
 444         if ((channel & GreenChannel) != 0)
 445           setPixelGreen(p,ClampToQuantum(green));
 446
 447         if ((channel & BlueChannel) != 0)
 448           setPixelBlue(p,ClampToQuantum(blue));
 449       }
 450
 451     if (((number_channels == 4) || (number_channels == 2)) &&
 452         ((channel & AlphaChannel) != 0))
 453       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
 454   }
 455
 456   inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
 457     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
 458   {
 459     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 460
 461     setPixelRed(p,ClampToQuantum(pixel.x));
 462
 463     if (number_channels > 2)
 464       {
 465         setPixelGreen(p,ClampToQuantum(pixel.y));
 466         setPixelBlue(p,ClampToQuantum(pixel.z));
 467       }
 468
 469     if ((number_channels == 4) || (number_channels == 2))
 470       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
 471   }
 472
 473   inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
 474     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
 475     float4 pixel)
 476   {
 477     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 478     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
 479   }
 480
 481   inline float GetPixelIntensity(const unsigned int colorspace,
 482     const unsigned int method,float red,float green,float blue)
 483   {
 484     float intensity;
 485
 486     if (colorspace == GRAYColorspace)
 487       return red;
 488
 489     switch (method)
 490     {
 491       case AveragePixelIntensityMethod:
 492         {
 493           intensity=(red+green+blue)/3.0;
 494           break;
 495         }
 496       case BrightnessPixelIntensityMethod:
 497         {
 498           intensity=MagickMax(MagickMax(red,green),blue);
 499           break;
 500         }
 501       case LightnessPixelIntensityMethod:
 502         {
 503           intensity=(MagickMin(MagickMin(red,green),blue)+
 504               MagickMax(MagickMax(red,green),blue))/2.0;
 505           break;
 506         }
 507       case MSPixelIntensityMethod:
 508         {
 509           intensity=(float) (((float) red*red+green*green+blue*blue)/
 510               (3.0*QuantumRange));
 511           break;
 512         }
 513       case Rec601LumaPixelIntensityMethod:
 514         {
 515           /*
 516           if (image->colorspace == RGBColorspace)
 517           {
 518             red=EncodePixelGamma(red);
 519             green=EncodePixelGamma(green);
 520             blue=EncodePixelGamma(blue);
 521           }
 522           */
 523           intensity=0.298839*red+0.586811*green+0.114350*blue;
 524           break;
 525         }
 526       case Rec601LuminancePixelIntensityMethod:
 527         {
 528           /*
 529           if (image->colorspace == sRGBColorspace)
 530           {
 531             red=DecodePixelGamma(red);
 532             green=DecodePixelGamma(green);
 533             blue=DecodePixelGamma(blue);
 534           }
 535           */
 536           intensity=0.298839*red+0.586811*green+0.114350*blue;
 537           break;
 538         }
 539       case Rec709LumaPixelIntensityMethod:
 540       default:
 541         {
 542           /*
 543           if (image->colorspace == RGBColorspace)
 544           {
 545             red=EncodePixelGamma(red);
 546             green=EncodePixelGamma(green);
 547             blue=EncodePixelGamma(blue);
 548           }
 549           */
 550           intensity=0.212656*red+0.715158*green+0.072186*blue;
 551           break;
 552         }
 553       case Rec709LuminancePixelIntensityMethod:
 554         {
 555           /*
 556           if (image->colorspace == sRGBColorspace)
 557           {
 558             red=DecodePixelGamma(red);
 559             green=DecodePixelGamma(green);
 560             blue=DecodePixelGamma(blue);
 561           }
 562           */
 563           intensity=0.212656*red+0.715158*green+0.072186*blue;
 564           break;
 565         }
 566       case RMSPixelIntensityMethod:
 567         {
 568           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
 569               sqrt(3.0));
 570           break;
 571         }
 572     }
 573
 574     return intensity;
 575   }
 576
 577   inline int mirrorBottom(int value)
 578   {
 579       return (value < 0) ? - (value) : value;
 580   }
 581
 582   inline int mirrorTop(int value, int width)
 583   {
 584       return (value >= width) ? (2 * width - value - 1) : value;
 585   }
 586   )
 587
 588 /*
 589 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 590 %                                                                             %
 591 %                                                                             %
 592 %                                                                             %
 593 %     A d d N o i s e                                                         %
 594 %                                                                             %
 595 %                                                                             %
 596 %                                                                             %
 597 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 598 */
 599
 600   STRINGIFY(
 601   /*
 602   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
 603   This is provided under BSD, full license is with the main package.
 604   See http://www.doc.ic.ac.uk/~dt10/research
 605   */
 606
 607   // Pre: a<M, b<M
 608   // Post: r=(a+b) mod M
 609   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
 610   {
 611     ulong v=a+b;
 612     //if( (v>=M) || (v<a) )
 613     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
 614       v=v-M;
 615     return v;
 616   }
 617
 618   // Pre: a<M,b<M
 619   // Post: r=(a*b) mod M
 620   // This could be done more efficently, but it is portable, and should
 621   // be easy to understand. It can be replaced with any of the better
 622   // modular multiplication algorithms (for example if you know you have
 623   // double precision available or something).
 624   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
 625   {
 626     ulong r=0;
 627     while(a!=0){
 628       if(a&1)
 629         r=MWC_AddMod64(r,b,M);
 630       b=MWC_AddMod64(b,b,M);
 631       a=a>>1;
 632     }
 633     return r;
 634   }
 635
 636   // Pre: a<M, e>=0
 637   // Post: r=(a^b) mod M
 638   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
 639   // most architectures
 640   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
 641   {
 642     ulong sqr=a, acc=1;
 643     while(e!=0){
 644       if(e&1)
 645         acc=MWC_MulMod64(acc,sqr,M);
 646         sqr=MWC_MulMod64(sqr,sqr,M);
 647       e=e>>1;
 648     }
 649     return acc;
 650   }
 651
 652   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
 653   {
 654     ulong m=MWC_PowMod64(A, distance, M);
 655     ulong x=curr.x*(ulong)A+curr.y;
 656     x=MWC_MulMod64(x, m, M);
 657     return (uint2)((uint)(x/A), (uint)(x%A));
 658   }
 659
 660   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
 661   {
 662     // This is an arbitrary constant for starting LCG jumping from. I didn't
 663     // want to start from 1, as then you end up with the two or three first values
 664     // being a bit poor in ones - once you've decided that, one constant is as
 665     // good as any another. There is no deep mathematical reason for it, I just
 666     // generated a random number.
 667     enum{ MWC_BASEID = 4077358422479273989UL };
 668
 669     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
 670     ulong m=MWC_PowMod64(A, dist, M);
 671
 672     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
 673     return (uint2)((uint)(x/A), (uint)(x%A));
 674   }
 675
 676   //! Represents the state of a particular generator
 677   typedef struct{ uint x; uint c; } mwc64x_state_t;
 678
 679   enum{ MWC64X_A = 4294883355U };
 680   enum{ MWC64X_M = 18446383549859758079UL };
 681
 682   void MWC64X_Step(mwc64x_state_t *s)
 683   {
 684     uint X=s->x, C=s->c;
 685
 686     uint Xn=MWC64X_A*X+C;
 687     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
 688     uint Cn=mad_hi(MWC64X_A,X,carry);
 689
 690     s->x=Xn;
 691     s->c=Cn;
 692   }
 693
 694   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
 695   {
 696     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
 697     s->x=tmp.x;
 698     s->c=tmp.y;
 699   }
 700
 701   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
 702   {
 703     uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
 704     s->x=tmp.x;
 705     s->c=tmp.y;
 706   }
 707
 708   //! Return a 32-bit integer in the range [0..2^32)
 709   uint MWC64X_NextUint(mwc64x_state_t *s)
 710   {
 711     uint res=s->x ^ s->c;
 712     MWC64X_Step(s);
 713     return res;
 714   }
 715
 716   //
 717   // End of MWC64X excerpt
 718   //
 719
 720   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
 721   {
 722     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
 723   }
 724
 725   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
 726   {
 727     float
 728       alpha,
 729       beta,
 730       noise,
 731       sigma;
 732
 733     noise = 0.0f;
 734     alpha=mwcReadPseudoRandomValue(r);
 735     switch(noise_type)
 736     {
 737       case UniformNoise:
 738       default:
 739         {
 740           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
 741           break;
 742         }
 743       case GaussianNoise:
 744         {
 745           float
 746             gamma,
 747             tau;
 748
 749           if (alpha == 0.0f)
 750             alpha=1.0f;
 751           beta=mwcReadPseudoRandomValue(r);
 752           gamma=sqrt(-2.0f*log(alpha));
 753           sigma=gamma*cospi((2.0f*beta));
 754           tau=gamma*sinpi((2.0f*beta));
 755           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
 756           break;
 757         }
 758       case ImpulseNoise:
 759       {
 760         if (alpha < (SigmaImpulse/2.0f))
 761           noise=0.0f;
 762         else
 763           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
 764             noise=QuantumRange;
 765           else
 766             noise=pixel;
 767         break;
 768       }
 769       case LaplacianNoise:
 770       {
 771         if (alpha <= 0.5f)
 772           {
 773             if (alpha <= MagickEpsilon)
 774               noise=(pixel-QuantumRange);
 775             else
 776               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
 777                 0.5f);
 778             break;
 779           }
 780         beta=1.0f-alpha;
 781         if (beta <= (0.5f*MagickEpsilon))
 782           noise=(pixel+QuantumRange);
 783         else
 784           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
 785         break;
 786       }
 787       case MultiplicativeGaussianNoise:
 788       {
 789         sigma=1.0f;
 790         if (alpha > MagickEpsilon)
 791           sigma=sqrt(-2.0f*log(alpha));
 792         beta=mwcReadPseudoRandomValue(r);
 793         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
 794           cospi((2.0f*beta))/2.0f);
 795         break;
 796       }
 797       case PoissonNoise:
 798       {
 799         float
 800           poisson;
 801         unsigned int i;
 802         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
 803         for (i=0; alpha > poisson; i++)
 804         {
 805           beta=mwcReadPseudoRandomValue(r);
 806           alpha*=beta;
 807         }
 808         noise=(QuantumRange*i/SigmaPoisson);
 809         break;
 810       }
 811       case RandomNoise:
 812       {
 813         noise=(QuantumRange*SigmaRandom*alpha);
 814         break;
 815       }
 816     }
 817     return noise;
 818   }
 819
 820   __kernel
 821   void AddNoise(const __global CLQuantum *image,
 822     const unsigned int number_channels,const ChannelType channel,
 823     const unsigned int length,const unsigned int pixelsPerWorkItem,
 824     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
 825     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
 826     __global CLQuantum *filteredImage)
 827   {
 828     mwc64x_state_t rng;
 829     rng.x = seed0;
 830     rng.c = seed1;
 831
 832     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
 833     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
 834     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
 835
 836     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
 837     uint count = pixelsPerWorkItem;
 838
 839     while (count > 0 && pos < length)
 840     {
 841       const __global CLQuantum *p = image + pos;
 842       __global CLQuantum *q = filteredImage + pos;
 843
 844       float red;
 845       float green;
 846       float blue;
 847       float alpha;
 848
 849       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
 850
 851       if ((channel & RedChannel) != 0)
 852         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
 853
 854       if (number_channels > 2)
 855       {
 856         if ((channel & GreenChannel) != 0)
 857           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
 858
 859         if ((channel & BlueChannel) != 0)
 860           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
 861       }
 862
 863       if (((number_channels == 4) || (number_channels == 2)) &&
 864           ((channel & AlphaChannel) != 0))
 865         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
 866
 867       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
 868
 869       pos += (get_local_size(0) * number_channels);
 870       count--;
 871     }
 872   }
 873   )
 874
 875 /*
 876 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 877 %                                                                             %
 878 %                                                                             %
 879 %                                                                             %
 880 %    B l u r                                                                  %
 881 %                                                                             %
 882 %                                                                             %
 883 %                                                                             %
 884 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 885 */
 886
 887   STRINGIFY(
 888   /*
 889   Reduce image noise and reduce detail levels by row
 890   */
 891   __kernel void BlurRow(const __global CLQuantum *image,
 892     const unsigned int number_channels,const ChannelType channel,
 893     __constant float *filter,const unsigned int width,
 894     const unsigned int imageColumns,const unsigned int imageRows,
 895     __local float4 *temp,__global float4 *tempImage)
 896   {
 897     const int x = get_global_id(0);
 898     const int y = get_global_id(1);
 899
 900     const int columns = imageColumns;
 901
 902     const unsigned int radius = (width-1)/2;
 903     const int wsize = get_local_size(0);
 904     const unsigned int loadSize = wsize+width;
 905
 906     //group coordinate
 907     const int groupX=get_local_size(0)*get_group_id(0);
 908
 909     //parallel load and clamp
 910     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
 911     {
 912       int cx = ClampToCanvas(i + groupX - radius, columns);
 913       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
 914     }
 915
 916     // barrier
 917     barrier(CLK_LOCAL_MEM_FENCE);
 918
 919     // only do the work if this is not a patched item
 920     if (get_global_id(0) < columns)
 921     {
 922       // compute
 923       float4 result = (float4) 0;
 924
 925       int i = 0;
 926
 927       for ( ; i+7 < width; )
 928       {
 929         for (int j=0; j < 8; j++)
 930           result+=filter[i+j]*temp[i+j+get_local_id(0)];
 931         i+=8;
 932       }
 933
 934       for ( ; i < width; i++)
 935         result+=filter[i]*temp[i+get_local_id(0)];
 936
 937       // write back to global
 938       tempImage[y*columns+x] = result;
 939     }
 940   }
 941   )
 942
 943   STRINGIFY(
 944   /*
 945   Reduce image noise and reduce detail levels by line
 946   */
 947   __kernel void BlurColumn(const __global float4 *blurRowData,
 948     const unsigned int number_channels,const ChannelType channel,
 949     __constant float *filter,const unsigned int width,
 950     const unsigned int imageColumns,const unsigned int imageRows,
 951     __local float4 *temp,__global CLQuantum *filteredImage)
 952   {
 953     const int x = get_global_id(0);
 954     const int y = get_global_id(1);
 955
 956     const int columns = imageColumns;
 957     const int rows = imageRows;
 958
 959     unsigned int radius = (width-1)/2;
 960     const int wsize = get_local_size(1);
 961     const unsigned int loadSize = wsize+width;
 962
 963     //group coordinate
 964     const int groupX=get_local_size(0)*get_group_id(0);
 965     const int groupY=get_local_size(1)*get_group_id(1);
 966     //notice that get_local_size(0) is 1, so
 967     //groupX=get_group_id(0);
 968
 969     //parallel load and clamp
 970     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
 971       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
 972
 973     // barrier
 974     barrier(CLK_LOCAL_MEM_FENCE);
 975
 976     // only do the work if this is not a patched item
 977     if (get_global_id(1) < rows)
 978     {
 979       // compute
 980       float4 result = (float4) 0;
 981
 982       int i = 0;
 983
 984       for ( ; i+7 < width; )
 985       {
 986         for (int j=0; j < 8; j++)
 987           result+=filter[i+j]*temp[i+j+get_local_id(1)];
 988         i+=8;
 989       }
 990
 991       for ( ; i < width; i++)
 992         result+=filter[i]*temp[i+get_local_id(1)];
 993
 994       // write back to global
 995       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
 996     }
 997   }
 998   )
 999
1000 /*
1001 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1002 %                                                                             %
1003 %                                                                             %
1004 %                                                                             %
1005 %    C o n t r a s t                                                          %
1006 %                                                                             %
1007 %                                                                             %
1008 %                                                                             %
1009 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1010 */
1011
1012   STRINGIFY(
1013
1014   inline float4 ConvertRGBToHSB(const float4 pixel)
1015   {
1016     float4 result=0.0f;
1017     result.w=pixel.w;
1018     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1019     if (tmax != 0.0f)
1020     {
1021       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1022       float delta=tmax-tmin;
1023
1024       result.y=delta/tmax;
1025       result.z=QuantumScale*tmax;
1026       if (delta != 0.0f)
1027       {
1028         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1029         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1030           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1031         result.x/=6.0f;
1032         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1033       }
1034     }
1035     return(result);
1036   }
1037
1038   inline float4 ConvertHSBToRGB(const float4 pixel)
1039   {
1040     float hue=pixel.x;
1041     float saturation=pixel.y;
1042     float brightness=pixel.z;
1043
1044     float4 result=pixel;
1045
1046     if (saturation == 0.0f)
1047     {
1048       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1049     }
1050     else
1051     {
1052       float h=6.0f*(hue-floor(hue));
1053       float f=h-floor(h);
1054       float p=brightness*(1.0f-saturation);
1055       float q=brightness*(1.0f-saturation*f);
1056       float t=brightness*(1.0f-(saturation*(1.0f-f)));
1057       int ih = (int)h;
1058
1059       if (ih == 1)
1060       {
1061         result.x=ClampToQuantum(QuantumRange*q);
1062         result.y=ClampToQuantum(QuantumRange*brightness);
1063         result.z=ClampToQuantum(QuantumRange*p);
1064       }
1065       else if (ih == 2)
1066       {
1067         result.x=ClampToQuantum(QuantumRange*p);
1068         result.y=ClampToQuantum(QuantumRange*brightness);
1069         result.z=ClampToQuantum(QuantumRange*t);
1070       }
1071       else if (ih == 3)
1072       {
1073         result.x=ClampToQuantum(QuantumRange*p);
1074         result.y=ClampToQuantum(QuantumRange*q);
1075         result.z=ClampToQuantum(QuantumRange*brightness);
1076       }
1077       else if (ih == 4)
1078       {
1079         result.x=ClampToQuantum(QuantumRange*t);
1080         result.y=ClampToQuantum(QuantumRange*p);
1081         result.z=ClampToQuantum(QuantumRange*brightness);
1082       }
1083       else if (ih == 5)
1084       {
1085         result.x=ClampToQuantum(QuantumRange*brightness);
1086         result.y=ClampToQuantum(QuantumRange*p);
1087         result.z=ClampToQuantum(QuantumRange*q);
1088       }
1089       else
1090       {
1091         result.x=ClampToQuantum(QuantumRange*brightness);
1092         result.y=ClampToQuantum(QuantumRange*t);
1093         result.z=ClampToQuantum(QuantumRange*p);
1094       }
1095     }
1096     return(result);
1097   }
1098
1099   __kernel void Contrast(__global CLQuantum *image,
1100     const unsigned int number_channels,const int sign)
1101   {
1102     const int x=get_global_id(0);
1103     const int y=get_global_id(1);
1104     const unsigned int columns=get_global_size(0);
1105
1106     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1107     if (number_channels < 3)
1108       pixel.y=pixel.z=pixel.x;
1109
1110     pixel=ConvertRGBToHSB(pixel);
1111     float brightness=pixel.z;
1112     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1113     brightness=clamp(brightness,0.0f,1.0f);
1114     pixel.z=brightness;
1115     pixel=ConvertHSBToRGB(pixel);
1116
1117     WriteAllChannels(image,number_channels,columns,x,y,pixel);
1118   }
1119   )
1120
1121 /*
1122 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1123 %                                                                             %
1124 %                                                                             %
1125 %                                                                             %
1126 %    C o n t r a s t S t r e t c h                                            %
1127 %                                                                             %
1128 %                                                                             %
1129 %                                                                             %
1130 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1131 */
1132
1133     STRINGIFY(
1134     /*
1135     */
1136     __kernel void Histogram(__global CLPixelType * restrict im,
1137       const ChannelType channel,
1138       const unsigned int colorspace,
1139       const unsigned int method,
1140       __global uint4 * restrict histogram)
1141       {
1142         const int x = get_global_id(0);
1143         const int y = get_global_id(1);
1144         const int columns = get_global_size(0);
1145         const int c = x + y * columns;
1146         if ((channel & SyncChannels) != 0)
1147         {
1148           float red=(float)getRed(im[c]);
1149           float green=(float)getGreen(im[c]);
1150           float blue=(float)getBlue(im[c]);
1151
1152           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1153           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1154           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1155         }
1156         else
1157         {
1158           // for equalizing, we always need all channels?
1159           // otherwise something more
1160         }
1161       }
1162     )
1163
1164     STRINGIFY(
1165     /*
1166     */
1167     __kernel void ContrastStretch(__global CLPixelType * restrict im,
1168       const ChannelType channel,
1169       __global CLPixelType * restrict stretch_map,
1170       const float4 white, const float4 black)
1171       {
1172         const int x = get_global_id(0);
1173         const int y = get_global_id(1);
1174         const int columns = get_global_size(0);
1175         const int c = x + y * columns;
1176
1177         uint ePos;
1178         CLPixelType oValue, eValue;
1179         CLQuantum red, green, blue, alpha;
1180
1181         //read from global
1182         oValue=im[c];
1183
1184         if ((channel & RedChannel) != 0)
1185         {
1186           if (getRedF4(white) != getRedF4(black))
1187           {
1188             ePos = ScaleQuantumToMap(getRed(oValue));
1189             eValue = stretch_map[ePos];
1190             red = getRed(eValue);
1191           }
1192         }
1193
1194         if ((channel & GreenChannel) != 0)
1195         {
1196           if (getGreenF4(white) != getGreenF4(black))
1197           {
1198             ePos = ScaleQuantumToMap(getGreen(oValue));
1199             eValue = stretch_map[ePos];
1200             green = getGreen(eValue);
1201           }
1202         }
1203
1204         if ((channel & BlueChannel) != 0)
1205         {
1206           if (getBlueF4(white) != getBlueF4(black))
1207           {
1208             ePos = ScaleQuantumToMap(getBlue(oValue));
1209             eValue = stretch_map[ePos];
1210             blue = getBlue(eValue);
1211           }
1212         }
1213
1214         if ((channel & AlphaChannel) != 0)
1215         {
1216           if (getAlphaF4(white) != getAlphaF4(black))
1217           {
1218             ePos = ScaleQuantumToMap(getAlpha(oValue));
1219             eValue = stretch_map[ePos];
1220             alpha = getAlpha(eValue);
1221           }
1222         }
1223
1224         //write back
1225         im[c]=(CLPixelType)(blue, green, red, alpha);
1226
1227       }
1228     )
1229
1230 /*
1231 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1232 %                                                                             %
1233 %                                                                             %
1234 %                                                                             %
1235 %    C o n v o l v e                                                          %
1236 %                                                                             %
1237 %                                                                             %
1238 %                                                                             %
1239 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1240 */
1241
1242   STRINGIFY(
1243     __kernel
1244     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1245     const unsigned int imageWidth, const unsigned int imageHeight,
1246     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1247     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1248
1249       int2 blockID;
1250       blockID.x = get_global_id(0) / get_local_size(0);
1251       blockID.y = get_global_id(1) / get_local_size(1);
1252
1253       // image area processed by this workgroup
1254       int2 imageAreaOrg;
1255       imageAreaOrg.x = blockID.x * get_local_size(0);
1256       imageAreaOrg.y = blockID.y * get_local_size(1);
1257
1258       int2 midFilterDimen;
1259       midFilterDimen.x = (filterWidth-1)/2;
1260       midFilterDimen.y = (filterHeight-1)/2;
1261
1262       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1263
1264       // dimension of the local cache
1265       int2 cachedAreaDimen;
1266       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1267       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1268
1269       // cache the pixels accessed by this workgroup in local memory
1270       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1271       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1272       int groupSize = get_local_size(0) * get_local_size(1);
1273       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1274
1275         int2 cachedAreaIndex;
1276         cachedAreaIndex.x = i % cachedAreaDimen.x;
1277         cachedAreaIndex.y = i / cachedAreaDimen.x;
1278
1279         int2 imagePixelIndex;
1280         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1281
1282         // only support EdgeVirtualPixelMethod through ClampToCanvas
1283         // TODO: implement other virtual pixel method
1284         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1285         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1286
1287         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1288       }
1289
1290       // cache the filter
1291       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1292         filterCache[i] = filter[i];
1293       }
1294       barrier(CLK_LOCAL_MEM_FENCE);
1295
1296
1297       int2 imageIndex;
1298       imageIndex.x = imageAreaOrg.x + get_local_id(0);
1299       imageIndex.y = imageAreaOrg.y + get_local_id(1);
1300
1301       // if out-of-range, stops here and quit
1302       if (imageIndex.x >= imageWidth
1303         || imageIndex.y >= imageHeight) {
1304           return;
1305       }
1306
1307       int filterIndex = 0;
1308       float4 sum = (float4)0.0f;
1309       float gamma = 0.0f;
1310       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1311         int cacheIndexY = get_local_id(1);
1312         for (int j = 0; j < filterHeight; j++) {
1313           int cacheIndexX = get_local_id(0);
1314           for (int i = 0; i < filterWidth; i++) {
1315             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1316             float f = filterCache[filterIndex];
1317
1318             sum.x += f * p.x;
1319             sum.y += f * p.y;
1320             sum.z += f * p.z;
1321             sum.w += f * p.w;
1322
1323             gamma += f;
1324             filterIndex++;
1325             cacheIndexX++;
1326           }
1327           cacheIndexY++;
1328         }
1329       }
1330       else {
1331         int cacheIndexY = get_local_id(1);
1332         for (int j = 0; j < filterHeight; j++) {
1333           int cacheIndexX = get_local_id(0);
1334           for (int i = 0; i < filterWidth; i++) {
1335
1336             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1337             float alpha = QuantumScale*p.w;
1338             float f = filterCache[filterIndex];
1339             float g = alpha * f;
1340
1341             sum.x += g*p.x;
1342             sum.y += g*p.y;
1343             sum.z += g*p.z;
1344             sum.w += f*p.w;
1345
1346             gamma += g;
1347             filterIndex++;
1348             cacheIndexX++;
1349           }
1350           cacheIndexY++;
1351         }
1352         gamma = PerceptibleReciprocal(gamma);
1353         sum.xyz = gamma*sum.xyz;
1354       }
1355       CLPixelType outputPixel;
1356       outputPixel.x = ClampToQuantum(sum.x);
1357       outputPixel.y = ClampToQuantum(sum.y);
1358       outputPixel.z = ClampToQuantum(sum.z);
1359       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1360
1361       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1362     }
1363   )
1364
1365   STRINGIFY(
1366     __kernel
1367     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1368                   const uint imageWidth, const uint imageHeight,
1369                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1370                   const uint matte, const ChannelType channel) {
1371
1372       int2 imageIndex;
1373       imageIndex.x = get_global_id(0);
1374       imageIndex.y = get_global_id(1);
1375
1376       /*
1377       unsigned int imageWidth = get_global_size(0);
1378       unsigned int imageHeight = get_global_size(1);
1379       */
1380       if (imageIndex.x >= imageWidth
1381           || imageIndex.y >= imageHeight)
1382           return;
1383
1384       int2 midFilterDimen;
1385       midFilterDimen.x = (filterWidth-1)/2;
1386       midFilterDimen.y = (filterHeight-1)/2;
1387
1388       int filterIndex = 0;
1389       float4 sum = (float4)0.0f;
1390       float gamma = 0.0f;
1391       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1392         for (int j = 0; j < filterHeight; j++) {
1393           int2 inputPixelIndex;
1394           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1395           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1396           for (int i = 0; i < filterWidth; i++) {
1397             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1398             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1399
1400             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1401             float f = filter[filterIndex];
1402
1403             sum.x += f * p.x;
1404             sum.y += f * p.y;
1405             sum.z += f * p.z;
1406             sum.w += f * p.w;
1407
1408             gamma += f;
1409
1410             filterIndex++;
1411           }
1412         }
1413       }
1414       else {
1415
1416         for (int j = 0; j < filterHeight; j++) {
1417           int2 inputPixelIndex;
1418           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1419           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1420           for (int i = 0; i < filterWidth; i++) {
1421             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1422             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1423
1424             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1425             float alpha = QuantumScale*p.w;
1426             float f = filter[filterIndex];
1427             float g = alpha * f;
1428
1429             sum.x += g*p.x;
1430             sum.y += g*p.y;
1431             sum.z += g*p.z;
1432             sum.w += f*p.w;
1433
1434             gamma += g;
1435
1436
1437             filterIndex++;
1438           }
1439         }
1440         gamma = PerceptibleReciprocal(gamma);
1441         sum.xyz = gamma*sum.xyz;
1442       }
1443
1444       CLPixelType outputPixel;
1445       outputPixel.x = ClampToQuantum(sum.x);
1446       outputPixel.y = ClampToQuantum(sum.y);
1447       outputPixel.z = ClampToQuantum(sum.z);
1448       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1449
1450       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1451     }
1452   )
1453
1454 /*
1455 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1456 %                                                                             %
1457 %                                                                             %
1458 %                                                                             %
1459 %     D e s p e c k l e                                                       %
1460 %                                                                             %
1461 %                                                                             %
1462 %                                                                             %
1463 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1464 */
1465
1466   STRINGIFY(
1467
1468   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1469   , const unsigned int imageWidth, const unsigned int imageHeight
1470   , const int2 offset, const int polarity, const int matte) {
1471
1472     int x = get_global_id(0);
1473     int y = get_global_id(1);
1474
1475     CLPixelType v = inputImage[y*imageWidth+x];
1476
1477     int2 neighbor;
1478     neighbor.y = y + offset.y;
1479     neighbor.x = x + offset.x;
1480
1481     int2 clampedNeighbor;
1482     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1483     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1484
1485     CLPixelType r = (clampedNeighbor.x == neighbor.x
1486                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1487     :(CLPixelType)0;
1488
1489     int sv[4];
1490     sv[0] = (int)v.x;
1491     sv[1] = (int)v.y;
1492     sv[2] = (int)v.z;
1493     sv[3] = (int)v.w;
1494
1495     int sr[4];
1496     sr[0] = (int)r.x;
1497     sr[1] = (int)r.y;
1498     sr[2] = (int)r.z;
1499     sr[3] = (int)r.w;
1500
1501     if (polarity > 0) {
1502       \n #pragma unroll 4\n
1503       for (unsigned int i = 0; i < 4; i++) {
1504         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1505       }
1506     }
1507     else {
1508       \n #pragma unroll 4\n
1509       for (unsigned int i = 0; i < 4; i++) {
1510         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1511       }
1512
1513     }
1514
1515     v.x = (CLQuantum)sv[0];
1516     v.y = (CLQuantum)sv[1];
1517     v.z = (CLQuantum)sv[2];
1518
1519     if (matte!=0)
1520       v.w = (CLQuantum)sv[3];
1521
1522     outputImage[y*imageWidth+x] = v;
1523
1524     }
1525
1526
1527   )
1528
1529   STRINGIFY(
1530
1531   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1532   , const unsigned int imageWidth, const unsigned int imageHeight
1533   , const int2 offset, const int polarity, const int matte) {
1534
1535     int x = get_global_id(0);
1536     int y = get_global_id(1);
1537
1538     CLPixelType v = inputImage[y*imageWidth+x];
1539
1540     int2 neighbor, clampedNeighbor;
1541
1542     neighbor.y = y + offset.y;
1543     neighbor.x = x + offset.x;
1544     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1545     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1546
1547     CLPixelType r = (clampedNeighbor.x == neighbor.x
1548       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1549     :(CLPixelType)0;
1550
1551
1552     neighbor.y = y - offset.y;
1553     neighbor.x = x - offset.x;
1554     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1555     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1556
1557     CLPixelType s = (clampedNeighbor.x == neighbor.x
1558       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1559     :(CLPixelType)0;
1560
1561
1562     int sv[4];
1563     sv[0] = (int)v.x;
1564     sv[1] = (int)v.y;
1565     sv[2] = (int)v.z;
1566     sv[3] = (int)v.w;
1567
1568     int sr[4];
1569     sr[0] = (int)r.x;
1570     sr[1] = (int)r.y;
1571     sr[2] = (int)r.z;
1572     sr[3] = (int)r.w;
1573
1574     int ss[4];
1575     ss[0] = (int)s.x;
1576     ss[1] = (int)s.y;
1577     ss[2] = (int)s.z;
1578     ss[3] = (int)s.w;
1579
1580     if (polarity > 0) {
1581       \n #pragma unroll 4\n
1582       for (unsigned int i = 0; i < 4; i++) {
1583         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1584         //
1585         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1586         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1587         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1588       }
1589     }
1590     else {
1591       \n #pragma unroll 4\n
1592       for (unsigned int i = 0; i < 4; i++) {
1593         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1594         //
1595         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1596         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1597       }
1598     }
1599
1600     v.x = (CLQuantum)sv[0];
1601     v.y = (CLQuantum)sv[1];
1602     v.z = (CLQuantum)sv[2];
1603
1604     if (matte!=0)
1605       v.w = (CLQuantum)sv[3];
1606
1607     outputImage[y*imageWidth+x] = v;
1608
1609     }
1610   )
1611
1612 /*
1613 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1614 %                                                                             %
1615 %                                                                             %
1616 %                                                                             %
1617 %     E q u a l i z e                                                         %
1618 %                                                                             %
1619 %                                                                             %
1620 %                                                                             %
1621 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1622 */
1623
1624     STRINGIFY(
1625     /*
1626     */
1627     __kernel void Equalize(__global CLPixelType * restrict im,
1628       const ChannelType channel,
1629       __global CLPixelType * restrict equalize_map,
1630       const float4 white, const float4 black)
1631       {
1632         const int x = get_global_id(0);
1633         const int y = get_global_id(1);
1634         const int columns = get_global_size(0);
1635         const int c = x + y * columns;
1636
1637         uint ePos;
1638         CLPixelType oValue, eValue;
1639         CLQuantum red, green, blue, alpha;
1640
1641         //read from global
1642         oValue=im[c];
1643
1644         if ((channel & SyncChannels) != 0)
1645         {
1646           if (getRedF4(white) != getRedF4(black))
1647           {
1648             ePos = ScaleQuantumToMap(getRed(oValue));
1649             eValue = equalize_map[ePos];
1650             red = getRed(eValue);
1651             ePos = ScaleQuantumToMap(getGreen(oValue));
1652             eValue = equalize_map[ePos];
1653             green = getRed(eValue);
1654             ePos = ScaleQuantumToMap(getBlue(oValue));
1655             eValue = equalize_map[ePos];
1656             blue = getRed(eValue);
1657             ePos = ScaleQuantumToMap(getAlpha(oValue));
1658             eValue = equalize_map[ePos];
1659             alpha = getRed(eValue);
1660
1661             //write back
1662             im[c]=(CLPixelType)(blue, green, red, alpha);
1663           }
1664
1665         }
1666
1667         // for equalizing, we always need all channels?
1668         // otherwise something more
1669
1670      }
1671     )
1672
1673 /*
1674 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1675 %                                                                             %
1676 %                                                                             %
1677 %                                                                             %
1678 %     F u n c t i o n                                                         %
1679 %                                                                             %
1680 %                                                                             %
1681 %                                                                             %
1682 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1683 */
1684
1685   STRINGIFY(
1686   /*
1687   apply FunctionImageChannel(braightness-contrast)
1688   */
1689   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1690     const unsigned int number_parameters,__constant float *parameters)
1691   {
1692     float result = 0.0f;
1693
1694     switch (function)
1695     {
1696     case PolynomialFunction:
1697       {
1698         for (unsigned int i=0; i < number_parameters; i++)
1699           result = result*QuantumScale*pixel + parameters[i];
1700         result *= QuantumRange;
1701         break;
1702       }
1703     case SinusoidFunction:
1704       {
1705         float  freq,phase,ampl,bias;
1706         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1707         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1708         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1709         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1710         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1711           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1712         break;
1713       }
1714     case ArcsinFunction:
1715       {
1716         float  width,range,center,bias;
1717         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1718         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1719         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1720         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1721
1722         result = 2.0f/width*(QuantumScale*pixel - center);
1723         result = range/MagickPI*asin(result)+bias;
1724         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1725         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1726         result *= QuantumRange;
1727         break;
1728       }
1729     case ArctanFunction:
1730       {
1731         float slope,range,center,bias;
1732         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1733         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1734         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1735         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1736         result = MagickPI*slope*(QuantumScale*pixel-center);
1737         result = QuantumRange*(range/MagickPI*atan(result) + bias);
1738         break;
1739       }
1740     case UndefinedFunction:
1741       break;
1742     }
1743     return(ClampToQuantum(result));
1744   }
1745   )
1746
1747   STRINGIFY(
1748   /*
1749   Improve brightness / contrast of the image
1750   channel : define which channel is improved
1751   function : the function called to enchance the brightness contrast
1752   number_parameters : numbers of parameters
1753   parameters : the parameter
1754   */
1755   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1756     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1757     __constant float *parameters)
1758   {
1759     const unsigned int x = get_global_id(0);
1760     const unsigned int y = get_global_id(1);
1761     const unsigned int columns = get_global_size(0);
1762     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1763
1764     float red;
1765     float green;
1766     float blue;
1767     float alpha;
1768
1769     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1770
1771     if ((channel & RedChannel) != 0)
1772       red=ApplyFunction(red, function, number_parameters, parameters);
1773
1774     if (number_channels > 2)
1775       {
1776         if ((channel & GreenChannel) != 0)
1777           green=ApplyFunction(green, function, number_parameters, parameters);
1778
1779         if ((channel & BlueChannel) != 0)
1780           blue=ApplyFunction(blue, function, number_parameters, parameters);
1781       }
1782
1783     if (((number_channels == 4) || (number_channels == 2)) &&
1784         ((channel & AlphaChannel) != 0))
1785       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1786
1787     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1788   }
1789   )
1790
1791 /*
1792 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1793 %                                                                             %
1794 %                                                                             %
1795 %                                                                             %
1796 %     G r a y s c a l e                                                       %
1797 %                                                                             %
1798 %                                                                             %
1799 %                                                                             %
1800 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1801 */
1802
1803   STRINGIFY(
1804   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1805     const unsigned int colorspace,const unsigned int method)
1806   {
1807     const unsigned int x = get_global_id(0);
1808     const unsigned int y = get_global_id(1);
1809     const unsigned int columns = get_global_size(0);
1810     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1811
1812     float
1813       blue,
1814       green,
1815       red;
1816
1817     red=getPixelRed(p);
1818     green=getPixelGreen(p);
1819     blue=getPixelBlue(p);
1820
1821     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1822
1823     setPixelRed(p,intensity);
1824     setPixelGreen(p,intensity);
1825     setPixelBlue(p,intensity);
1826   }
1827   )
1828
1829 /*
1830 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1831 %                                                                             %
1832 %                                                                             %
1833 %                                                                             %
1834 %     L o c a l C o n t r a s t                                               %
1835 %                                                                             %
1836 %                                                                             %
1837 %                                                                             %
1838 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1839 */
1840
1841     STRINGIFY(
1842
1843       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1844           const int radius,
1845           const int imageWidth,
1846           const int imageHeight)
1847       {
1848         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1849
1850         int x = get_local_id(0);
1851         int y = get_global_id(1);
1852
1853         if ((x >= imageWidth) || (y >= imageHeight))
1854           return;
1855
1856         global CLPixelType *src = srcImage + y * imageWidth;
1857
1858         for (int i = x; i < imageWidth; i += get_local_size(0)) {
1859             float sum = 0.0f;
1860             float weight = 1.0f;
1861
1862             int j = i - radius;
1863             while ((j + 7) < i) {
1864                 for (int k = 0; k < 8; ++k) // Unroll 8x
1865                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1866                 weight += 8.0f;
1867                 j+=8;
1868             }
1869             while (j < i) {
1870                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1871                 weight += 1.0f;
1872                 ++j;
1873             }
1874
1875             while ((j + 7) < radius + i) {
1876                 for (int k = 0; k < 8; ++k) // Unroll 8x
1877                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1878                 weight -= 8.0f;
1879                 j+=8;
1880             }
1881             while (j < radius + i) {
1882                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1883                 weight -= 1.0f;
1884                 ++j;
1885             }
1886
1887             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1888         }
1889       }
1890     )
1891
1892     STRINGIFY(
1893       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1894           const int radius,
1895           const float strength,
1896           const int imageWidth,
1897           const int imageHeight)
1898       {
1899         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1900
1901         int x = get_global_id(0);
1902         int y = get_global_id(1);
1903
1904         if ((x >= imageWidth) || (y >= imageHeight))
1905                 return;
1906
1907         global float *src = blurImage + x;
1908
1909         float sum = 0.0f;
1910         float weight = 1.0f;
1911
1912         int j = y - radius;
1913         while ((j + 7) < y) {
1914             for (int k = 0; k < 8; ++k) // Unroll 8x
1915                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1916             weight += 8.0f;
1917             j+=8;
1918         }
1919         while (j < y) {
1920             sum += weight * src[mirrorBottom(j) * imageWidth];
1921             weight += 1.0f;
1922             ++j;
1923         }
1924
1925         while ((j + 7) < radius + y) {
1926             for (int k = 0; k < 8; ++k) // Unroll 8x
1927                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1928             weight -= 8.0f;
1929             j+=8;
1930         }
1931         while (j < radius + y) {
1932             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1933             weight -= 1.0f;
1934             ++j;
1935         }
1936
1937         CLPixelType pixel = srcImage[x + y * imageWidth];
1938         float srcVal = dot(RGB, convert_float4(pixel));
1939         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1940         mult = (srcVal + mult) / srcVal;
1941
1942         pixel.x = ClampToQuantum(pixel.x * mult);
1943         pixel.y = ClampToQuantum(pixel.y * mult);
1944         pixel.z = ClampToQuantum(pixel.z * mult);
1945
1946         dstImage[x + y * imageWidth] = pixel;
1947       }
1948     )
1949
1950 /*
1951 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1952 %                                                                             %
1953 %                                                                             %
1954 %                                                                             %
1955 %     M o d u l a t e                                                         %
1956 %                                                                             %
1957 %                                                                             %
1958 %                                                                             %
1959 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1960 */
1961
1962   STRINGIFY(
1963
1964   inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1965     float *hue, float *saturation, float *lightness)
1966   {
1967   float
1968     c,
1969     tmax,
1970     tmin;
1971
1972   /*
1973      Convert RGB to HSL colorspace.
1974      */
1975   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1976   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1977
1978   c=tmax-tmin;
1979
1980   *lightness=(tmax+tmin)/2.0;
1981   if (c <= 0.0)
1982   {
1983     *hue=0.0;
1984     *saturation=0.0;
1985     return;
1986   }
1987
1988   if (tmax == (QuantumScale*red))
1989   {
1990     *hue=(QuantumScale*green-QuantumScale*blue)/c;
1991     if ((QuantumScale*green) < (QuantumScale*blue))
1992       *hue+=6.0;
1993   }
1994   else
1995     if (tmax == (QuantumScale*green))
1996       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1997     else
1998       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1999
2000   *hue*=60.0/360.0;
2001   if (*lightness <= 0.5)
2002     *saturation=c/(2.0*(*lightness));
2003   else
2004     *saturation=c/(2.0-2.0*(*lightness));
2005   }
2006
2007   inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2008       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2009   {
2010     float
2011       b,
2012       c,
2013       g,
2014       h,
2015       tmin,
2016       r,
2017       x;
2018
2019     /*
2020        Convert HSL to RGB colorspace.
2021        */
2022     h=hue*360.0;
2023     if (lightness <= 0.5)
2024       c=2.0*lightness*saturation;
2025     else
2026       c=(2.0-2.0*lightness)*saturation;
2027     tmin=lightness-0.5*c;
2028     h-=360.0*floor(h/360.0);
2029     h/=60.0;
2030     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2031     switch ((int) floor(h) % 6)
2032     {
2033       case 0:
2034         {
2035           r=tmin+c;
2036           g=tmin+x;
2037           b=tmin;
2038           break;
2039         }
2040       case 1:
2041         {
2042           r=tmin+x;
2043           g=tmin+c;
2044           b=tmin;
2045           break;
2046         }
2047       case 2:
2048         {
2049           r=tmin;
2050           g=tmin+c;
2051           b=tmin+x;
2052           break;
2053         }
2054       case 3:
2055         {
2056           r=tmin;
2057           g=tmin+x;
2058           b=tmin+c;
2059           break;
2060         }
2061       case 4:
2062         {
2063           r=tmin+x;
2064           g=tmin;
2065           b=tmin+c;
2066           break;
2067         }
2068       case 5:
2069         {
2070           r=tmin+c;
2071           g=tmin;
2072           b=tmin+x;
2073           break;
2074         }
2075       default:
2076         {
2077           r=0.0;
2078           g=0.0;
2079           b=0.0;
2080         }
2081     }
2082     *red=ClampToQuantum(QuantumRange*r);
2083     *green=ClampToQuantum(QuantumRange*g);
2084     *blue=ClampToQuantum(QuantumRange*b);
2085   }
2086
2087   inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2088     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2089   {
2090     float
2091       hue,
2092       lightness,
2093       saturation;
2094
2095     /*
2096     Increase or decrease color lightness, saturation, or hue.
2097     */
2098     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2099     hue+=0.5*(0.01*percent_hue-1.0);
2100     while (hue < 0.0)
2101       hue+=1.0;
2102     while (hue >= 1.0)
2103       hue-=1.0;
2104     saturation*=0.01*percent_saturation;
2105     lightness*=0.01*percent_lightness;
2106     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2107   }
2108
2109   __kernel void Modulate(__global CLPixelType *im,
2110     const float percent_brightness,
2111     const float percent_hue,
2112     const float percent_saturation,
2113     const int colorspace)
2114   {
2115
2116     const int x = get_global_id(0);
2117     const int y = get_global_id(1);
2118     const int columns = get_global_size(0);
2119     const int c = x + y * columns;
2120
2121     CLPixelType pixel = im[c];
2122
2123     CLQuantum
2124         blue,
2125         green,
2126         red;
2127
2128     red=getRed(pixel);
2129     green=getGreen(pixel);
2130     blue=getBlue(pixel);
2131
2132     switch (colorspace)
2133     {
2134       case HSLColorspace:
2135       default:
2136         {
2137           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2138               &red, &green, &blue);
2139         }
2140
2141     }
2142
2143     CLPixelType filteredPixel;
2144
2145     setRed(&filteredPixel, red);
2146     setGreen(&filteredPixel, green);
2147     setBlue(&filteredPixel, blue);
2148     filteredPixel.w = pixel.w;
2149
2150     im[c] = filteredPixel;
2151   }
2152   )
2153
2154 /*
2155 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2156 %                                                                             %
2157 %                                                                             %
2158 %                                                                             %
2159 %     M o t i o n B l u r                                                     %
2160 %                                                                             %
2161 %                                                                             %
2162 %                                                                             %
2163 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2164 */
2165
2166   STRINGIFY(
2167     __kernel
2168     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2169                     const unsigned int imageWidth, const unsigned int imageHeight,
2170                     const __global float *filter, const unsigned int width, const __global int2* offset,
2171                     const float4 bias,
2172                     const ChannelType channel, const unsigned int matte) {
2173
2174       int2 currentPixel;
2175       currentPixel.x = get_global_id(0);
2176       currentPixel.y = get_global_id(1);
2177
2178       if (currentPixel.x >= imageWidth
2179           || currentPixel.y >= imageHeight)
2180           return;
2181
2182       float4 pixel;
2183       pixel.x = (float)bias.x;
2184       pixel.y = (float)bias.y;
2185       pixel.z = (float)bias.z;
2186       pixel.w = (float)bias.w;
2187
2188       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2189
2190         for (int i = 0; i < width; i++) {
2191           // only support EdgeVirtualPixelMethod through ClampToCanvas
2192           // TODO: implement other virtual pixel method
2193           int2 samplePixel = currentPixel + offset[i];
2194           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2195           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2196           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2197
2198           pixel.x += (filter[i] * (float)samplePixelValue.x);
2199           pixel.y += (filter[i] * (float)samplePixelValue.y);
2200           pixel.z += (filter[i] * (float)samplePixelValue.z);
2201           pixel.w += (filter[i] * (float)samplePixelValue.w);
2202         }
2203
2204         CLPixelType outputPixel;
2205         outputPixel.x = ClampToQuantum(pixel.x);
2206         outputPixel.y = ClampToQuantum(pixel.y);
2207         outputPixel.z = ClampToQuantum(pixel.z);
2208         outputPixel.w = ClampToQuantum(pixel.w);
2209         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2210       }
2211       else {
2212
2213         float gamma = 0.0f;
2214         for (int i = 0; i < width; i++) {
2215           // only support EdgeVirtualPixelMethod through ClampToCanvas
2216           // TODO: implement other virtual pixel method
2217           int2 samplePixel = currentPixel + offset[i];
2218           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2219           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2220
2221           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2222
2223           float alpha = QuantumScale*samplePixelValue.w;
2224           float k = filter[i];
2225           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2226           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2227           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2228
2229           pixel.w += k * alpha * samplePixelValue.w;
2230
2231           gamma+=k*alpha;
2232         }
2233         gamma = PerceptibleReciprocal(gamma);
2234         pixel.xyz = gamma*pixel.xyz;
2235
2236         CLPixelType outputPixel;
2237         outputPixel.x = ClampToQuantum(pixel.x);
2238         outputPixel.y = ClampToQuantum(pixel.y);
2239         outputPixel.z = ClampToQuantum(pixel.z);
2240         outputPixel.w = ClampToQuantum(pixel.w);
2241         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2242       }
2243     }
2244   )
2245
2246 /*
2247 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2248 %                                                                             %
2249 %                                                                             %
2250 %                                                                             %
2251 %     R e s i z e                                                             %
2252 %                                                                             %
2253 %                                                                             %
2254 %                                                                             %
2255 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2256 */
2257
2258   STRINGIFY(
2259   // Based on Box from resize.c
2260   float BoxResizeFilter(const float x)
2261   {
2262     return 1.0f;
2263   }
2264   )
2265
2266   STRINGIFY(
2267   // Based on CubicBC from resize.c
2268   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2269   {
2270     /*
2271     Cubic Filters using B,C determined values:
2272     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2273     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2274     Spline              B = 1   C = 0    B-Spline Gaussian approximation
2275     Hermite             B = 0   C = 0    B-Spline interpolator
2276
2277     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2278     Graphics Computer Graphics, Volume 22, Number 4, August 1988
2279     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2280     Mitchell.pdf.
2281
2282     Coefficents are determined from B,C values:
2283     P0 = (  6 - 2*B       )/6 = coeff[0]
2284     P1 =         0
2285     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2286     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2287     Q0 = (      8*B +24*C )/6 = coeff[3]
2288     Q1 = (    -12*B -48*C )/6 = coeff[4]
2289     Q2 = (      6*B +30*C )/6 = coeff[5]
2290     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2291
2292     which are used to define the filter:
2293
2294     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2295     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2296
2297     which ensures function is continuous in value and derivative (slope).
2298     */
2299     if (x < 1.0)
2300       return(resizeFilterCoefficients[0]+x*(x*
2301       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2302     if (x < 2.0)
2303       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2304       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2305     return(0.0);
2306   }
2307   )
2308
2309   STRINGIFY(
2310   float Sinc(const float x)
2311   {
2312     if (x != 0.0f)
2313     {
2314       const float alpha=(float) (MagickPI*x);
2315       return sinpi(x)/alpha;
2316     }
2317     return(1.0f);
2318   }
2319   )
2320
2321   STRINGIFY(
2322   float Triangle(const float x)
2323   {
2324     /*
2325     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2326     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2327     for Sinc().
2328     */
2329     return ((x<1.0f)?(1.0f-x):0.0f);
2330   }
2331   )
2332
2333
2334   STRINGIFY(
2335   float Hann(const float x)
2336   {
2337     /*
2338     Cosine window function:
2339       0.5+0.5*cos(pi*x).
2340     */
2341     const float cosine=cos((MagickPI*x));
2342     return(0.5f+0.5f*cosine);
2343   }
2344   )
2345
2346   STRINGIFY(
2347   float Hamming(const float x)
2348   {
2349     /*
2350       Offset cosine window function:
2351        .54 + .46 cos(pi x).
2352     */
2353     const float cosine=cos((MagickPI*x));
2354     return(0.54f+0.46f*cosine);
2355   }
2356   )
2357
2358   STRINGIFY(
2359   float Blackman(const float x)
2360   {
2361     /*
2362       Blackman: 2nd order cosine windowing function:
2363         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2364
2365       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2366       five flops.
2367     */
2368     const float cosine=cos((MagickPI*x));
2369     return(0.34f+cosine*(0.5f+cosine*0.16f));
2370   }
2371   )
2372
2373   STRINGIFY(
2374   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2375   {
2376     switch (filterType)
2377     {
2378     /* Call Sinc even for SincFast to get better precision on GPU
2379        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2380     case SincWeightingFunction:
2381     case SincFastWeightingFunction:
2382       return Sinc(x);
2383     case CubicBCWeightingFunction:
2384       return CubicBC(x,filterCoefficients);
2385     case BoxWeightingFunction:
2386       return BoxResizeFilter(x);
2387     case TriangleWeightingFunction:
2388       return Triangle(x);
2389     case HannWeightingFunction:
2390       return Hann(x);
2391     case HammingWeightingFunction:
2392       return Hamming(x);
2393     case BlackmanWeightingFunction:
2394       return Blackman(x);
2395
2396     default:
2397       return 0.0f;
2398     }
2399   }
2400   )
2401
2402
2403   STRINGIFY(
2404   inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2405            , const ResizeWeightingFunctionType resizeWindowType
2406            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2407   {
2408     float scale;
2409     float xBlur = fabs(x/resizeFilterBlur);
2410     if (resizeWindowSupport < MagickEpsilon
2411         || resizeWindowType == BoxWeightingFunction)
2412     {
2413       scale = 1.0f;
2414     }
2415     else
2416     {
2417       scale = resizeFilterScale;
2418       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2419     }
2420     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2421     return weight;
2422   }
2423
2424   )
2425
2426   ;
2427   const char *accelerateKernels2 =
2428
2429   STRINGIFY(
2430
2431   inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2432     return (numWorkItems/pixelPerWorkgroup);
2433   }
2434
2435   // returns the index of the pixel for the current workitem to compute.
2436   // returns -1 if this workitem doesn't need to participate in any computation
2437   inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2438     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2439     int pixelIndex = itemID/numWorkItemsPerPixel;
2440     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2441     return pixelIndex;
2442   }
2443
2444   )
2445
2446   STRINGIFY(
2447   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2448     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2449       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2450       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2451       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2452       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2453       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2454       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2455       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2456   {
2457     // calculate the range of resized image pixels computed by this workgroup
2458     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2459     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2460     const unsigned int actualNumPixelToCompute = stopX - startX;
2461
2462     // calculate the range of input image pixels to cache
2463     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2464     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2465     scale = PerceptibleReciprocal(scale);
2466
2467     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2468     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2469
2470     // cache the input pixels into local memory
2471     const unsigned int y = get_global_id(1);
2472     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2473     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2474     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2475     wait_group_events(1, &e);
2476
2477     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2478     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2479     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2480     {
2481       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2482       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2483       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2484
2485       // determine which resized pixel computed by this workitem
2486       const unsigned int itemID = get_local_id(0);
2487       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2488
2489       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2490
2491       float4 filteredPixel = (float4)0.0f;
2492       float density = 0.0f;
2493       float gamma = 0.0f;
2494       // -1 means this workitem doesn't participate in the computation
2495       if (pixelIndex != -1)
2496       {
2497         // x coordinated of the resized pixel computed by this workitem
2498         const int x = chunkStartX + pixelIndex;
2499
2500         // calculate how many steps required for this pixel
2501         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2502         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2503         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2504         const unsigned int n = stop - start;
2505
2506         // calculate how many steps this workitem will contribute
2507         unsigned int numStepsPerWorkItem = n / numItems;
2508         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2509
2510         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2511         if (startStep < n)
2512         {
2513           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2514
2515           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2516           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2517           {
2518             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2519               (ResizeWeightingFunctionType) resizeFilterType,
2520               (ResizeWeightingFunctionType) resizeWindowType,
2521               resizeFilterScale, resizeFilterWindowSupport,
2522               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2523
2524             float4 cp = (float4)0.0f;
2525
2526             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2527             cp.x = (float) *(p);
2528             if (number_channels > 2)
2529             {
2530               cp.y = (float) *(p + 1);
2531               cp.z = (float) *(p + 2);
2532             }
2533
2534             if (alpha_index != 0)
2535             {
2536               cp.w = (float) *(p + alpha_index);
2537
2538               float alpha = weight * QuantumScale * cp.w;
2539
2540               filteredPixel.x += alpha * cp.x;
2541               filteredPixel.y += alpha * cp.y;
2542               filteredPixel.z += alpha * cp.z;
2543               filteredPixel.w += weight * cp.w;
2544               gamma += alpha;
2545             }
2546             else
2547               filteredPixel += ((float4) weight)*cp;
2548
2549             density += weight;
2550           }
2551         }
2552       }
2553
2554       // initialize the accumulators to zero
2555       if (itemID < actualNumPixelInThisChunk) {
2556         outputPixelCache[itemID] = (float4)0.0f;
2557         densityCache[itemID] = 0.0f;
2558         if (alpha_index != 0)
2559           gammaCache[itemID] = 0.0f;
2560       }
2561       barrier(CLK_LOCAL_MEM_FENCE);
2562
2563       // accumulatte the filtered pixel value and the density
2564       for (unsigned int i = 0; i < numItems; i++) {
2565         if (pixelIndex != -1) {
2566           if (itemID%numItems == i) {
2567             outputPixelCache[pixelIndex]+=filteredPixel;
2568             densityCache[pixelIndex]+=density;
2569             if (alpha_index != 0)
2570               gammaCache[pixelIndex]+=gamma;
2571           }
2572         }
2573         barrier(CLK_LOCAL_MEM_FENCE);
2574       }
2575
2576       if (itemID < actualNumPixelInThisChunk)
2577       {
2578         float4 filteredPixel = outputPixelCache[itemID];
2579
2580         float gamma = 0.0f;
2581         if (alpha_index != 0)
2582           gamma = gammaCache[itemID];
2583
2584         float density = densityCache[itemID];
2585         if ((density != 0.0f) && (density != 1.0f))
2586         {
2587           density = PerceptibleReciprocal(density);
2588           filteredPixel *= (float4) density;
2589           if (alpha_index != 0)
2590             gamma *= density;
2591         }
2592
2593         if (alpha_index != 0)
2594         {
2595           gamma = PerceptibleReciprocal(gamma);
2596           filteredPixel.x *= gamma;
2597           filteredPixel.y *= gamma;
2598           filteredPixel.z *= gamma;
2599         }
2600
2601         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2602       }
2603     }
2604   }
2605   )
2606
2607
2608   STRINGIFY(
2609  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2610     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2611       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2612       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2613       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2614       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2615       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2616       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2617       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2618   {
2619     // calculate the range of resized image pixels computed by this workgroup
2620     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2621     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2622     const unsigned int actualNumPixelToCompute = stopY - startY;
2623
2624     // calculate the range of input image pixels to cache
2625     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2626     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2627     scale = PerceptibleReciprocal(scale);
2628
2629     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2630     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2631
2632     // cache the input pixels into local memory
2633     const unsigned int x = get_global_id(0);
2634     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2635     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2636     unsigned int stride = inputColumns * number_channels;
2637     for (unsigned int i = 0; i < number_channels; i++)
2638     {
2639       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2640       wait_group_events(1,&e);
2641     }
2642
2643     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2644     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2645     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2646     {
2647       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2648       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2649       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2650
2651       // determine which resized pixel computed by this workitem
2652       const unsigned int itemID = get_local_id(1);
2653       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2654
2655       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2656
2657       float4 filteredPixel = (float4)0.0f;
2658       float density = 0.0f;
2659       float gamma = 0.0f;
2660       // -1 means this workitem doesn't participate in the computation
2661       if (pixelIndex != -1)
2662       {
2663         // x coordinated of the resized pixel computed by this workitem
2664         const int y = chunkStartY + pixelIndex;
2665
2666         // calculate how many steps required for this pixel
2667         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2668         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2669         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2670         const unsigned int n = stop - start;
2671
2672         // calculate how many steps this workitem will contribute
2673         unsigned int numStepsPerWorkItem = n / numItems;
2674         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2675
2676         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2677         if (startStep < n)
2678         {
2679           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2680
2681           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2682           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2683           {
2684             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2685               (ResizeWeightingFunctionType) resizeFilterType,
2686               (ResizeWeightingFunctionType) resizeWindowType,
2687               resizeFilterScale, resizeFilterWindowSupport,
2688               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2689
2690             float4 cp = (float4)0.0f;
2691
2692             __local CLQuantum *p = inputImageCache + cacheIndex;
2693             cp.x = (float) *(p);
2694             if (number_channels > 2)
2695             {
2696               cp.y = (float) *(p + rangeLength);
2697               cp.z = (float) *(p + (rangeLength * 2));
2698             }
2699
2700             if (alpha_index != 0)
2701             {
2702               cp.w = (float) *(p + (rangeLength * alpha_index));
2703
2704               float alpha = weight * QuantumScale * cp.w;
2705
2706               filteredPixel.x += alpha * cp.x;
2707               filteredPixel.y += alpha * cp.y;
2708               filteredPixel.z += alpha * cp.z;
2709               filteredPixel.w += weight * cp.w;
2710               gamma += alpha;
2711             }
2712             else
2713               filteredPixel += ((float4) weight)*cp;
2714
2715             density += weight;
2716           }
2717         }
2718       }
2719
2720       // initialize the accumulators to zero
2721       if (itemID < actualNumPixelInThisChunk) {
2722         outputPixelCache[itemID] = (float4)0.0f;
2723         densityCache[itemID] = 0.0f;
2724         if (alpha_index != 0)
2725           gammaCache[itemID] = 0.0f;
2726       }
2727       barrier(CLK_LOCAL_MEM_FENCE);
2728
2729       // accumulatte the filtered pixel value and the density
2730       for (unsigned int i = 0; i < numItems; i++) {
2731         if (pixelIndex != -1) {
2732           if (itemID%numItems == i) {
2733             outputPixelCache[pixelIndex]+=filteredPixel;
2734             densityCache[pixelIndex]+=density;
2735             if (alpha_index != 0)
2736               gammaCache[pixelIndex]+=gamma;
2737           }
2738         }
2739         barrier(CLK_LOCAL_MEM_FENCE);
2740       }
2741
2742       if (itemID < actualNumPixelInThisChunk)
2743       {
2744         float4 filteredPixel = outputPixelCache[itemID];
2745
2746         float gamma = 0.0f;
2747         if (alpha_index != 0)
2748           gamma = gammaCache[itemID];
2749
2750         float density = densityCache[itemID];
2751         if ((density != 0.0f) && (density != 1.0f))
2752         {
2753           density = PerceptibleReciprocal(density);
2754           filteredPixel *= (float4) density;
2755           if (alpha_index != 0)
2756             gamma *= density;
2757         }
2758
2759         if (alpha_index != 0)
2760         {
2761           gamma = PerceptibleReciprocal(gamma);
2762           filteredPixel.x *= gamma;
2763           filteredPixel.y *= gamma;
2764           filteredPixel.z *= gamma;
2765         }
2766
2767         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2768       }
2769     }
2770   }
2771   )
2772
2773 /*
2774 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2775 %                                                                             %
2776 %                                                                             %
2777 %                                                                             %
2778 %     R o t a t i o n a l B l u r                                             %
2779 %                                                                             %
2780 %                                                                             %
2781 %                                                                             %
2782 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2783 */
2784
2785   STRINGIFY(
2786   __kernel void RotationalBlur(const __global CLQuantum *image,
2787     const unsigned int number_channels,const unsigned int channel,
2788     const float2 blurCenter,__constant float *cos_theta,
2789     __constant float *sin_theta,const unsigned int cossin_theta_size,
2790     __global CLQuantum *filteredImage)
2791   {
2792     const int x = get_global_id(0);
2793     const int y = get_global_id(1);
2794     const int columns = get_global_size(0);
2795     const int rows = get_global_size(1);
2796     unsigned int step = 1;
2797     float center_x = (float) x - blurCenter.x;
2798     float center_y = (float) y - blurCenter.y;
2799     float radius = hypot(center_x, center_y);
2800
2801     float blur_radius = hypot(blurCenter.x, blurCenter.y);
2802
2803     if (radius > MagickEpsilon)
2804     {
2805       step = (unsigned int) (blur_radius / radius);
2806       if (step == 0)
2807         step = 1;
2808       if (step >= cossin_theta_size)
2809         step = cossin_theta_size-1;
2810     }
2811
2812     float4 result = 0.0f;
2813     float normalize = 0.0f;
2814     float gamma = 0.0f;
2815
2816     for (unsigned int i=0; i<cossin_theta_size; i+=step)
2817     {
2818       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2819       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2820
2821       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2822
2823       if ((number_channels == 4) || (number_channels == 2))
2824       {
2825         float alpha = (float)(QuantumScale*pixel.w);
2826
2827         gamma += alpha;
2828
2829         result.x += alpha * pixel.x;
2830         result.y += alpha * pixel.y;
2831         result.z += alpha * pixel.z;
2832         result.w += pixel.w;
2833       }
2834       else
2835         result += pixel;
2836
2837       normalize += 1.0f;
2838     }
2839
2840     normalize = PerceptibleReciprocal(normalize);
2841
2842     if ((number_channels == 4) || (number_channels == 2))
2843     {
2844       gamma = PerceptibleReciprocal(gamma);
2845       result.x *= gamma;
2846       result.y *= gamma;
2847       result.z *= gamma;
2848       result.w *= normalize;
2849     }
2850     else
2851       result *= normalize;
2852
2853     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2854   }
2855   )
2856
2857 /*
2858 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2859 %                                                                             %
2860 %                                                                             %
2861 %                                                                             %
2862 %     U n s h a r p M a s k                                                   %
2863 %                                                                             %
2864 %                                                                             %
2865 %                                                                             %
2866 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2867 */
2868
2869   STRINGIFY(
2870   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2871     const __global float4 *blurRowData,const unsigned int number_channels,
2872     const ChannelType channel,const unsigned int columns,
2873     const unsigned int rows,__local float4* cachedData,
2874     __local float* cachedFilter,const __global float *filter,
2875     const unsigned int width,const float gain, const float threshold,
2876     __global CLQuantum *filteredImage)
2877   {
2878     const unsigned int radius = (width-1)/2;
2879
2880     // cache the pixel shared by the workgroup
2881     const int groupX = get_group_id(0);
2882     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2883     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2884
2885     if ((groupStartY >= 0) && (groupStopY < rows))
2886     {
2887       event_t e = async_work_group_strided_copy(cachedData,
2888         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2889       wait_group_events(1,&e);
2890     }
2891     else
2892     {
2893       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2894         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2895
2896       barrier(CLK_LOCAL_MEM_FENCE);
2897     }
2898     // cache the filter as well
2899     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2900     wait_group_events(1,&e);
2901
2902     // only do the work if this is not a patched item
2903     const int cy = get_global_id(1);
2904
2905     if (cy < rows)
2906     {
2907       float4 blurredPixel = (float4) 0.0f;
2908
2909       int i = 0;
2910
2911       for ( ; i+7 < width; )
2912       {
2913         for (int j=0; j < 8; j++, i++)
2914           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2915       }
2916
2917       for ( ; i < width; i++)
2918         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2919
2920       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2921       float4 outputPixel = inputImagePixel - blurredPixel;
2922
2923       float quantumThreshold = QuantumRange*threshold;
2924
2925       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2926       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2927
2928       //write back
2929       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2930     }
2931   }
2932   )
2933
2934   STRINGIFY(
2935   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2936     const ChannelType channel,__constant float *filter,const unsigned int width,
2937     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2938     const float gain,const float threshold,__global CLQuantum *filteredImage)
2939   {
2940     const unsigned int x = get_global_id(0);
2941     const unsigned int y = get_global_id(1);
2942
2943     const unsigned int radius = (width - 1) / 2;
2944
2945     int row = y - radius;
2946     int baseRow = get_group_id(1) * get_local_size(1) - radius;
2947     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2948
2949     while (row < endRow) {
2950       int srcy = (row < 0) ? -row : row; // mirror pad
2951       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2952
2953       float4 value = 0.0f;
2954
2955       int ix = x - radius;
2956       int i = 0;
2957
2958       while (i + 7 < width) {
2959         for (int j = 0; j < 8; ++j) { // unrolled
2960           int srcx = ix + j;
2961           srcx = (srcx < 0) ? -srcx : srcx;
2962           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2963           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2964         }
2965         ix += 8;
2966         i += 8;
2967       }
2968
2969       while (i < width) {
2970         int srcx = (ix < 0) ? -ix : ix; // mirror pad
2971         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2972         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2973         ++i;
2974         ++ix;
2975       }
2976       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2977       row += get_local_size(1);
2978     }
2979
2980     barrier(CLK_LOCAL_MEM_FENCE);
2981
2982     const int px = get_local_id(0);
2983     const int py = get_local_id(1);
2984     const int prp = get_local_size(0);
2985     float4 value = (float4)(0.0f);
2986
2987     int i = 0;
2988     while (i + 7 < width) {
2989       for (int j = 0; j < 8; ++j) // unrolled
2990         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2991       i += 8;
2992     }
2993     while (i < width) {
2994       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2995       ++i;
2996     }
2997
2998     if ((x < columns) && (y < rows)) {
2999       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3000       float4 diff = srcPixel - value;
3001
3002       float quantumThreshold = QuantumRange*threshold;
3003
3004       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3005       value = select(srcPixel + diff * gain, srcPixel, mask);
3006
3007       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3008     }
3009   }
3010   )
3011
3012   STRINGIFY(
3013     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3014     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3015       const unsigned int number_channels,const unsigned int max_channels,
3016       const float threshold,const int passes,const unsigned int imageWidth,
3017       const unsigned int imageHeight)
3018   {
3019     const int pad = (1 << (passes - 1));
3020     const int tileSize = 64;
3021     const int tileRowPixels = 64;
3022     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3023
3024     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3025
3026     local float buffer[64 * 64];
3027
3028     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3029     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3030
3031     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3032       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3033                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3034
3035       for (int channel = 0; channel < max_channels; ++channel)
3036         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3037     }
3038
3039     for (int channel = 0; channel < max_channels; ++channel) {
3040       // Load LDS
3041       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3042         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3043
3044       // Process
3045
3046       float tmp[16];
3047       float accum[16];
3048       float pixel;
3049
3050       for (int i = 0; i < 16; i++)
3051         accum[i]=0.0f;
3052
3053       for (int pass = 0; pass < passes; ++pass) {
3054         const int radius = 1 << pass;
3055         const int x = get_local_id(0);
3056         const float thresh = threshold * noise[pass];
3057
3058         // Apply horizontal hat
3059         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3060           const int offset = i * tileRowPixels;
3061           if (pass == 0)
3062             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3063           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3064           barrier(CLK_LOCAL_MEM_FENCE);
3065           buffer[x + offset] = pixel;
3066         }
3067         barrier(CLK_LOCAL_MEM_FENCE);
3068
3069         // Apply vertical hat
3070         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3071           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3072           float delta = tmp[i / 4] - pixel;
3073           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3074           if (delta < -thresh)
3075             delta += thresh;
3076           else if (delta > thresh)
3077             delta -= thresh;
3078           else
3079             delta = 0;
3080           accum[i / 4] += delta;
3081         }
3082         barrier(CLK_LOCAL_MEM_FENCE);
3083
3084         if (pass < passes - 1)
3085           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3086             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3087         else  // last pass
3088           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3089             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3090         barrier(CLK_LOCAL_MEM_FENCE);
3091       }
3092
3093       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3094         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3095
3096       barrier(CLK_LOCAL_MEM_FENCE);
3097     }
3098
3099     // Write from stage to output
3100
3101     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3102       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3103         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3104           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3105           for (int channel = 0; channel < max_channels; ++channel) {
3106             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3107           }
3108         }
3109       }
3110     }
3111   }
3112   )
3113
3114   ;
3115
3116 #endif // MAGICKCORE_OPENCL_SUPPORT
3117
3118 #if defined(__cplusplus) || defined(c_plusplus)
3119 }
3120 #endif
3121
3122 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H