granicus.if.org Git - imagemagick/blob - MagickCore/accelerate-kernels-private.h

   1 /*
   2   Copyright 1999-2018 ImageMagick Studio LLC, a non-profit organization
   3   dedicated to making software imaging solutions freely available.
   4
   5   You may not use this file except in compliance with the License.
   6   obtain a copy of the License at
   7
   8     https://www.imagemagick.org/script/license.php
   9
  10   Unless required by applicable law or agreed to in writing, software
  11   distributed under the License is distributed on an "AS IS" BASIS,
  12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13   See the License for the specific language governing permissions and
  14   limitations under the License.
  15
  16   MagickCore private kernels for accelerated functions.
  17 */
  18
  19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
  20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
  21
  22 #if defined(__cplusplus) || defined(c_plusplus)
  23 extern "C" {
  24 #endif
  25
  26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
  27
  28 /*
  29   Define declarations.
  30 */
  31 #define OPENCL_DEFINE(VAR,...)  "\n #""define " #VAR " " #__VA_ARGS__ " \n"
  32 #define OPENCL_ELIF(...)        "\n #""elif " #__VA_ARGS__ " \n"
  33 #define OPENCL_ELSE()           "\n #""else " " \n"
  34 #define OPENCL_ENDIF()          "\n #""endif " " \n"
  35 #define OPENCL_IF(...)          "\n #""if " #__VA_ARGS__ " \n"
  36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
  37
  38 const char *accelerateKernels =
  39
  40 /*
  41   Define declarations.
  42 */
  43   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
  44   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
  45   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
  46   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
  47   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
  48   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
  49   OPENCL_DEFINE(SigmaRandom, (attenuate))
  50   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
  51   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
  52   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
  53
  54 /*
  55   Typedef declarations.
  56 */
  57   STRINGIFY(
  58     typedef enum
  59     {
  60       UndefinedColorspace,
  61       CMYColorspace,           /* negated linear RGB colorspace */
  62       CMYKColorspace,          /* CMY with Black separation */
  63       GRAYColorspace,          /* Single Channel greyscale (non-linear) image */
  64       HCLColorspace,
  65       HCLpColorspace,
  66       HSBColorspace,
  67       HSIColorspace,
  68       HSLColorspace,
  69       HSVColorspace,           /* alias for HSB */
  70       HWBColorspace,
  71       LabColorspace,
  72       LCHColorspace,           /* alias for LCHuv */
  73       LCHabColorspace,         /* Cylindrical (Polar) Lab */
  74       LCHuvColorspace,         /* Cylindrical (Polar) Luv */
  75       LogColorspace,
  76       LMSColorspace,
  77       LuvColorspace,
  78       OHTAColorspace,
  79       Rec601YCbCrColorspace,
  80       Rec709YCbCrColorspace,
  81       RGBColorspace,           /* Linear RGB colorspace */
  82       scRGBColorspace,         /* ??? */
  83       sRGBColorspace,          /* Default: non-linear sRGB colorspace */
  84       TransparentColorspace,
  85       xyYColorspace,
  86       XYZColorspace,           /* IEEE Color Reference colorspace */
  87       YCbCrColorspace,
  88       YCCColorspace,
  89       YDbDrColorspace,
  90       YIQColorspace,
  91       YPbPrColorspace,
  92       YUVColorspace,
  93       LinearGRAYColorspace     /* Single Channel greyscale (linear) image */
  94     } ColorspaceType;
  95   )
  96
  97   STRINGIFY(
  98     typedef enum
  99     {
 100       UndefinedCompositeOp,
 101       AlphaCompositeOp,
 102       AtopCompositeOp,
 103       BlendCompositeOp,
 104       BlurCompositeOp,
 105       BumpmapCompositeOp,
 106       ChangeMaskCompositeOp,
 107       ClearCompositeOp,
 108       ColorBurnCompositeOp,
 109       ColorDodgeCompositeOp,
 110       ColorizeCompositeOp,
 111       CopyBlackCompositeOp,
 112       CopyBlueCompositeOp,
 113       CopyCompositeOp,
 114       CopyCyanCompositeOp,
 115       CopyGreenCompositeOp,
 116       CopyMagentaCompositeOp,
 117       CopyAlphaCompositeOp,
 118       CopyRedCompositeOp,
 119       CopyYellowCompositeOp,
 120       DarkenCompositeOp,
 121       DarkenIntensityCompositeOp,
 122       DifferenceCompositeOp,
 123       DisplaceCompositeOp,
 124       DissolveCompositeOp,
 125       DistortCompositeOp,
 126       DivideDstCompositeOp,
 127       DivideSrcCompositeOp,
 128       DstAtopCompositeOp,
 129       DstCompositeOp,
 130       DstInCompositeOp,
 131       DstOutCompositeOp,
 132       DstOverCompositeOp,
 133       ExclusionCompositeOp,
 134       HardLightCompositeOp,
 135       HardMixCompositeOp,
 136       HueCompositeOp,
 137       InCompositeOp,
 138       IntensityCompositeOp,
 139       LightenCompositeOp,
 140       LightenIntensityCompositeOp,
 141       LinearBurnCompositeOp,
 142       LinearDodgeCompositeOp,
 143       LinearLightCompositeOp,
 144       LuminizeCompositeOp,
 145       MathematicsCompositeOp,
 146       MinusDstCompositeOp,
 147       MinusSrcCompositeOp,
 148       ModulateCompositeOp,
 149       ModulusAddCompositeOp,
 150       ModulusSubtractCompositeOp,
 151       MultiplyCompositeOp,
 152       NoCompositeOp,
 153       OutCompositeOp,
 154       OverCompositeOp,
 155       OverlayCompositeOp,
 156       PegtopLightCompositeOp,
 157       PinLightCompositeOp,
 158       PlusCompositeOp,
 159       ReplaceCompositeOp,
 160       SaturateCompositeOp,
 161       ScreenCompositeOp,
 162       SoftLightCompositeOp,
 163       SrcAtopCompositeOp,
 164       SrcCompositeOp,
 165       SrcInCompositeOp,
 166       SrcOutCompositeOp,
 167       SrcOverCompositeOp,
 168       ThresholdCompositeOp,
 169       VividLightCompositeOp,
 170       XorCompositeOp,
 171       StereoCompositeOp
 172     } CompositeOperator;
 173   )
 174
 175   STRINGIFY(
 176     typedef enum
 177     {
 178       UndefinedFunction,
 179       ArcsinFunction,
 180       ArctanFunction,
 181       PolynomialFunction,
 182       SinusoidFunction
 183     } MagickFunction;
 184   )
 185
 186   STRINGIFY(
 187     typedef enum
 188     {
 189       UndefinedNoise,
 190       UniformNoise,
 191       GaussianNoise,
 192       MultiplicativeGaussianNoise,
 193       ImpulseNoise,
 194       LaplacianNoise,
 195       PoissonNoise,
 196       RandomNoise
 197     } NoiseType;
 198   )
 199
 200   STRINGIFY(
 201     typedef enum
 202     {
 203       UndefinedPixelIntensityMethod = 0,
 204       AveragePixelIntensityMethod,
 205       BrightnessPixelIntensityMethod,
 206       LightnessPixelIntensityMethod,
 207       MSPixelIntensityMethod,
 208       Rec601LumaPixelIntensityMethod,
 209       Rec601LuminancePixelIntensityMethod,
 210       Rec709LumaPixelIntensityMethod,
 211       Rec709LuminancePixelIntensityMethod,
 212       RMSPixelIntensityMethod
 213     } PixelIntensityMethod;
 214   )
 215
 216   STRINGIFY(
 217     typedef enum
 218     {
 219       BoxWeightingFunction = 0,
 220       TriangleWeightingFunction,
 221       CubicBCWeightingFunction,
 222       HannWeightingFunction,
 223       HammingWeightingFunction,
 224       BlackmanWeightingFunction,
 225       GaussianWeightingFunction,
 226       QuadraticWeightingFunction,
 227       JincWeightingFunction,
 228       SincWeightingFunction,
 229       SincFastWeightingFunction,
 230       KaiserWeightingFunction,
 231       WelchWeightingFunction,
 232       BohmanWeightingFunction,
 233       LagrangeWeightingFunction,
 234       CosineWeightingFunction
 235     } ResizeWeightingFunctionType;
 236   )
 237
 238   STRINGIFY(
 239     typedef enum
 240     {
 241       UndefinedChannel = 0x0000,
 242       RedChannel = 0x0001,
 243       GrayChannel = 0x0001,
 244       CyanChannel = 0x0001,
 245       GreenChannel = 0x0002,
 246       MagentaChannel = 0x0002,
 247       BlueChannel = 0x0004,
 248       YellowChannel = 0x0004,
 249       BlackChannel = 0x0008,
 250       AlphaChannel = 0x0010,
 251       OpacityChannel = 0x0010,
 252       IndexChannel = 0x0020,             /* Color Index Table? */
 253       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
 254       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
 255       MetaChannel = 0x0100,              /* ???? */
 256       CompositeChannels = 0x001F,
 257       AllChannels = 0x7ffffff,
 258       /*
 259         Special purpose channel types.
 260         FUTURE: are these needed any more - they are more like hacks
 261         SyncChannels for example is NOT a real channel but a 'flag'
 262         It really says -- "User has not defined channels"
 263         Though it does have extra meaning in the "-auto-level" operator
 264       */
 265       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
 266       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
 267       GrayChannels = 0x0400,
 268       SyncChannels = 0x20000,    /* channels modified as a single unit */
 269       DefaultChannels = AllChannels
 270     } ChannelType;  /* must correspond to PixelChannel */
 271   )
 272
 273 /*
 274   Helper functions.
 275 */
 276
 277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
 278
 279   STRINGIFY(
 280     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 281     {
 282       return((CLQuantum) value);
 283     }
 284   )
 285
 286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
 287
 288   STRINGIFY(
 289     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 290     {
 291       return((CLQuantum) (257.0f*value));
 292     }
 293   )
 294
 295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
 296
 297   STRINGIFY(
 298     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
 299     {
 300       return((CLQuantum) (16843009.0*value));
 301     }
 302   )
 303
 304 OPENCL_ENDIF()
 305
 306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
 307
 308   STRINGIFY(
 309     inline CLQuantum ClampToQuantum(const float value)
 310       {
 311         return (CLQuantum) clamp(value, 0.0f, QuantumRange);
 312       }
 313   )
 314
 315 OPENCL_ELSE()
 316
 317   STRINGIFY(
 318     inline CLQuantum ClampToQuantum(const float value)
 319       {
 320         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
 321       }
 322   )
 323
 324 OPENCL_ENDIF()
 325
 326   STRINGIFY(
 327     inline int ClampToCanvas(const int offset,const int range)
 328       {
 329         return clamp(offset, (int)0, range-1);
 330       }
 331   )
 332
 333   STRINGIFY(
 334     inline uint ScaleQuantumToMap(CLQuantum value)
 335       {
 336         if (value >= (CLQuantum) MaxMap)
 337           return ((uint)MaxMap);
 338         else
 339           return ((uint)value);
 340       }
 341   )
 342
 343   STRINGIFY(
 344     inline float PerceptibleReciprocal(const float x)
 345     {
 346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
 347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
 348     }
 349   )
 350
 351   STRINGIFY(
 352   inline float RoundToUnity(const float value)
 353    {
 354      return clamp(value,0.0f,1.0f);
 355    }
 356   )
 357
 358   STRINGIFY(
 359
 360   inline unsigned int getPixelIndex(const unsigned int number_channels,
 361     const unsigned int columns, const unsigned int x, const unsigned int y)
 362   {
 363     return (x * number_channels) + (y * columns * number_channels);
 364   }
 365
 366   inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
 367   inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
 368   inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
 369   inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
 370
 371   inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
 372   inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
 373   inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
 374   inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
 375
 376   inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
 377   inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
 378   inline float getBlueF4(float4 p)                      { return p.x; }
 379   inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
 380
 381   inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
 382   inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
 383   inline float getGreenF4(float4 p)                     { return p.y; }
 384   inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
 385
 386   inline CLQuantum getRed(CLPixelType p)                { return p.z; }
 387   inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
 388   inline float getRedF4(float4 p)                       { return p.z; }
 389   inline void setRedF4(float4* p, float value)          { (*p).z = value; }
 390
 391   inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
 392   inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
 393   inline float getAlphaF4(float4 p)                     { return p.w; }
 394   inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
 395
 396   inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
 397     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
 398   {
 399     if ((channel & RedChannel) != 0)
 400       *red=getPixelRed(p);
 401
 402     if (number_channels > 2)
 403       {
 404         if ((channel & GreenChannel) != 0)
 405           *green=getPixelGreen(p);
 406
 407         if ((channel & BlueChannel) != 0)
 408           *blue=getPixelBlue(p);
 409       }
 410
 411     if (((number_channels == 4) || (number_channels == 2)) &&
 412         ((channel & AlphaChannel) != 0))
 413       *alpha=getPixelAlpha(p,number_channels);
 414   }
 415
 416   inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
 417     const unsigned int columns, const unsigned int x, const unsigned int y)
 418   {
 419     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 420
 421     float4 pixel;
 422
 423     pixel.x=getPixelRed(p);
 424
 425     if (number_channels > 2)
 426       {
 427         pixel.y=getPixelGreen(p);
 428         pixel.z=getPixelBlue(p);
 429       }
 430
 431     if ((number_channels == 4) || (number_channels == 2))
 432       pixel.w=getPixelAlpha(p,number_channels);
 433     return(pixel);
 434   }
 435
 436   inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
 437     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
 438   {
 439     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 440
 441     float red = 0.0f;
 442     float green = 0.0f;
 443     float blue = 0.0f;
 444     float alpha = 0.0f;
 445
 446     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
 447     return (float4)(red, green, blue, alpha);
 448   }
 449
 450   inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
 451     const ChannelType channel, float red, float green, float blue, float alpha)
 452   {
 453     if ((channel & RedChannel) != 0)
 454       setPixelRed(p,ClampToQuantum(red));
 455
 456     if (number_channels > 2)
 457       {
 458         if ((channel & GreenChannel) != 0)
 459           setPixelGreen(p,ClampToQuantum(green));
 460
 461         if ((channel & BlueChannel) != 0)
 462           setPixelBlue(p,ClampToQuantum(blue));
 463       }
 464
 465     if (((number_channels == 4) || (number_channels == 2)) &&
 466         ((channel & AlphaChannel) != 0))
 467       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
 468   }
 469
 470   inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
 471     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
 472   {
 473     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 474
 475     setPixelRed(p,ClampToQuantum(pixel.x));
 476
 477     if (number_channels > 2)
 478       {
 479         setPixelGreen(p,ClampToQuantum(pixel.y));
 480         setPixelBlue(p,ClampToQuantum(pixel.z));
 481       }
 482
 483     if ((number_channels == 4) || (number_channels == 2))
 484       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
 485   }
 486
 487   inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
 488     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
 489     float4 pixel)
 490   {
 491     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
 492     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
 493   }
 494
 495   inline float GetPixelIntensity(const unsigned int colorspace,
 496     const unsigned int method,float red,float green,float blue)
 497   {
 498     float intensity;
 499
 500     if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
 501       return red;
 502
 503     switch (method)
 504     {
 505       case AveragePixelIntensityMethod:
 506         {
 507           intensity=(red+green+blue)/3.0;
 508           break;
 509         }
 510       case BrightnessPixelIntensityMethod:
 511         {
 512           intensity=MagickMax(MagickMax(red,green),blue);
 513           break;
 514         }
 515       case LightnessPixelIntensityMethod:
 516         {
 517           intensity=(MagickMin(MagickMin(red,green),blue)+
 518               MagickMax(MagickMax(red,green),blue))/2.0;
 519           break;
 520         }
 521       case MSPixelIntensityMethod:
 522         {
 523           intensity=(float) (((float) red*red+green*green+blue*blue)/
 524               (3.0*QuantumRange));
 525           break;
 526         }
 527       case Rec601LumaPixelIntensityMethod:
 528         {
 529           /*
 530           if (image->colorspace == RGBColorspace)
 531           {
 532             red=EncodePixelGamma(red);
 533             green=EncodePixelGamma(green);
 534             blue=EncodePixelGamma(blue);
 535           }
 536           */
 537           intensity=0.298839*red+0.586811*green+0.114350*blue;
 538           break;
 539         }
 540       case Rec601LuminancePixelIntensityMethod:
 541         {
 542           /*
 543           if (image->colorspace == sRGBColorspace)
 544           {
 545             red=DecodePixelGamma(red);
 546             green=DecodePixelGamma(green);
 547             blue=DecodePixelGamma(blue);
 548           }
 549           */
 550           intensity=0.298839*red+0.586811*green+0.114350*blue;
 551           break;
 552         }
 553       case Rec709LumaPixelIntensityMethod:
 554       default:
 555         {
 556           /*
 557           if (image->colorspace == RGBColorspace)
 558           {
 559             red=EncodePixelGamma(red);
 560             green=EncodePixelGamma(green);
 561             blue=EncodePixelGamma(blue);
 562           }
 563           */
 564           intensity=0.212656*red+0.715158*green+0.072186*blue;
 565           break;
 566         }
 567       case Rec709LuminancePixelIntensityMethod:
 568         {
 569           /*
 570           if (image->colorspace == sRGBColorspace)
 571           {
 572             red=DecodePixelGamma(red);
 573             green=DecodePixelGamma(green);
 574             blue=DecodePixelGamma(blue);
 575           }
 576           */
 577           intensity=0.212656*red+0.715158*green+0.072186*blue;
 578           break;
 579         }
 580       case RMSPixelIntensityMethod:
 581         {
 582           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
 583               sqrt(3.0));
 584           break;
 585         }
 586     }
 587
 588     return intensity;
 589   }
 590
 591   inline int mirrorBottom(int value)
 592   {
 593       return (value < 0) ? - (value) : value;
 594   }
 595
 596   inline int mirrorTop(int value, int width)
 597   {
 598       return (value >= width) ? (2 * width - value - 1) : value;
 599   }
 600   )
 601
 602 /*
 603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 604 %                                                                             %
 605 %                                                                             %
 606 %                                                                             %
 607 %     A d d N o i s e                                                         %
 608 %                                                                             %
 609 %                                                                             %
 610 %                                                                             %
 611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 612 */
 613
 614   STRINGIFY(
 615   /*
 616   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
 617   This is provided under BSD, full license is with the main package.
 618   See http://www.doc.ic.ac.uk/~dt10/research
 619   */
 620
 621   // Pre: a<M, b<M
 622   // Post: r=(a+b) mod M
 623   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
 624   {
 625     ulong v=a+b;
 626     //if( (v>=M) || (v<a) )
 627     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
 628       v=v-M;
 629     return v;
 630   }
 631
 632   // Pre: a<M,b<M
 633   // Post: r=(a*b) mod M
 634   // This could be done more efficently, but it is portable, and should
 635   // be easy to understand. It can be replaced with any of the better
 636   // modular multiplication algorithms (for example if you know you have
 637   // double precision available or something).
 638   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
 639   {
 640     ulong r=0;
 641     while(a!=0){
 642       if(a&1)
 643         r=MWC_AddMod64(r,b,M);
 644       b=MWC_AddMod64(b,b,M);
 645       a=a>>1;
 646     }
 647     return r;
 648   }
 649
 650   // Pre: a<M, e>=0
 651   // Post: r=(a^b) mod M
 652   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
 653   // most architectures
 654   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
 655   {
 656     ulong sqr=a, acc=1;
 657     while(e!=0){
 658       if(e&1)
 659         acc=MWC_MulMod64(acc,sqr,M);
 660         sqr=MWC_MulMod64(sqr,sqr,M);
 661       e=e>>1;
 662     }
 663     return acc;
 664   }
 665
 666   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
 667   {
 668     ulong m=MWC_PowMod64(A, distance, M);
 669     ulong x=curr.x*(ulong)A+curr.y;
 670     x=MWC_MulMod64(x, m, M);
 671     return (uint2)((uint)(x/A), (uint)(x%A));
 672   }
 673
 674   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
 675   {
 676     // This is an arbitrary constant for starting LCG jumping from. I didn't
 677     // want to start from 1, as then you end up with the two or three first values
 678     // being a bit poor in ones - once you've decided that, one constant is as
 679     // good as any another. There is no deep mathematical reason for it, I just
 680     // generated a random number.
 681     enum{ MWC_BASEID = 4077358422479273989UL };
 682
 683     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
 684     ulong m=MWC_PowMod64(A, dist, M);
 685
 686     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
 687     return (uint2)((uint)(x/A), (uint)(x%A));
 688   }
 689
 690   //! Represents the state of a particular generator
 691   typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
 692
 693   void MWC64X_Step(mwc64x_state_t *s)
 694   {
 695     uint X=s->x, C=s->c;
 696
 697     uint Xn=s->seed0*X+C;
 698     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
 699     uint Cn=mad_hi(s->seed0,X,carry);
 700
 701     s->x=Xn;
 702     s->c=Cn;
 703   }
 704
 705   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
 706   {
 707     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
 708     s->x=tmp.x;
 709     s->c=tmp.y;
 710   }
 711
 712   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
 713   {
 714     uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
 715     s->x=tmp.x;
 716     s->c=tmp.y;
 717   }
 718
 719   //! Return a 32-bit integer in the range [0..2^32)
 720   uint MWC64X_NextUint(mwc64x_state_t *s)
 721   {
 722     uint res=s->x ^ s->c;
 723     MWC64X_Step(s);
 724     return res;
 725   }
 726
 727   //
 728   // End of MWC64X excerpt
 729   //
 730
 731   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
 732   {
 733     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
 734   }
 735
 736   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
 737   {
 738     float
 739       alpha,
 740       beta,
 741       noise,
 742       sigma;
 743
 744     noise = 0.0f;
 745     alpha=mwcReadPseudoRandomValue(r);
 746     switch(noise_type)
 747     {
 748       case UniformNoise:
 749       default:
 750         {
 751           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
 752           break;
 753         }
 754       case GaussianNoise:
 755         {
 756           float
 757             gamma,
 758             tau;
 759
 760           if (alpha == 0.0f)
 761             alpha=1.0f;
 762           beta=mwcReadPseudoRandomValue(r);
 763           gamma=sqrt(-2.0f*log(alpha));
 764           sigma=gamma*cospi((2.0f*beta));
 765           tau=gamma*sinpi((2.0f*beta));
 766           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
 767           break;
 768         }
 769       case ImpulseNoise:
 770       {
 771         if (alpha < (SigmaImpulse/2.0f))
 772           noise=0.0f;
 773         else
 774           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
 775             noise=QuantumRange;
 776           else
 777             noise=pixel;
 778         break;
 779       }
 780       case LaplacianNoise:
 781       {
 782         if (alpha <= 0.5f)
 783           {
 784             if (alpha <= MagickEpsilon)
 785               noise=(pixel-QuantumRange);
 786             else
 787               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
 788                 0.5f);
 789             break;
 790           }
 791         beta=1.0f-alpha;
 792         if (beta <= (0.5f*MagickEpsilon))
 793           noise=(pixel+QuantumRange);
 794         else
 795           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
 796         break;
 797       }
 798       case MultiplicativeGaussianNoise:
 799       {
 800         sigma=1.0f;
 801         if (alpha > MagickEpsilon)
 802           sigma=sqrt(-2.0f*log(alpha));
 803         beta=mwcReadPseudoRandomValue(r);
 804         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
 805           cospi((2.0f*beta))/2.0f);
 806         break;
 807       }
 808       case PoissonNoise:
 809       {
 810         float
 811           poisson;
 812         unsigned int i;
 813         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
 814         for (i=0; alpha > poisson; i++)
 815         {
 816           beta=mwcReadPseudoRandomValue(r);
 817           alpha*=beta;
 818         }
 819         noise=(QuantumRange*i/SigmaPoisson);
 820         break;
 821       }
 822       case RandomNoise:
 823       {
 824         noise=(QuantumRange*SigmaRandom*alpha);
 825         break;
 826       }
 827     }
 828     return noise;
 829   }
 830
 831   __kernel
 832   void AddNoise(const __global CLQuantum *image,
 833     const unsigned int number_channels,const ChannelType channel,
 834     const unsigned int length,const unsigned int pixelsPerWorkItem,
 835     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
 836     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
 837     __global CLQuantum *filteredImage)
 838   {
 839     mwc64x_state_t rng;
 840     rng.seed0 = seed0;
 841     rng.seed1 = seed1;
 842
 843     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
 844     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
 845     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
 846
 847     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
 848     uint count = pixelsPerWorkItem;
 849
 850     while (count > 0 && pos < length)
 851     {
 852       const __global CLQuantum *p = image + pos;
 853       __global CLQuantum *q = filteredImage + pos;
 854
 855       float red;
 856       float green;
 857       float blue;
 858       float alpha;
 859
 860       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
 861
 862       if ((channel & RedChannel) != 0)
 863         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
 864
 865       if (number_channels > 2)
 866       {
 867         if ((channel & GreenChannel) != 0)
 868           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
 869
 870         if ((channel & BlueChannel) != 0)
 871           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
 872       }
 873
 874       if (((number_channels == 4) || (number_channels == 2)) &&
 875           ((channel & AlphaChannel) != 0))
 876         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
 877
 878       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
 879
 880       pos += (get_local_size(0) * number_channels);
 881       count--;
 882     }
 883   }
 884   )
 885
 886 /*
 887 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 888 %                                                                             %
 889 %                                                                             %
 890 %                                                                             %
 891 %    B l u r                                                                  %
 892 %                                                                             %
 893 %                                                                             %
 894 %                                                                             %
 895 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 896 */
 897
 898   STRINGIFY(
 899   /*
 900   Reduce image noise and reduce detail levels by row
 901   */
 902   __kernel void BlurRow(const __global CLQuantum *image,
 903     const unsigned int number_channels,const ChannelType channel,
 904     __constant float *filter,const unsigned int width,
 905     const unsigned int imageColumns,const unsigned int imageRows,
 906     __local float4 *temp,__global float4 *tempImage)
 907   {
 908     const int x = get_global_id(0);
 909     const int y = get_global_id(1);
 910
 911     const int columns = imageColumns;
 912
 913     const unsigned int radius = (width-1)/2;
 914     const int wsize = get_local_size(0);
 915     const unsigned int loadSize = wsize+width;
 916
 917     //group coordinate
 918     const int groupX=get_local_size(0)*get_group_id(0);
 919
 920     //parallel load and clamp
 921     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
 922     {
 923       int cx = ClampToCanvas(i + groupX - radius, columns);
 924       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
 925     }
 926
 927     // barrier
 928     barrier(CLK_LOCAL_MEM_FENCE);
 929
 930     // only do the work if this is not a patched item
 931     if (get_global_id(0) < columns)
 932     {
 933       // compute
 934       float4 result = (float4) 0;
 935
 936       int i = 0;
 937
 938       for ( ; i+7 < width; )
 939       {
 940         for (int j=0; j < 8; j++)
 941           result+=filter[i+j]*temp[i+j+get_local_id(0)];
 942         i+=8;
 943       }
 944
 945       for ( ; i < width; i++)
 946         result+=filter[i]*temp[i+get_local_id(0)];
 947
 948       // write back to global
 949       tempImage[y*columns+x] = result;
 950     }
 951   }
 952   )
 953
 954   STRINGIFY(
 955   /*
 956   Reduce image noise and reduce detail levels by line
 957   */
 958   __kernel void BlurColumn(const __global float4 *blurRowData,
 959     const unsigned int number_channels,const ChannelType channel,
 960     __constant float *filter,const unsigned int width,
 961     const unsigned int imageColumns,const unsigned int imageRows,
 962     __local float4 *temp,__global CLQuantum *filteredImage)
 963   {
 964     const int x = get_global_id(0);
 965     const int y = get_global_id(1);
 966
 967     const int columns = imageColumns;
 968     const int rows = imageRows;
 969
 970     unsigned int radius = (width-1)/2;
 971     const int wsize = get_local_size(1);
 972     const unsigned int loadSize = wsize+width;
 973
 974     //group coordinate
 975     const int groupX=get_local_size(0)*get_group_id(0);
 976     const int groupY=get_local_size(1)*get_group_id(1);
 977     //notice that get_local_size(0) is 1, so
 978     //groupX=get_group_id(0);
 979
 980     //parallel load and clamp
 981     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
 982       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
 983
 984     // barrier
 985     barrier(CLK_LOCAL_MEM_FENCE);
 986
 987     // only do the work if this is not a patched item
 988     if (get_global_id(1) < rows)
 989     {
 990       // compute
 991       float4 result = (float4) 0;
 992
 993       int i = 0;
 994
 995       for ( ; i+7 < width; )
 996       {
 997         for (int j=0; j < 8; j++)
 998           result+=filter[i+j]*temp[i+j+get_local_id(1)];
 999         i+=8;
1000       }
1001
1002       for ( ; i < width; i++)
1003         result+=filter[i]*temp[i+get_local_id(1)];
1004
1005       // write back to global
1006       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1007     }
1008   }
1009   )
1010
1011 /*
1012 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1013 %                                                                             %
1014 %                                                                             %
1015 %                                                                             %
1016 %    C o n t r a s t                                                          %
1017 %                                                                             %
1018 %                                                                             %
1019 %                                                                             %
1020 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1021 */
1022
1023   STRINGIFY(
1024
1025   inline float4 ConvertRGBToHSB(const float4 pixel)
1026   {
1027     float4 result=0.0f;
1028     result.w=pixel.w;
1029     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1030     if (tmax != 0.0f)
1031     {
1032       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1033       float delta=tmax-tmin;
1034
1035       result.y=delta/tmax;
1036       result.z=QuantumScale*tmax;
1037       if (delta != 0.0f)
1038       {
1039         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1040         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1041           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1042         result.x/=6.0f;
1043         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1044       }
1045     }
1046     return(result);
1047   }
1048
1049   inline float4 ConvertHSBToRGB(const float4 pixel)
1050   {
1051     float hue=pixel.x;
1052     float saturation=pixel.y;
1053     float brightness=pixel.z;
1054
1055     float4 result=pixel;
1056
1057     if (saturation == 0.0f)
1058     {
1059       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1060     }
1061     else
1062     {
1063       float h=6.0f*(hue-floor(hue));
1064       float f=h-floor(h);
1065       float p=brightness*(1.0f-saturation);
1066       float q=brightness*(1.0f-saturation*f);
1067       float t=brightness*(1.0f-(saturation*(1.0f-f)));
1068       int ih = (int)h;
1069
1070       if (ih == 1)
1071       {
1072         result.x=ClampToQuantum(QuantumRange*q);
1073         result.y=ClampToQuantum(QuantumRange*brightness);
1074         result.z=ClampToQuantum(QuantumRange*p);
1075       }
1076       else if (ih == 2)
1077       {
1078         result.x=ClampToQuantum(QuantumRange*p);
1079         result.y=ClampToQuantum(QuantumRange*brightness);
1080         result.z=ClampToQuantum(QuantumRange*t);
1081       }
1082       else if (ih == 3)
1083       {
1084         result.x=ClampToQuantum(QuantumRange*p);
1085         result.y=ClampToQuantum(QuantumRange*q);
1086         result.z=ClampToQuantum(QuantumRange*brightness);
1087       }
1088       else if (ih == 4)
1089       {
1090         result.x=ClampToQuantum(QuantumRange*t);
1091         result.y=ClampToQuantum(QuantumRange*p);
1092         result.z=ClampToQuantum(QuantumRange*brightness);
1093       }
1094       else if (ih == 5)
1095       {
1096         result.x=ClampToQuantum(QuantumRange*brightness);
1097         result.y=ClampToQuantum(QuantumRange*p);
1098         result.z=ClampToQuantum(QuantumRange*q);
1099       }
1100       else
1101       {
1102         result.x=ClampToQuantum(QuantumRange*brightness);
1103         result.y=ClampToQuantum(QuantumRange*t);
1104         result.z=ClampToQuantum(QuantumRange*p);
1105       }
1106     }
1107     return(result);
1108   }
1109
1110   __kernel void Contrast(__global CLQuantum *image,
1111     const unsigned int number_channels,const int sign)
1112   {
1113     const int x=get_global_id(0);
1114     const int y=get_global_id(1);
1115     const unsigned int columns=get_global_size(0);
1116
1117     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1118     if (number_channels < 3)
1119       pixel.y=pixel.z=pixel.x;
1120
1121     pixel=ConvertRGBToHSB(pixel);
1122     float brightness=pixel.z;
1123     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1124     brightness=clamp(brightness,0.0f,1.0f);
1125     pixel.z=brightness;
1126     pixel=ConvertHSBToRGB(pixel);
1127
1128     WriteAllChannels(image,number_channels,columns,x,y,pixel);
1129   }
1130   )
1131
1132 /*
1133 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1134 %                                                                             %
1135 %                                                                             %
1136 %                                                                             %
1137 %    C o n t r a s t S t r e t c h                                            %
1138 %                                                                             %
1139 %                                                                             %
1140 %                                                                             %
1141 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1142 */
1143
1144     STRINGIFY(
1145     /*
1146     */
1147     __kernel void Histogram(__global CLPixelType * restrict im,
1148       const ChannelType channel,
1149       const unsigned int colorspace,
1150       const unsigned int method,
1151       __global uint4 * restrict histogram)
1152       {
1153         const int x = get_global_id(0);
1154         const int y = get_global_id(1);
1155         const int columns = get_global_size(0);
1156         const int c = x + y * columns;
1157         if ((channel & SyncChannels) != 0)
1158         {
1159           float red=(float)getRed(im[c]);
1160           float green=(float)getGreen(im[c]);
1161           float blue=(float)getBlue(im[c]);
1162
1163           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1164           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1165           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1166         }
1167         else
1168         {
1169           // for equalizing, we always need all channels?
1170           // otherwise something more
1171         }
1172       }
1173     )
1174
1175     STRINGIFY(
1176     /*
1177     */
1178     __kernel void ContrastStretch(__global CLPixelType * restrict im,
1179       const ChannelType channel,
1180       __global CLPixelType * restrict stretch_map,
1181       const float4 white, const float4 black)
1182       {
1183         const int x = get_global_id(0);
1184         const int y = get_global_id(1);
1185         const int columns = get_global_size(0);
1186         const int c = x + y * columns;
1187
1188         uint ePos;
1189         CLPixelType oValue, eValue;
1190         CLQuantum red, green, blue, alpha;
1191
1192         //read from global
1193         oValue=im[c];
1194
1195         if ((channel & RedChannel) != 0)
1196         {
1197           if (getRedF4(white) != getRedF4(black))
1198           {
1199             ePos = ScaleQuantumToMap(getRed(oValue));
1200             eValue = stretch_map[ePos];
1201             red = getRed(eValue);
1202           }
1203         }
1204
1205         if ((channel & GreenChannel) != 0)
1206         {
1207           if (getGreenF4(white) != getGreenF4(black))
1208           {
1209             ePos = ScaleQuantumToMap(getGreen(oValue));
1210             eValue = stretch_map[ePos];
1211             green = getGreen(eValue);
1212           }
1213         }
1214
1215         if ((channel & BlueChannel) != 0)
1216         {
1217           if (getBlueF4(white) != getBlueF4(black))
1218           {
1219             ePos = ScaleQuantumToMap(getBlue(oValue));
1220             eValue = stretch_map[ePos];
1221             blue = getBlue(eValue);
1222           }
1223         }
1224
1225         if ((channel & AlphaChannel) != 0)
1226         {
1227           if (getAlphaF4(white) != getAlphaF4(black))
1228           {
1229             ePos = ScaleQuantumToMap(getAlpha(oValue));
1230             eValue = stretch_map[ePos];
1231             alpha = getAlpha(eValue);
1232           }
1233         }
1234
1235         //write back
1236         im[c]=(CLPixelType)(blue, green, red, alpha);
1237
1238       }
1239     )
1240
1241 /*
1242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1243 %                                                                             %
1244 %                                                                             %
1245 %                                                                             %
1246 %    C o n v o l v e                                                          %
1247 %                                                                             %
1248 %                                                                             %
1249 %                                                                             %
1250 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1251 */
1252
1253   STRINGIFY(
1254     __kernel
1255     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1256     const unsigned int imageWidth, const unsigned int imageHeight,
1257     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1258     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1259
1260       int2 blockID;
1261       blockID.x = get_global_id(0) / get_local_size(0);
1262       blockID.y = get_global_id(1) / get_local_size(1);
1263
1264       // image area processed by this workgroup
1265       int2 imageAreaOrg;
1266       imageAreaOrg.x = blockID.x * get_local_size(0);
1267       imageAreaOrg.y = blockID.y * get_local_size(1);
1268
1269       int2 midFilterDimen;
1270       midFilterDimen.x = (filterWidth-1)/2;
1271       midFilterDimen.y = (filterHeight-1)/2;
1272
1273       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1274
1275       // dimension of the local cache
1276       int2 cachedAreaDimen;
1277       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1278       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1279
1280       // cache the pixels accessed by this workgroup in local memory
1281       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1282       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1283       int groupSize = get_local_size(0) * get_local_size(1);
1284       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1285
1286         int2 cachedAreaIndex;
1287         cachedAreaIndex.x = i % cachedAreaDimen.x;
1288         cachedAreaIndex.y = i / cachedAreaDimen.x;
1289
1290         int2 imagePixelIndex;
1291         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1292
1293         // only support EdgeVirtualPixelMethod through ClampToCanvas
1294         // TODO: implement other virtual pixel method
1295         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1296         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1297
1298         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1299       }
1300
1301       // cache the filter
1302       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1303         filterCache[i] = filter[i];
1304       }
1305       barrier(CLK_LOCAL_MEM_FENCE);
1306
1307
1308       int2 imageIndex;
1309       imageIndex.x = imageAreaOrg.x + get_local_id(0);
1310       imageIndex.y = imageAreaOrg.y + get_local_id(1);
1311
1312       // if out-of-range, stops here and quit
1313       if (imageIndex.x >= imageWidth
1314         || imageIndex.y >= imageHeight) {
1315           return;
1316       }
1317
1318       int filterIndex = 0;
1319       float4 sum = (float4)0.0f;
1320       float gamma = 0.0f;
1321       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1322         int cacheIndexY = get_local_id(1);
1323         for (int j = 0; j < filterHeight; j++) {
1324           int cacheIndexX = get_local_id(0);
1325           for (int i = 0; i < filterWidth; i++) {
1326             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1327             float f = filterCache[filterIndex];
1328
1329             sum.x += f * p.x;
1330             sum.y += f * p.y;
1331             sum.z += f * p.z;
1332             sum.w += f * p.w;
1333
1334             gamma += f;
1335             filterIndex++;
1336             cacheIndexX++;
1337           }
1338           cacheIndexY++;
1339         }
1340       }
1341       else {
1342         int cacheIndexY = get_local_id(1);
1343         for (int j = 0; j < filterHeight; j++) {
1344           int cacheIndexX = get_local_id(0);
1345           for (int i = 0; i < filterWidth; i++) {
1346
1347             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1348             float alpha = QuantumScale*p.w;
1349             float f = filterCache[filterIndex];
1350             float g = alpha * f;
1351
1352             sum.x += g*p.x;
1353             sum.y += g*p.y;
1354             sum.z += g*p.z;
1355             sum.w += f*p.w;
1356
1357             gamma += g;
1358             filterIndex++;
1359             cacheIndexX++;
1360           }
1361           cacheIndexY++;
1362         }
1363         gamma = PerceptibleReciprocal(gamma);
1364         sum.xyz = gamma*sum.xyz;
1365       }
1366       CLPixelType outputPixel;
1367       outputPixel.x = ClampToQuantum(sum.x);
1368       outputPixel.y = ClampToQuantum(sum.y);
1369       outputPixel.z = ClampToQuantum(sum.z);
1370       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1371
1372       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1373     }
1374   )
1375
1376   STRINGIFY(
1377     __kernel
1378     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1379                   const uint imageWidth, const uint imageHeight,
1380                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1381                   const uint matte, const ChannelType channel) {
1382
1383       int2 imageIndex;
1384       imageIndex.x = get_global_id(0);
1385       imageIndex.y = get_global_id(1);
1386
1387       /*
1388       unsigned int imageWidth = get_global_size(0);
1389       unsigned int imageHeight = get_global_size(1);
1390       */
1391       if (imageIndex.x >= imageWidth
1392           || imageIndex.y >= imageHeight)
1393           return;
1394
1395       int2 midFilterDimen;
1396       midFilterDimen.x = (filterWidth-1)/2;
1397       midFilterDimen.y = (filterHeight-1)/2;
1398
1399       int filterIndex = 0;
1400       float4 sum = (float4)0.0f;
1401       float gamma = 0.0f;
1402       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1403         for (int j = 0; j < filterHeight; j++) {
1404           int2 inputPixelIndex;
1405           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1406           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1407           for (int i = 0; i < filterWidth; i++) {
1408             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1409             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1410
1411             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1412             float f = filter[filterIndex];
1413
1414             sum.x += f * p.x;
1415             sum.y += f * p.y;
1416             sum.z += f * p.z;
1417             sum.w += f * p.w;
1418
1419             gamma += f;
1420
1421             filterIndex++;
1422           }
1423         }
1424       }
1425       else {
1426
1427         for (int j = 0; j < filterHeight; j++) {
1428           int2 inputPixelIndex;
1429           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1430           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1431           for (int i = 0; i < filterWidth; i++) {
1432             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1433             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1434
1435             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1436             float alpha = QuantumScale*p.w;
1437             float f = filter[filterIndex];
1438             float g = alpha * f;
1439
1440             sum.x += g*p.x;
1441             sum.y += g*p.y;
1442             sum.z += g*p.z;
1443             sum.w += f*p.w;
1444
1445             gamma += g;
1446
1447
1448             filterIndex++;
1449           }
1450         }
1451         gamma = PerceptibleReciprocal(gamma);
1452         sum.xyz = gamma*sum.xyz;
1453       }
1454
1455       CLPixelType outputPixel;
1456       outputPixel.x = ClampToQuantum(sum.x);
1457       outputPixel.y = ClampToQuantum(sum.y);
1458       outputPixel.z = ClampToQuantum(sum.z);
1459       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1460
1461       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1462     }
1463   )
1464
1465 /*
1466 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1467 %                                                                             %
1468 %                                                                             %
1469 %                                                                             %
1470 %     D e s p e c k l e                                                       %
1471 %                                                                             %
1472 %                                                                             %
1473 %                                                                             %
1474 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1475 */
1476
1477   STRINGIFY(
1478
1479   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1480   , const unsigned int imageWidth, const unsigned int imageHeight
1481   , const int2 offset, const int polarity, const int matte) {
1482
1483     int x = get_global_id(0);
1484     int y = get_global_id(1);
1485
1486     CLPixelType v = inputImage[y*imageWidth+x];
1487
1488     int2 neighbor;
1489     neighbor.y = y + offset.y;
1490     neighbor.x = x + offset.x;
1491
1492     int2 clampedNeighbor;
1493     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1494     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1495
1496     CLPixelType r = (clampedNeighbor.x == neighbor.x
1497                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1498     :(CLPixelType)0;
1499
1500     int sv[4];
1501     sv[0] = (int)v.x;
1502     sv[1] = (int)v.y;
1503     sv[2] = (int)v.z;
1504     sv[3] = (int)v.w;
1505
1506     int sr[4];
1507     sr[0] = (int)r.x;
1508     sr[1] = (int)r.y;
1509     sr[2] = (int)r.z;
1510     sr[3] = (int)r.w;
1511
1512     if (polarity > 0) {
1513       \n #pragma unroll 4\n
1514       for (unsigned int i = 0; i < 4; i++) {
1515         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1516       }
1517     }
1518     else {
1519       \n #pragma unroll 4\n
1520       for (unsigned int i = 0; i < 4; i++) {
1521         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1522       }
1523
1524     }
1525
1526     v.x = (CLQuantum)sv[0];
1527     v.y = (CLQuantum)sv[1];
1528     v.z = (CLQuantum)sv[2];
1529
1530     if (matte!=0)
1531       v.w = (CLQuantum)sv[3];
1532
1533     outputImage[y*imageWidth+x] = v;
1534
1535     }
1536
1537
1538   )
1539
1540   STRINGIFY(
1541
1542   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1543   , const unsigned int imageWidth, const unsigned int imageHeight
1544   , const int2 offset, const int polarity, const int matte) {
1545
1546     int x = get_global_id(0);
1547     int y = get_global_id(1);
1548
1549     CLPixelType v = inputImage[y*imageWidth+x];
1550
1551     int2 neighbor, clampedNeighbor;
1552
1553     neighbor.y = y + offset.y;
1554     neighbor.x = x + offset.x;
1555     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1556     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1557
1558     CLPixelType r = (clampedNeighbor.x == neighbor.x
1559       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1560     :(CLPixelType)0;
1561
1562
1563     neighbor.y = y - offset.y;
1564     neighbor.x = x - offset.x;
1565     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1566     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1567
1568     CLPixelType s = (clampedNeighbor.x == neighbor.x
1569       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1570     :(CLPixelType)0;
1571
1572
1573     int sv[4];
1574     sv[0] = (int)v.x;
1575     sv[1] = (int)v.y;
1576     sv[2] = (int)v.z;
1577     sv[3] = (int)v.w;
1578
1579     int sr[4];
1580     sr[0] = (int)r.x;
1581     sr[1] = (int)r.y;
1582     sr[2] = (int)r.z;
1583     sr[3] = (int)r.w;
1584
1585     int ss[4];
1586     ss[0] = (int)s.x;
1587     ss[1] = (int)s.y;
1588     ss[2] = (int)s.z;
1589     ss[3] = (int)s.w;
1590
1591     if (polarity > 0) {
1592       \n #pragma unroll 4\n
1593       for (unsigned int i = 0; i < 4; i++) {
1594         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1595         //
1596         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1597         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1598         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1599       }
1600     }
1601     else {
1602       \n #pragma unroll 4\n
1603       for (unsigned int i = 0; i < 4; i++) {
1604         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1605         //
1606         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1607         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1608       }
1609     }
1610
1611     v.x = (CLQuantum)sv[0];
1612     v.y = (CLQuantum)sv[1];
1613     v.z = (CLQuantum)sv[2];
1614
1615     if (matte!=0)
1616       v.w = (CLQuantum)sv[3];
1617
1618     outputImage[y*imageWidth+x] = v;
1619
1620     }
1621   )
1622
1623 /*
1624 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1625 %                                                                             %
1626 %                                                                             %
1627 %                                                                             %
1628 %     E q u a l i z e                                                         %
1629 %                                                                             %
1630 %                                                                             %
1631 %                                                                             %
1632 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1633 */
1634
1635     STRINGIFY(
1636     /*
1637     */
1638     __kernel void Equalize(__global CLPixelType * restrict im,
1639       const ChannelType channel,
1640       __global CLPixelType * restrict equalize_map,
1641       const float4 white, const float4 black)
1642       {
1643         const int x = get_global_id(0);
1644         const int y = get_global_id(1);
1645         const int columns = get_global_size(0);
1646         const int c = x + y * columns;
1647
1648         uint ePos;
1649         CLPixelType oValue, eValue;
1650         CLQuantum red, green, blue, alpha;
1651
1652         //read from global
1653         oValue=im[c];
1654
1655         if ((channel & SyncChannels) != 0)
1656         {
1657           if (getRedF4(white) != getRedF4(black))
1658           {
1659             ePos = ScaleQuantumToMap(getRed(oValue));
1660             eValue = equalize_map[ePos];
1661             red = getRed(eValue);
1662             ePos = ScaleQuantumToMap(getGreen(oValue));
1663             eValue = equalize_map[ePos];
1664             green = getRed(eValue);
1665             ePos = ScaleQuantumToMap(getBlue(oValue));
1666             eValue = equalize_map[ePos];
1667             blue = getRed(eValue);
1668             ePos = ScaleQuantumToMap(getAlpha(oValue));
1669             eValue = equalize_map[ePos];
1670             alpha = getRed(eValue);
1671
1672             //write back
1673             im[c]=(CLPixelType)(blue, green, red, alpha);
1674           }
1675
1676         }
1677
1678         // for equalizing, we always need all channels?
1679         // otherwise something more
1680
1681      }
1682     )
1683
1684 /*
1685 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1686 %                                                                             %
1687 %                                                                             %
1688 %                                                                             %
1689 %     F u n c t i o n                                                         %
1690 %                                                                             %
1691 %                                                                             %
1692 %                                                                             %
1693 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1694 */
1695
1696   STRINGIFY(
1697   /*
1698   apply FunctionImageChannel(braightness-contrast)
1699   */
1700   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1701     const unsigned int number_parameters,__constant float *parameters)
1702   {
1703     float result = 0.0f;
1704
1705     switch (function)
1706     {
1707     case PolynomialFunction:
1708       {
1709         for (unsigned int i=0; i < number_parameters; i++)
1710           result = result*QuantumScale*pixel + parameters[i];
1711         result *= QuantumRange;
1712         break;
1713       }
1714     case SinusoidFunction:
1715       {
1716         float  freq,phase,ampl,bias;
1717         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1718         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1719         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1720         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1721         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1722           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1723         break;
1724       }
1725     case ArcsinFunction:
1726       {
1727         float  width,range,center,bias;
1728         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1729         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1730         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1731         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1732
1733         result = 2.0f/width*(QuantumScale*pixel - center);
1734         result = range/MagickPI*asin(result)+bias;
1735         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1736         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1737         result *= QuantumRange;
1738         break;
1739       }
1740     case ArctanFunction:
1741       {
1742         float slope,range,center,bias;
1743         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1744         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1745         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1746         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1747         result = MagickPI*slope*(QuantumScale*pixel-center);
1748         result = QuantumRange*(range/MagickPI*atan(result) + bias);
1749         break;
1750       }
1751     case UndefinedFunction:
1752       break;
1753     }
1754     return(ClampToQuantum(result));
1755   }
1756   )
1757
1758   STRINGIFY(
1759   /*
1760   Improve brightness / contrast of the image
1761   channel : define which channel is improved
1762   function : the function called to enchance the brightness contrast
1763   number_parameters : numbers of parameters
1764   parameters : the parameter
1765   */
1766   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1767     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1768     __constant float *parameters)
1769   {
1770     const unsigned int x = get_global_id(0);
1771     const unsigned int y = get_global_id(1);
1772     const unsigned int columns = get_global_size(0);
1773     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1774
1775     float red;
1776     float green;
1777     float blue;
1778     float alpha;
1779
1780     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1781
1782     if ((channel & RedChannel) != 0)
1783       red=ApplyFunction(red, function, number_parameters, parameters);
1784
1785     if (number_channels > 2)
1786       {
1787         if ((channel & GreenChannel) != 0)
1788           green=ApplyFunction(green, function, number_parameters, parameters);
1789
1790         if ((channel & BlueChannel) != 0)
1791           blue=ApplyFunction(blue, function, number_parameters, parameters);
1792       }
1793
1794     if (((number_channels == 4) || (number_channels == 2)) &&
1795         ((channel & AlphaChannel) != 0))
1796       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1797
1798     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1799   }
1800   )
1801
1802 /*
1803 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1804 %                                                                             %
1805 %                                                                             %
1806 %                                                                             %
1807 %     G r a y s c a l e                                                       %
1808 %                                                                             %
1809 %                                                                             %
1810 %                                                                             %
1811 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1812 */
1813
1814   STRINGIFY(
1815   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1816     const unsigned int colorspace,const unsigned int method)
1817   {
1818     const unsigned int x = get_global_id(0);
1819     const unsigned int y = get_global_id(1);
1820     const unsigned int columns = get_global_size(0);
1821     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1822
1823     float
1824       blue,
1825       green,
1826       red;
1827
1828     red=getPixelRed(p);
1829     green=getPixelGreen(p);
1830     blue=getPixelBlue(p);
1831
1832     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1833
1834     setPixelRed(p,intensity);
1835     setPixelGreen(p,intensity);
1836     setPixelBlue(p,intensity);
1837   }
1838   )
1839
1840 /*
1841 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1842 %                                                                             %
1843 %                                                                             %
1844 %                                                                             %
1845 %     L o c a l C o n t r a s t                                               %
1846 %                                                                             %
1847 %                                                                             %
1848 %                                                                             %
1849 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1850 */
1851
1852     STRINGIFY(
1853
1854       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1855           const int radius,
1856           const int imageWidth,
1857           const int imageHeight)
1858       {
1859         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1860
1861         int x = get_local_id(0);
1862         int y = get_global_id(1);
1863
1864         if ((x >= imageWidth) || (y >= imageHeight))
1865           return;
1866
1867         global CLPixelType *src = srcImage + y * imageWidth;
1868
1869         for (int i = x; i < imageWidth; i += get_local_size(0)) {
1870             float sum = 0.0f;
1871             float weight = 1.0f;
1872
1873             int j = i - radius;
1874             while ((j + 7) < i) {
1875                 for (int k = 0; k < 8; ++k) // Unroll 8x
1876                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1877                 weight += 8.0f;
1878                 j+=8;
1879             }
1880             while (j < i) {
1881                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1882                 weight += 1.0f;
1883                 ++j;
1884             }
1885
1886             while ((j + 7) < radius + i) {
1887                 for (int k = 0; k < 8; ++k) // Unroll 8x
1888                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1889                 weight -= 8.0f;
1890                 j+=8;
1891             }
1892             while (j < radius + i) {
1893                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1894                 weight -= 1.0f;
1895                 ++j;
1896             }
1897
1898             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1899         }
1900       }
1901     )
1902
1903     STRINGIFY(
1904       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1905           const int radius,
1906           const float strength,
1907           const int imageWidth,
1908           const int imageHeight)
1909       {
1910         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1911
1912         int x = get_global_id(0);
1913         int y = get_global_id(1);
1914
1915         if ((x >= imageWidth) || (y >= imageHeight))
1916                 return;
1917
1918         global float *src = blurImage + x;
1919
1920         float sum = 0.0f;
1921         float weight = 1.0f;
1922
1923         int j = y - radius;
1924         while ((j + 7) < y) {
1925             for (int k = 0; k < 8; ++k) // Unroll 8x
1926                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1927             weight += 8.0f;
1928             j+=8;
1929         }
1930         while (j < y) {
1931             sum += weight * src[mirrorBottom(j) * imageWidth];
1932             weight += 1.0f;
1933             ++j;
1934         }
1935
1936         while ((j + 7) < radius + y) {
1937             for (int k = 0; k < 8; ++k) // Unroll 8x
1938                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1939             weight -= 8.0f;
1940             j+=8;
1941         }
1942         while (j < radius + y) {
1943             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1944             weight -= 1.0f;
1945             ++j;
1946         }
1947
1948         CLPixelType pixel = srcImage[x + y * imageWidth];
1949         float srcVal = dot(RGB, convert_float4(pixel));
1950         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1951         mult = (srcVal + mult) / srcVal;
1952
1953         pixel.x = ClampToQuantum(pixel.x * mult);
1954         pixel.y = ClampToQuantum(pixel.y * mult);
1955         pixel.z = ClampToQuantum(pixel.z * mult);
1956
1957         dstImage[x + y * imageWidth] = pixel;
1958       }
1959     )
1960
1961 /*
1962 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1963 %                                                                             %
1964 %                                                                             %
1965 %                                                                             %
1966 %     M o d u l a t e                                                         %
1967 %                                                                             %
1968 %                                                                             %
1969 %                                                                             %
1970 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1971 */
1972
1973   STRINGIFY(
1974
1975   inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1976     float *hue, float *saturation, float *lightness)
1977   {
1978   float
1979     c,
1980     tmax,
1981     tmin;
1982
1983   /*
1984      Convert RGB to HSL colorspace.
1985      */
1986   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1987   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1988
1989   c=tmax-tmin;
1990
1991   *lightness=(tmax+tmin)/2.0;
1992   if (c <= 0.0)
1993   {
1994     *hue=0.0;
1995     *saturation=0.0;
1996     return;
1997   }
1998
1999   if (tmax == (QuantumScale*red))
2000   {
2001     *hue=(QuantumScale*green-QuantumScale*blue)/c;
2002     if ((QuantumScale*green) < (QuantumScale*blue))
2003       *hue+=6.0;
2004   }
2005   else
2006     if (tmax == (QuantumScale*green))
2007       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2008     else
2009       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2010
2011   *hue*=60.0/360.0;
2012   if (*lightness <= 0.5)
2013     *saturation=c/(2.0*(*lightness));
2014   else
2015     *saturation=c/(2.0-2.0*(*lightness));
2016   }
2017
2018   inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2019       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2020   {
2021     float
2022       b,
2023       c,
2024       g,
2025       h,
2026       tmin,
2027       r,
2028       x;
2029
2030     /*
2031        Convert HSL to RGB colorspace.
2032        */
2033     h=hue*360.0;
2034     if (lightness <= 0.5)
2035       c=2.0*lightness*saturation;
2036     else
2037       c=(2.0-2.0*lightness)*saturation;
2038     tmin=lightness-0.5*c;
2039     h-=360.0*floor(h/360.0);
2040     h/=60.0;
2041     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2042     switch ((int) floor(h) % 6)
2043     {
2044       case 0:
2045         {
2046           r=tmin+c;
2047           g=tmin+x;
2048           b=tmin;
2049           break;
2050         }
2051       case 1:
2052         {
2053           r=tmin+x;
2054           g=tmin+c;
2055           b=tmin;
2056           break;
2057         }
2058       case 2:
2059         {
2060           r=tmin;
2061           g=tmin+c;
2062           b=tmin+x;
2063           break;
2064         }
2065       case 3:
2066         {
2067           r=tmin;
2068           g=tmin+x;
2069           b=tmin+c;
2070           break;
2071         }
2072       case 4:
2073         {
2074           r=tmin+x;
2075           g=tmin;
2076           b=tmin+c;
2077           break;
2078         }
2079       case 5:
2080         {
2081           r=tmin+c;
2082           g=tmin;
2083           b=tmin+x;
2084           break;
2085         }
2086       default:
2087         {
2088           r=0.0;
2089           g=0.0;
2090           b=0.0;
2091         }
2092     }
2093     *red=ClampToQuantum(QuantumRange*r);
2094     *green=ClampToQuantum(QuantumRange*g);
2095     *blue=ClampToQuantum(QuantumRange*b);
2096   }
2097
2098   inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2099     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2100   {
2101     float
2102       hue,
2103       lightness,
2104       saturation;
2105
2106     /*
2107     Increase or decrease color lightness, saturation, or hue.
2108     */
2109     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2110     hue+=0.5*(0.01*percent_hue-1.0);
2111     while (hue < 0.0)
2112       hue+=1.0;
2113     while (hue >= 1.0)
2114       hue-=1.0;
2115     saturation*=0.01*percent_saturation;
2116     lightness*=0.01*percent_lightness;
2117     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2118   }
2119
2120   __kernel void Modulate(__global CLPixelType *im,
2121     const float percent_brightness,
2122     const float percent_hue,
2123     const float percent_saturation,
2124     const int colorspace)
2125   {
2126
2127     const int x = get_global_id(0);
2128     const int y = get_global_id(1);
2129     const int columns = get_global_size(0);
2130     const int c = x + y * columns;
2131
2132     CLPixelType pixel = im[c];
2133
2134     CLQuantum
2135         blue,
2136         green,
2137         red;
2138
2139     red=getRed(pixel);
2140     green=getGreen(pixel);
2141     blue=getBlue(pixel);
2142
2143     switch (colorspace)
2144     {
2145       case HSLColorspace:
2146       default:
2147         {
2148           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2149               &red, &green, &blue);
2150         }
2151
2152     }
2153
2154     CLPixelType filteredPixel;
2155
2156     setRed(&filteredPixel, red);
2157     setGreen(&filteredPixel, green);
2158     setBlue(&filteredPixel, blue);
2159     filteredPixel.w = pixel.w;
2160
2161     im[c] = filteredPixel;
2162   }
2163   )
2164
2165 /*
2166 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2167 %                                                                             %
2168 %                                                                             %
2169 %                                                                             %
2170 %     M o t i o n B l u r                                                     %
2171 %                                                                             %
2172 %                                                                             %
2173 %                                                                             %
2174 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2175 */
2176
2177   STRINGIFY(
2178     __kernel
2179     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2180                     const unsigned int imageWidth, const unsigned int imageHeight,
2181                     const __global float *filter, const unsigned int width, const __global int2* offset,
2182                     const float4 bias,
2183                     const ChannelType channel, const unsigned int matte) {
2184
2185       int2 currentPixel;
2186       currentPixel.x = get_global_id(0);
2187       currentPixel.y = get_global_id(1);
2188
2189       if (currentPixel.x >= imageWidth
2190           || currentPixel.y >= imageHeight)
2191           return;
2192
2193       float4 pixel;
2194       pixel.x = (float)bias.x;
2195       pixel.y = (float)bias.y;
2196       pixel.z = (float)bias.z;
2197       pixel.w = (float)bias.w;
2198
2199       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2200
2201         for (int i = 0; i < width; i++) {
2202           // only support EdgeVirtualPixelMethod through ClampToCanvas
2203           // TODO: implement other virtual pixel method
2204           int2 samplePixel = currentPixel + offset[i];
2205           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2206           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2207           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2208
2209           pixel.x += (filter[i] * (float)samplePixelValue.x);
2210           pixel.y += (filter[i] * (float)samplePixelValue.y);
2211           pixel.z += (filter[i] * (float)samplePixelValue.z);
2212           pixel.w += (filter[i] * (float)samplePixelValue.w);
2213         }
2214
2215         CLPixelType outputPixel;
2216         outputPixel.x = ClampToQuantum(pixel.x);
2217         outputPixel.y = ClampToQuantum(pixel.y);
2218         outputPixel.z = ClampToQuantum(pixel.z);
2219         outputPixel.w = ClampToQuantum(pixel.w);
2220         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2221       }
2222       else {
2223
2224         float gamma = 0.0f;
2225         for (int i = 0; i < width; i++) {
2226           // only support EdgeVirtualPixelMethod through ClampToCanvas
2227           // TODO: implement other virtual pixel method
2228           int2 samplePixel = currentPixel + offset[i];
2229           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2230           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2231
2232           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2233
2234           float alpha = QuantumScale*samplePixelValue.w;
2235           float k = filter[i];
2236           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2237           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2238           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2239
2240           pixel.w += k * alpha * samplePixelValue.w;
2241
2242           gamma+=k*alpha;
2243         }
2244         gamma = PerceptibleReciprocal(gamma);
2245         pixel.xyz = gamma*pixel.xyz;
2246
2247         CLPixelType outputPixel;
2248         outputPixel.x = ClampToQuantum(pixel.x);
2249         outputPixel.y = ClampToQuantum(pixel.y);
2250         outputPixel.z = ClampToQuantum(pixel.z);
2251         outputPixel.w = ClampToQuantum(pixel.w);
2252         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2253       }
2254     }
2255   )
2256
2257 /*
2258 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2259 %                                                                             %
2260 %                                                                             %
2261 %                                                                             %
2262 %     R e s i z e                                                             %
2263 %                                                                             %
2264 %                                                                             %
2265 %                                                                             %
2266 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2267 */
2268
2269   STRINGIFY(
2270   // Based on Box from resize.c
2271   float BoxResizeFilter(const float x)
2272   {
2273     return 1.0f;
2274   }
2275   )
2276
2277   STRINGIFY(
2278   // Based on CubicBC from resize.c
2279   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2280   {
2281     /*
2282     Cubic Filters using B,C determined values:
2283     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2284     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2285     Spline              B = 1   C = 0    B-Spline Gaussian approximation
2286     Hermite             B = 0   C = 0    B-Spline interpolator
2287
2288     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2289     Graphics Computer Graphics, Volume 22, Number 4, August 1988
2290     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2291     Mitchell.pdf.
2292
2293     Coefficents are determined from B,C values:
2294     P0 = (  6 - 2*B       )/6 = coeff[0]
2295     P1 =         0
2296     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2297     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2298     Q0 = (      8*B +24*C )/6 = coeff[3]
2299     Q1 = (    -12*B -48*C )/6 = coeff[4]
2300     Q2 = (      6*B +30*C )/6 = coeff[5]
2301     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2302
2303     which are used to define the filter:
2304
2305     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2306     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2307
2308     which ensures function is continuous in value and derivative (slope).
2309     */
2310     if (x < 1.0)
2311       return(resizeFilterCoefficients[0]+x*(x*
2312       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2313     if (x < 2.0)
2314       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2315       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2316     return(0.0);
2317   }
2318   )
2319
2320   STRINGIFY(
2321   float Sinc(const float x)
2322   {
2323     if (x != 0.0f)
2324     {
2325       const float alpha=(float) (MagickPI*x);
2326       return sinpi(x)/alpha;
2327     }
2328     return(1.0f);
2329   }
2330   )
2331
2332   STRINGIFY(
2333   float Triangle(const float x)
2334   {
2335     /*
2336     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2337     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2338     for Sinc().
2339     */
2340     return ((x<1.0f)?(1.0f-x):0.0f);
2341   }
2342   )
2343
2344
2345   STRINGIFY(
2346   float Hann(const float x)
2347   {
2348     /*
2349     Cosine window function:
2350       0.5+0.5*cos(pi*x).
2351     */
2352     const float cosine=cos((MagickPI*x));
2353     return(0.5f+0.5f*cosine);
2354   }
2355   )
2356
2357   STRINGIFY(
2358   float Hamming(const float x)
2359   {
2360     /*
2361       Offset cosine window function:
2362        .54 + .46 cos(pi x).
2363     */
2364     const float cosine=cos((MagickPI*x));
2365     return(0.54f+0.46f*cosine);
2366   }
2367   )
2368
2369   STRINGIFY(
2370   float Blackman(const float x)
2371   {
2372     /*
2373       Blackman: 2nd order cosine windowing function:
2374         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2375
2376       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2377       five flops.
2378     */
2379     const float cosine=cos((MagickPI*x));
2380     return(0.34f+cosine*(0.5f+cosine*0.16f));
2381   }
2382   )
2383
2384   STRINGIFY(
2385   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2386   {
2387     switch (filterType)
2388     {
2389     /* Call Sinc even for SincFast to get better precision on GPU
2390        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2391     case SincWeightingFunction:
2392     case SincFastWeightingFunction:
2393       return Sinc(x);
2394     case CubicBCWeightingFunction:
2395       return CubicBC(x,filterCoefficients);
2396     case BoxWeightingFunction:
2397       return BoxResizeFilter(x);
2398     case TriangleWeightingFunction:
2399       return Triangle(x);
2400     case HannWeightingFunction:
2401       return Hann(x);
2402     case HammingWeightingFunction:
2403       return Hamming(x);
2404     case BlackmanWeightingFunction:
2405       return Blackman(x);
2406
2407     default:
2408       return 0.0f;
2409     }
2410   }
2411   )
2412
2413
2414   STRINGIFY(
2415   inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2416            , const ResizeWeightingFunctionType resizeWindowType
2417            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2418   {
2419     float scale;
2420     float xBlur = fabs(x/resizeFilterBlur);
2421     if (resizeWindowSupport < MagickEpsilon
2422         || resizeWindowType == BoxWeightingFunction)
2423     {
2424       scale = 1.0f;
2425     }
2426     else
2427     {
2428       scale = resizeFilterScale;
2429       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2430     }
2431     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2432     return weight;
2433   }
2434
2435   )
2436
2437   ;
2438   const char *accelerateKernels2 =
2439
2440   STRINGIFY(
2441
2442   inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2443     return (numWorkItems/pixelPerWorkgroup);
2444   }
2445
2446   // returns the index of the pixel for the current workitem to compute.
2447   // returns -1 if this workitem doesn't need to participate in any computation
2448   inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2449     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2450     int pixelIndex = itemID/numWorkItemsPerPixel;
2451     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2452     return pixelIndex;
2453   }
2454
2455   )
2456
2457   STRINGIFY(
2458   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2459     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2460       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2461       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2462       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2463       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2464       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2465       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2466       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2467   {
2468     // calculate the range of resized image pixels computed by this workgroup
2469     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2470     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2471     const unsigned int actualNumPixelToCompute = stopX - startX;
2472
2473     // calculate the range of input image pixels to cache
2474     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2475     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2476     scale = PerceptibleReciprocal(scale);
2477
2478     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2479     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2480
2481     // cache the input pixels into local memory
2482     const unsigned int y = get_global_id(1);
2483     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2484     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2485     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2486     wait_group_events(1, &e);
2487
2488     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2489     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2490     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2491     {
2492       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2493       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2494       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2495
2496       // determine which resized pixel computed by this workitem
2497       const unsigned int itemID = get_local_id(0);
2498       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2499
2500       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2501
2502       float4 filteredPixel = (float4)0.0f;
2503       float density = 0.0f;
2504       float gamma = 0.0f;
2505       // -1 means this workitem doesn't participate in the computation
2506       if (pixelIndex != -1)
2507       {
2508         // x coordinated of the resized pixel computed by this workitem
2509         const int x = chunkStartX + pixelIndex;
2510
2511         // calculate how many steps required for this pixel
2512         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2513         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2514         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2515         const unsigned int n = stop - start;
2516
2517         // calculate how many steps this workitem will contribute
2518         unsigned int numStepsPerWorkItem = n / numItems;
2519         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2520
2521         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2522         if (startStep < n)
2523         {
2524           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2525
2526           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2527           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2528           {
2529             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2530               (ResizeWeightingFunctionType) resizeFilterType,
2531               (ResizeWeightingFunctionType) resizeWindowType,
2532               resizeFilterScale, resizeFilterWindowSupport,
2533               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2534
2535             float4 cp = (float4)0.0f;
2536
2537             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2538             cp.x = (float) *(p);
2539             if (number_channels > 2)
2540             {
2541               cp.y = (float) *(p + 1);
2542               cp.z = (float) *(p + 2);
2543             }
2544
2545             if (alpha_index != 0)
2546             {
2547               cp.w = (float) *(p + alpha_index);
2548
2549               float alpha = weight * QuantumScale * cp.w;
2550
2551               filteredPixel.x += alpha * cp.x;
2552               filteredPixel.y += alpha * cp.y;
2553               filteredPixel.z += alpha * cp.z;
2554               filteredPixel.w += weight * cp.w;
2555               gamma += alpha;
2556             }
2557             else
2558               filteredPixel += ((float4) weight)*cp;
2559
2560             density += weight;
2561           }
2562         }
2563       }
2564
2565       // initialize the accumulators to zero
2566       if (itemID < actualNumPixelInThisChunk) {
2567         outputPixelCache[itemID] = (float4)0.0f;
2568         densityCache[itemID] = 0.0f;
2569         if (alpha_index != 0)
2570           gammaCache[itemID] = 0.0f;
2571       }
2572       barrier(CLK_LOCAL_MEM_FENCE);
2573
2574       // accumulatte the filtered pixel value and the density
2575       for (unsigned int i = 0; i < numItems; i++) {
2576         if (pixelIndex != -1) {
2577           if (itemID%numItems == i) {
2578             outputPixelCache[pixelIndex]+=filteredPixel;
2579             densityCache[pixelIndex]+=density;
2580             if (alpha_index != 0)
2581               gammaCache[pixelIndex]+=gamma;
2582           }
2583         }
2584         barrier(CLK_LOCAL_MEM_FENCE);
2585       }
2586
2587       if (itemID < actualNumPixelInThisChunk)
2588       {
2589         float4 filteredPixel = outputPixelCache[itemID];
2590
2591         float gamma = 0.0f;
2592         if (alpha_index != 0)
2593           gamma = gammaCache[itemID];
2594
2595         float density = densityCache[itemID];
2596         if ((density != 0.0f) && (density != 1.0f))
2597         {
2598           density = PerceptibleReciprocal(density);
2599           filteredPixel *= (float4) density;
2600           if (alpha_index != 0)
2601             gamma *= density;
2602         }
2603
2604         if (alpha_index != 0)
2605         {
2606           gamma = PerceptibleReciprocal(gamma);
2607           filteredPixel.x *= gamma;
2608           filteredPixel.y *= gamma;
2609           filteredPixel.z *= gamma;
2610         }
2611
2612         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2613       }
2614     }
2615   }
2616   )
2617
2618
2619   STRINGIFY(
2620  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2621     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2622       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2623       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2624       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2625       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2626       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2627       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2628       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2629   {
2630     // calculate the range of resized image pixels computed by this workgroup
2631     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2632     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2633     const unsigned int actualNumPixelToCompute = stopY - startY;
2634
2635     // calculate the range of input image pixels to cache
2636     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2637     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2638     scale = PerceptibleReciprocal(scale);
2639
2640     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2641     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2642
2643     // cache the input pixels into local memory
2644     const unsigned int x = get_global_id(0);
2645     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2646     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2647     unsigned int stride = inputColumns * number_channels;
2648     for (unsigned int i = 0; i < number_channels; i++)
2649     {
2650       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2651       wait_group_events(1,&e);
2652     }
2653
2654     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2655     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2656     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2657     {
2658       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2659       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2660       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2661
2662       // determine which resized pixel computed by this workitem
2663       const unsigned int itemID = get_local_id(1);
2664       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2665
2666       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2667
2668       float4 filteredPixel = (float4)0.0f;
2669       float density = 0.0f;
2670       float gamma = 0.0f;
2671       // -1 means this workitem doesn't participate in the computation
2672       if (pixelIndex != -1)
2673       {
2674         // x coordinated of the resized pixel computed by this workitem
2675         const int y = chunkStartY + pixelIndex;
2676
2677         // calculate how many steps required for this pixel
2678         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2679         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2680         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2681         const unsigned int n = stop - start;
2682
2683         // calculate how many steps this workitem will contribute
2684         unsigned int numStepsPerWorkItem = n / numItems;
2685         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2686
2687         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2688         if (startStep < n)
2689         {
2690           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2691
2692           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2693           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2694           {
2695             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2696               (ResizeWeightingFunctionType) resizeFilterType,
2697               (ResizeWeightingFunctionType) resizeWindowType,
2698               resizeFilterScale, resizeFilterWindowSupport,
2699               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2700
2701             float4 cp = (float4)0.0f;
2702
2703             __local CLQuantum *p = inputImageCache + cacheIndex;
2704             cp.x = (float) *(p);
2705             if (number_channels > 2)
2706             {
2707               cp.y = (float) *(p + rangeLength);
2708               cp.z = (float) *(p + (rangeLength * 2));
2709             }
2710
2711             if (alpha_index != 0)
2712             {
2713               cp.w = (float) *(p + (rangeLength * alpha_index));
2714
2715               float alpha = weight * QuantumScale * cp.w;
2716
2717               filteredPixel.x += alpha * cp.x;
2718               filteredPixel.y += alpha * cp.y;
2719               filteredPixel.z += alpha * cp.z;
2720               filteredPixel.w += weight * cp.w;
2721               gamma += alpha;
2722             }
2723             else
2724               filteredPixel += ((float4) weight)*cp;
2725
2726             density += weight;
2727           }
2728         }
2729       }
2730
2731       // initialize the accumulators to zero
2732       if (itemID < actualNumPixelInThisChunk) {
2733         outputPixelCache[itemID] = (float4)0.0f;
2734         densityCache[itemID] = 0.0f;
2735         if (alpha_index != 0)
2736           gammaCache[itemID] = 0.0f;
2737       }
2738       barrier(CLK_LOCAL_MEM_FENCE);
2739
2740       // accumulatte the filtered pixel value and the density
2741       for (unsigned int i = 0; i < numItems; i++) {
2742         if (pixelIndex != -1) {
2743           if (itemID%numItems == i) {
2744             outputPixelCache[pixelIndex]+=filteredPixel;
2745             densityCache[pixelIndex]+=density;
2746             if (alpha_index != 0)
2747               gammaCache[pixelIndex]+=gamma;
2748           }
2749         }
2750         barrier(CLK_LOCAL_MEM_FENCE);
2751       }
2752
2753       if (itemID < actualNumPixelInThisChunk)
2754       {
2755         float4 filteredPixel = outputPixelCache[itemID];
2756
2757         float gamma = 0.0f;
2758         if (alpha_index != 0)
2759           gamma = gammaCache[itemID];
2760
2761         float density = densityCache[itemID];
2762         if ((density != 0.0f) && (density != 1.0f))
2763         {
2764           density = PerceptibleReciprocal(density);
2765           filteredPixel *= (float4) density;
2766           if (alpha_index != 0)
2767             gamma *= density;
2768         }
2769
2770         if (alpha_index != 0)
2771         {
2772           gamma = PerceptibleReciprocal(gamma);
2773           filteredPixel.x *= gamma;
2774           filteredPixel.y *= gamma;
2775           filteredPixel.z *= gamma;
2776         }
2777
2778         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2779       }
2780     }
2781   }
2782   )
2783
2784 /*
2785 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2786 %                                                                             %
2787 %                                                                             %
2788 %                                                                             %
2789 %     R o t a t i o n a l B l u r                                             %
2790 %                                                                             %
2791 %                                                                             %
2792 %                                                                             %
2793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2794 */
2795
2796   STRINGIFY(
2797   __kernel void RotationalBlur(const __global CLQuantum *image,
2798     const unsigned int number_channels,const unsigned int channel,
2799     const float2 blurCenter,__constant float *cos_theta,
2800     __constant float *sin_theta,const unsigned int cossin_theta_size,
2801     __global CLQuantum *filteredImage)
2802   {
2803     const int x = get_global_id(0);
2804     const int y = get_global_id(1);
2805     const int columns = get_global_size(0);
2806     const int rows = get_global_size(1);
2807     unsigned int step = 1;
2808     float center_x = (float) x - blurCenter.x;
2809     float center_y = (float) y - blurCenter.y;
2810     float radius = hypot(center_x, center_y);
2811
2812     float blur_radius = hypot(blurCenter.x, blurCenter.y);
2813
2814     if (radius > MagickEpsilon)
2815     {
2816       step = (unsigned int) (blur_radius / radius);
2817       if (step == 0)
2818         step = 1;
2819       if (step >= cossin_theta_size)
2820         step = cossin_theta_size-1;
2821     }
2822
2823     float4 result = 0.0f;
2824     float normalize = 0.0f;
2825     float gamma = 0.0f;
2826
2827     for (unsigned int i=0; i<cossin_theta_size; i+=step)
2828     {
2829       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2830       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2831
2832       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2833
2834       if ((number_channels == 4) || (number_channels == 2))
2835       {
2836         float alpha = (float)(QuantumScale*pixel.w);
2837
2838         gamma += alpha;
2839
2840         result.x += alpha * pixel.x;
2841         result.y += alpha * pixel.y;
2842         result.z += alpha * pixel.z;
2843         result.w += pixel.w;
2844       }
2845       else
2846         result += pixel;
2847
2848       normalize += 1.0f;
2849     }
2850
2851     normalize = PerceptibleReciprocal(normalize);
2852
2853     if ((number_channels == 4) || (number_channels == 2))
2854     {
2855       gamma = PerceptibleReciprocal(gamma);
2856       result.x *= gamma;
2857       result.y *= gamma;
2858       result.z *= gamma;
2859       result.w *= normalize;
2860     }
2861     else
2862       result *= normalize;
2863
2864     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2865   }
2866   )
2867
2868 /*
2869 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2870 %                                                                             %
2871 %                                                                             %
2872 %                                                                             %
2873 %     U n s h a r p M a s k                                                   %
2874 %                                                                             %
2875 %                                                                             %
2876 %                                                                             %
2877 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2878 */
2879
2880   STRINGIFY(
2881   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2882     const __global float4 *blurRowData,const unsigned int number_channels,
2883     const ChannelType channel,const unsigned int columns,
2884     const unsigned int rows,__local float4* cachedData,
2885     __local float* cachedFilter,const __global float *filter,
2886     const unsigned int width,const float gain, const float threshold,
2887     __global CLQuantum *filteredImage)
2888   {
2889     const unsigned int radius = (width-1)/2;
2890
2891     // cache the pixel shared by the workgroup
2892     const int groupX = get_group_id(0);
2893     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2894     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2895
2896     if ((groupStartY >= 0) && (groupStopY < rows))
2897     {
2898       event_t e = async_work_group_strided_copy(cachedData,
2899         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2900       wait_group_events(1,&e);
2901     }
2902     else
2903     {
2904       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2905         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2906
2907       barrier(CLK_LOCAL_MEM_FENCE);
2908     }
2909     // cache the filter as well
2910     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2911     wait_group_events(1,&e);
2912
2913     // only do the work if this is not a patched item
2914     const int cy = get_global_id(1);
2915
2916     if (cy < rows)
2917     {
2918       float4 blurredPixel = (float4) 0.0f;
2919
2920       int i = 0;
2921
2922       for ( ; i+7 < width; )
2923       {
2924         for (int j=0; j < 8; j++, i++)
2925           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2926       }
2927
2928       for ( ; i < width; i++)
2929         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2930
2931       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2932       float4 outputPixel = inputImagePixel - blurredPixel;
2933
2934       float quantumThreshold = QuantumRange*threshold;
2935
2936       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2937       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2938
2939       //write back
2940       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2941     }
2942   }
2943   )
2944
2945   STRINGIFY(
2946   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2947     const ChannelType channel,__constant float *filter,const unsigned int width,
2948     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2949     const float gain,const float threshold,__global CLQuantum *filteredImage)
2950   {
2951     const unsigned int x = get_global_id(0);
2952     const unsigned int y = get_global_id(1);
2953
2954     const unsigned int radius = (width - 1) / 2;
2955
2956     int row = y - radius;
2957     int baseRow = get_group_id(1) * get_local_size(1) - radius;
2958     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2959
2960     while (row < endRow) {
2961       int srcy = (row < 0) ? -row : row; // mirror pad
2962       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2963
2964       float4 value = 0.0f;
2965
2966       int ix = x - radius;
2967       int i = 0;
2968
2969       while (i + 7 < width) {
2970         for (int j = 0; j < 8; ++j) { // unrolled
2971           int srcx = ix + j;
2972           srcx = (srcx < 0) ? -srcx : srcx;
2973           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2974           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2975         }
2976         ix += 8;
2977         i += 8;
2978       }
2979
2980       while (i < width) {
2981         int srcx = (ix < 0) ? -ix : ix; // mirror pad
2982         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2983         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2984         ++i;
2985         ++ix;
2986       }
2987       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2988       row += get_local_size(1);
2989     }
2990
2991     barrier(CLK_LOCAL_MEM_FENCE);
2992
2993     const int px = get_local_id(0);
2994     const int py = get_local_id(1);
2995     const int prp = get_local_size(0);
2996     float4 value = (float4)(0.0f);
2997
2998     int i = 0;
2999     while (i + 7 < width) {
3000       for (int j = 0; j < 8; ++j) // unrolled
3001         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
3002       i += 8;
3003     }
3004     while (i < width) {
3005       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3006       ++i;
3007     }
3008
3009     if ((x < columns) && (y < rows)) {
3010       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3011       float4 diff = srcPixel - value;
3012
3013       float quantumThreshold = QuantumRange*threshold;
3014
3015       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3016       value = select(srcPixel + diff * gain, srcPixel, mask);
3017
3018       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3019     }
3020   }
3021   )
3022
3023   STRINGIFY(
3024     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3025     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3026       const unsigned int number_channels,const unsigned int max_channels,
3027       const float threshold,const int passes,const unsigned int imageWidth,
3028       const unsigned int imageHeight)
3029   {
3030     const int pad = (1 << (passes - 1));
3031     const int tileSize = 64;
3032     const int tileRowPixels = 64;
3033     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3034
3035     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3036
3037     local float buffer[64 * 64];
3038
3039     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3040     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3041
3042     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3043       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3044                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3045
3046       for (int channel = 0; channel < max_channels; ++channel)
3047         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3048     }
3049
3050     for (int channel = 0; channel < max_channels; ++channel) {
3051       // Load LDS
3052       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3053         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3054
3055       // Process
3056
3057       float tmp[16];
3058       float accum[16];
3059       float pixel;
3060
3061       for (int i = 0; i < 16; i++)
3062         accum[i]=0.0f;
3063
3064       for (int pass = 0; pass < passes; ++pass) {
3065         const int radius = 1 << pass;
3066         const int x = get_local_id(0);
3067         const float thresh = threshold * noise[pass];
3068
3069         // Apply horizontal hat
3070         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3071           const int offset = i * tileRowPixels;
3072           if (pass == 0)
3073             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3074           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3075           barrier(CLK_LOCAL_MEM_FENCE);
3076           buffer[x + offset] = pixel;
3077         }
3078         barrier(CLK_LOCAL_MEM_FENCE);
3079
3080         // Apply vertical hat
3081         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3082           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3083           float delta = tmp[i / 4] - pixel;
3084           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3085           if (delta < -thresh)
3086             delta += thresh;
3087           else if (delta > thresh)
3088             delta -= thresh;
3089           else
3090             delta = 0;
3091           accum[i / 4] += delta;
3092         }
3093         barrier(CLK_LOCAL_MEM_FENCE);
3094
3095         if (pass < passes - 1)
3096           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3097             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3098         else  // last pass
3099           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3100             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3101         barrier(CLK_LOCAL_MEM_FENCE);
3102       }
3103
3104       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3105         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3106
3107       barrier(CLK_LOCAL_MEM_FENCE);
3108     }
3109
3110     // Write from stage to output
3111
3112     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3113       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3114         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3115           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3116           for (int channel = 0; channel < max_channels; ++channel) {
3117             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3118           }
3119         }
3120       }
3121     }
3122   }
3123   )
3124
3125   ;
3126
3127 #endif // MAGICKCORE_OPENCL_SUPPORT
3128
3129 #if defined(__cplusplus) || defined(c_plusplus)
3130 }
3131 #endif
3132
3133 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H