2 Copyright 1999-2018 ImageMagick Studio LLC, a non-profit organization
3 dedicated to making software imaging solutions freely available.
5 You may not use this file except in compliance with the License.
6 obtain a copy of the License at
8 https://www.imagemagick.org/script/license.php
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
16 MagickCore private kernels for accelerated functions.
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
22 #if defined(__cplusplus) || defined(c_plusplus)
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
38 const char *accelerateKernels =
43 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49 OPENCL_DEFINE(SigmaRandom, (attenuate))
50 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
61 RGBColorspace, /* Linear RGB colorspace */
62 GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */
63 TransparentColorspace,
72 CMYKColorspace, /* negared linear RGB with black separated */
73 sRGBColorspace, /* Default: non-lienar sRGB colorspace */
78 Rec601YCbCrColorspace,
80 Rec709YCbCrColorspace,
82 CMYColorspace, /* negated linear RGB colorspace */
85 LCHColorspace, /* alias for LCHuv */
87 LCHabColorspace, /* Cylindrical (Polar) Lab */
88 LCHuvColorspace, /* Cylindrical (Polar) Luv */
91 HSVColorspace, /* alias for HSB */
100 UndefinedCompositeOp,
102 ModulusAddCompositeOp,
106 ChangeMaskCompositeOp,
108 ColorBurnCompositeOp,
109 ColorDodgeCompositeOp,
111 CopyBlackCompositeOp,
115 CopyGreenCompositeOp,
116 CopyMagentaCompositeOp,
117 CopyOpacityCompositeOp,
119 CopyYellowCompositeOp,
126 DifferenceCompositeOp,
129 ExclusionCompositeOp,
130 HardLightCompositeOp,
134 LinearLightCompositeOp,
146 SoftLightCompositeOp,
152 ModulusSubtractCompositeOp,
153 ThresholdCompositeOp,
155 /* These are new operators, added after the above was last sorted.
156 * The list should be re-sorted only when a new library version is
159 DivideDstCompositeOp,
162 PegtopLightCompositeOp,
163 VividLightCompositeOp,
165 LinearDodgeCompositeOp,
166 LinearBurnCompositeOp,
167 MathematicsCompositeOp,
168 DivideSrcCompositeOp,
170 DarkenIntensityCompositeOp,
171 LightenIntensityCompositeOp
192 MultiplicativeGaussianNoise,
203 UndefinedPixelIntensityMethod = 0,
204 AveragePixelIntensityMethod,
205 BrightnessPixelIntensityMethod,
206 LightnessPixelIntensityMethod,
207 Rec601LumaPixelIntensityMethod,
208 Rec601LuminancePixelIntensityMethod,
209 Rec709LumaPixelIntensityMethod,
210 Rec709LuminancePixelIntensityMethod,
211 RMSPixelIntensityMethod,
212 MSPixelIntensityMethod
213 } PixelIntensityMethod;
218 BoxWeightingFunction = 0,
219 TriangleWeightingFunction,
220 CubicBCWeightingFunction,
221 HannWeightingFunction,
222 HammingWeightingFunction,
223 BlackmanWeightingFunction,
224 GaussianWeightingFunction,
225 QuadraticWeightingFunction,
226 JincWeightingFunction,
227 SincWeightingFunction,
228 SincFastWeightingFunction,
229 KaiserWeightingFunction,
230 WelshWeightingFunction,
231 BohmanWeightingFunction,
232 LagrangeWeightingFunction,
233 CosineWeightingFunction,
234 } ResizeWeightingFunctionType;
240 UndefinedChannel = 0x0000,
242 GrayChannel = 0x0001,
243 CyanChannel = 0x0001,
244 GreenChannel = 0x0002,
245 MagentaChannel = 0x0002,
246 BlueChannel = 0x0004,
247 YellowChannel = 0x0004,
248 BlackChannel = 0x0008,
249 AlphaChannel = 0x0010,
250 OpacityChannel = 0x0010,
251 IndexChannel = 0x0020, /* Color Index Table? */
252 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */
253 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */
254 MetaChannel = 0x0100, /* ???? */
255 CompositeChannels = 0x001F,
256 AllChannels = 0x7ffffff,
258 Special purpose channel types.
259 FUTURE: are these needed any more - they are more like hacks
260 SyncChannels for example is NOT a real channel but a 'flag'
261 It really says -- "User has not defined channels"
262 Though it does have extra meaning in the "-auto-level" operator
264 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
265 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */
266 GrayChannels = 0x0400,
267 SyncChannels = 0x20000, /* channels modified as a single unit */
268 DefaultChannels = AllChannels
269 } ChannelType; /* must correspond to PixelChannel */
276 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
279 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
281 return((CLQuantum) value);
285 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
288 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
290 return((CLQuantum) (257.0f*value));
294 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
297 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
299 return((CLQuantum) (16843009.0*value));
306 inline int ClampToCanvas(const int offset,const int range)
308 return clamp(offset, (int)0, range-1);
313 inline CLQuantum ClampToQuantum(const float value)
315 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
320 inline uint ScaleQuantumToMap(CLQuantum value)
322 if (value >= (CLQuantum) MaxMap)
323 return ((uint)MaxMap);
325 return ((uint)value);
330 inline float PerceptibleReciprocal(const float x)
332 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
333 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
338 inline float RoundToUnity(const float value)
340 return clamp(value,0.0f,1.0f);
346 inline unsigned int getPixelIndex(const unsigned int number_channels,
347 const unsigned int columns, const unsigned int x, const unsigned int y)
349 return (x * number_channels) + (y * columns * number_channels);
352 inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; }
353 inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
354 inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); }
355 inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
357 inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; }
358 inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
359 inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; }
360 inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
362 inline CLQuantum getBlue(CLPixelType p) { return p.x; }
363 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
364 inline float getBlueF4(float4 p) { return p.x; }
365 inline void setBlueF4(float4* p, float value) { (*p).x = value; }
367 inline CLQuantum getGreen(CLPixelType p) { return p.y; }
368 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
369 inline float getGreenF4(float4 p) { return p.y; }
370 inline void setGreenF4(float4* p, float value) { (*p).y = value; }
372 inline CLQuantum getRed(CLPixelType p) { return p.z; }
373 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
374 inline float getRedF4(float4 p) { return p.z; }
375 inline void setRedF4(float4* p, float value) { (*p).z = value; }
377 inline CLQuantum getAlpha(CLPixelType p) { return p.w; }
378 inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
379 inline float getAlphaF4(float4 p) { return p.w; }
380 inline void setAlphaF4(float4* p, float value) { (*p).w = value; }
382 inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
383 const ChannelType channel, float *red, float *green, float *blue, float *alpha)
385 if ((channel & RedChannel) != 0)
388 if (number_channels > 2)
390 if ((channel & GreenChannel) != 0)
391 *green=getPixelGreen(p);
393 if ((channel & BlueChannel) != 0)
394 *blue=getPixelBlue(p);
397 if (((number_channels == 4) || (number_channels == 2)) &&
398 ((channel & AlphaChannel) != 0))
399 *alpha=getPixelAlpha(p,number_channels);
402 inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
403 const unsigned int columns, const unsigned int x, const unsigned int y)
405 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
409 pixel.x=getPixelRed(p);
411 if (number_channels > 2)
413 pixel.y=getPixelGreen(p);
414 pixel.z=getPixelBlue(p);
417 if ((number_channels == 4) || (number_channels == 2))
418 pixel.w=getPixelAlpha(p,number_channels);
422 inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
423 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
425 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
432 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
433 return (float4)(red, green, blue, alpha);
436 inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
437 const ChannelType channel, float red, float green, float blue, float alpha)
439 if ((channel & RedChannel) != 0)
440 setPixelRed(p,ClampToQuantum(red));
442 if (number_channels > 2)
444 if ((channel & GreenChannel) != 0)
445 setPixelGreen(p,ClampToQuantum(green));
447 if ((channel & BlueChannel) != 0)
448 setPixelBlue(p,ClampToQuantum(blue));
451 if (((number_channels == 4) || (number_channels == 2)) &&
452 ((channel & AlphaChannel) != 0))
453 setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
456 inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
457 const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
459 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
461 setPixelRed(p,ClampToQuantum(pixel.x));
463 if (number_channels > 2)
465 setPixelGreen(p,ClampToQuantum(pixel.y));
466 setPixelBlue(p,ClampToQuantum(pixel.z));
469 if ((number_channels == 4) || (number_channels == 2))
470 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
473 inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
474 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
477 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
478 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
481 inline float GetPixelIntensity(const unsigned int colorspace,
482 const unsigned int method,float red,float green,float blue)
486 if (colorspace == GRAYColorspace)
491 case AveragePixelIntensityMethod:
493 intensity=(red+green+blue)/3.0;
496 case BrightnessPixelIntensityMethod:
498 intensity=MagickMax(MagickMax(red,green),blue);
501 case LightnessPixelIntensityMethod:
503 intensity=(MagickMin(MagickMin(red,green),blue)+
504 MagickMax(MagickMax(red,green),blue))/2.0;
507 case MSPixelIntensityMethod:
509 intensity=(float) (((float) red*red+green*green+blue*blue)/
513 case Rec601LumaPixelIntensityMethod:
516 if (image->colorspace == RGBColorspace)
518 red=EncodePixelGamma(red);
519 green=EncodePixelGamma(green);
520 blue=EncodePixelGamma(blue);
523 intensity=0.298839*red+0.586811*green+0.114350*blue;
526 case Rec601LuminancePixelIntensityMethod:
529 if (image->colorspace == sRGBColorspace)
531 red=DecodePixelGamma(red);
532 green=DecodePixelGamma(green);
533 blue=DecodePixelGamma(blue);
536 intensity=0.298839*red+0.586811*green+0.114350*blue;
539 case Rec709LumaPixelIntensityMethod:
543 if (image->colorspace == RGBColorspace)
545 red=EncodePixelGamma(red);
546 green=EncodePixelGamma(green);
547 blue=EncodePixelGamma(blue);
550 intensity=0.212656*red+0.715158*green+0.072186*blue;
553 case Rec709LuminancePixelIntensityMethod:
556 if (image->colorspace == sRGBColorspace)
558 red=DecodePixelGamma(red);
559 green=DecodePixelGamma(green);
560 blue=DecodePixelGamma(blue);
563 intensity=0.212656*red+0.715158*green+0.072186*blue;
566 case RMSPixelIntensityMethod:
568 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
577 inline int mirrorBottom(int value)
579 return (value < 0) ? - (value) : value;
582 inline int mirrorTop(int value, int width)
584 return (value >= width) ? (2 * width - value - 1) : value;
589 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
597 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
602 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
603 This is provided under BSD, full license is with the main package.
604 See http://www.doc.ic.ac.uk/~dt10/research
608 // Post: r=(a+b) mod M
609 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
612 //if( (v>=M) || (v<a) )
613 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
619 // Post: r=(a*b) mod M
620 // This could be done more efficently, but it is portable, and should
621 // be easy to understand. It can be replaced with any of the better
622 // modular multiplication algorithms (for example if you know you have
623 // double precision available or something).
624 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
629 r=MWC_AddMod64(r,b,M);
630 b=MWC_AddMod64(b,b,M);
637 // Post: r=(a^b) mod M
638 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
639 // most architectures
640 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
645 acc=MWC_MulMod64(acc,sqr,M);
646 sqr=MWC_MulMod64(sqr,sqr,M);
652 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
654 ulong m=MWC_PowMod64(A, distance, M);
655 ulong x=curr.x*(ulong)A+curr.y;
656 x=MWC_MulMod64(x, m, M);
657 return (uint2)((uint)(x/A), (uint)(x%A));
660 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
662 // This is an arbitrary constant for starting LCG jumping from. I didn't
663 // want to start from 1, as then you end up with the two or three first values
664 // being a bit poor in ones - once you've decided that, one constant is as
665 // good as any another. There is no deep mathematical reason for it, I just
666 // generated a random number.
667 enum{ MWC_BASEID = 4077358422479273989UL };
669 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
670 ulong m=MWC_PowMod64(A, dist, M);
672 ulong x=MWC_MulMod64(MWC_BASEID, m, M);
673 return (uint2)((uint)(x/A), (uint)(x%A));
676 //! Represents the state of a particular generator
677 typedef struct{ uint x; uint c; } mwc64x_state_t;
679 enum{ MWC64X_A = 4294883355U };
680 enum{ MWC64X_M = 18446383549859758079UL };
682 void MWC64X_Step(mwc64x_state_t *s)
686 uint Xn=MWC64X_A*X+C;
687 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
688 uint Cn=mad_hi(MWC64X_A,X,carry);
694 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
696 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
701 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
703 uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
708 //! Return a 32-bit integer in the range [0..2^32)
709 uint MWC64X_NextUint(mwc64x_state_t *s)
711 uint res=s->x ^ s->c;
717 // End of MWC64X excerpt
720 float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
722 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
725 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
734 alpha=mwcReadPseudoRandomValue(r);
740 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
751 beta=mwcReadPseudoRandomValue(r);
752 gamma=sqrt(-2.0f*log(alpha));
753 sigma=gamma*cospi((2.0f*beta));
754 tau=gamma*sinpi((2.0f*beta));
755 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
760 if (alpha < (SigmaImpulse/2.0f))
763 if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
773 if (alpha <= MagickEpsilon)
774 noise=(pixel-QuantumRange);
776 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
781 if (beta <= (0.5f*MagickEpsilon))
782 noise=(pixel+QuantumRange);
784 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
787 case MultiplicativeGaussianNoise:
790 if (alpha > MagickEpsilon)
791 sigma=sqrt(-2.0f*log(alpha));
792 beta=mwcReadPseudoRandomValue(r);
793 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
794 cospi((2.0f*beta))/2.0f);
802 poisson=exp(-SigmaPoisson*QuantumScale*pixel);
803 for (i=0; alpha > poisson; i++)
805 beta=mwcReadPseudoRandomValue(r);
808 noise=(QuantumRange*i/SigmaPoisson);
813 noise=(QuantumRange*SigmaRandom*alpha);
821 void AddNoise(const __global CLQuantum *image,
822 const unsigned int number_channels,const ChannelType channel,
823 const unsigned int length,const unsigned int pixelsPerWorkItem,
824 const NoiseType noise_type,const float attenuate,const unsigned int seed0,
825 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
826 __global CLQuantum *filteredImage)
832 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
833 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
834 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
836 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
837 uint count = pixelsPerWorkItem;
839 while (count > 0 && pos < length)
841 const __global CLQuantum *p = image + pos;
842 __global CLQuantum *q = filteredImage + pos;
849 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
851 if ((channel & RedChannel) != 0)
852 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
854 if (number_channels > 2)
856 if ((channel & GreenChannel) != 0)
857 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
859 if ((channel & BlueChannel) != 0)
860 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
863 if (((number_channels == 4) || (number_channels == 2)) &&
864 ((channel & AlphaChannel) != 0))
865 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
867 WriteChannels(q, number_channels, channel, red, green, blue, alpha);
869 pos += (get_local_size(0) * number_channels);
876 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
884 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
889 Reduce image noise and reduce detail levels by row
891 __kernel void BlurRow(const __global CLQuantum *image,
892 const unsigned int number_channels,const ChannelType channel,
893 __constant float *filter,const unsigned int width,
894 const unsigned int imageColumns,const unsigned int imageRows,
895 __local float4 *temp,__global float4 *tempImage)
897 const int x = get_global_id(0);
898 const int y = get_global_id(1);
900 const int columns = imageColumns;
902 const unsigned int radius = (width-1)/2;
903 const int wsize = get_local_size(0);
904 const unsigned int loadSize = wsize+width;
907 const int groupX=get_local_size(0)*get_group_id(0);
909 //parallel load and clamp
910 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
912 int cx = ClampToCanvas(i + groupX - radius, columns);
913 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
917 barrier(CLK_LOCAL_MEM_FENCE);
919 // only do the work if this is not a patched item
920 if (get_global_id(0) < columns)
923 float4 result = (float4) 0;
927 for ( ; i+7 < width; )
929 for (int j=0; j < 8; j++)
930 result+=filter[i+j]*temp[i+j+get_local_id(0)];
934 for ( ; i < width; i++)
935 result+=filter[i]*temp[i+get_local_id(0)];
937 // write back to global
938 tempImage[y*columns+x] = result;
945 Reduce image noise and reduce detail levels by line
947 __kernel void BlurColumn(const __global float4 *blurRowData,
948 const unsigned int number_channels,const ChannelType channel,
949 __constant float *filter,const unsigned int width,
950 const unsigned int imageColumns,const unsigned int imageRows,
951 __local float4 *temp,__global CLQuantum *filteredImage)
953 const int x = get_global_id(0);
954 const int y = get_global_id(1);
956 const int columns = imageColumns;
957 const int rows = imageRows;
959 unsigned int radius = (width-1)/2;
960 const int wsize = get_local_size(1);
961 const unsigned int loadSize = wsize+width;
964 const int groupX=get_local_size(0)*get_group_id(0);
965 const int groupY=get_local_size(1)*get_group_id(1);
966 //notice that get_local_size(0) is 1, so
967 //groupX=get_group_id(0);
969 //parallel load and clamp
970 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
971 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
974 barrier(CLK_LOCAL_MEM_FENCE);
976 // only do the work if this is not a patched item
977 if (get_global_id(1) < rows)
980 float4 result = (float4) 0;
984 for ( ; i+7 < width; )
986 for (int j=0; j < 8; j++)
987 result+=filter[i+j]*temp[i+j+get_local_id(1)];
991 for ( ; i < width; i++)
992 result+=filter[i]*temp[i+get_local_id(1)];
994 // write back to global
995 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1001 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1009 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1014 inline float4 ConvertRGBToHSB(const float4 pixel)
1018 float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1021 float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1022 float delta=tmax-tmin;
1024 result.y=delta/tmax;
1025 result.z=QuantumScale*tmax;
1028 result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1029 result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1030 (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1032 result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1038 inline float4 ConvertHSBToRGB(const float4 pixel)
1041 float saturation=pixel.y;
1042 float brightness=pixel.z;
1044 float4 result=pixel;
1046 if (saturation == 0.0f)
1048 result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1052 float h=6.0f*(hue-floor(hue));
1054 float p=brightness*(1.0f-saturation);
1055 float q=brightness*(1.0f-saturation*f);
1056 float t=brightness*(1.0f-(saturation*(1.0f-f)));
1061 result.x=ClampToQuantum(QuantumRange*q);
1062 result.y=ClampToQuantum(QuantumRange*brightness);
1063 result.z=ClampToQuantum(QuantumRange*p);
1067 result.x=ClampToQuantum(QuantumRange*p);
1068 result.y=ClampToQuantum(QuantumRange*brightness);
1069 result.z=ClampToQuantum(QuantumRange*t);
1073 result.x=ClampToQuantum(QuantumRange*p);
1074 result.y=ClampToQuantum(QuantumRange*q);
1075 result.z=ClampToQuantum(QuantumRange*brightness);
1079 result.x=ClampToQuantum(QuantumRange*t);
1080 result.y=ClampToQuantum(QuantumRange*p);
1081 result.z=ClampToQuantum(QuantumRange*brightness);
1085 result.x=ClampToQuantum(QuantumRange*brightness);
1086 result.y=ClampToQuantum(QuantumRange*p);
1087 result.z=ClampToQuantum(QuantumRange*q);
1091 result.x=ClampToQuantum(QuantumRange*brightness);
1092 result.y=ClampToQuantum(QuantumRange*t);
1093 result.z=ClampToQuantum(QuantumRange*p);
1099 __kernel void Contrast(__global CLQuantum *image,
1100 const unsigned int number_channels,const int sign)
1102 const int x=get_global_id(0);
1103 const int y=get_global_id(1);
1104 const unsigned int columns=get_global_size(0);
1106 float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1107 if (number_channels < 3)
1108 pixel.y=pixel.z=pixel.x;
1110 pixel=ConvertRGBToHSB(pixel);
1111 float brightness=pixel.z;
1112 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1113 brightness=clamp(brightness,0.0f,1.0f);
1115 pixel=ConvertHSBToRGB(pixel);
1117 WriteAllChannels(image,number_channels,columns,x,y,pixel);
1122 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1126 % C o n t r a s t S t r e t c h %
1130 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1136 __kernel void Histogram(__global CLPixelType * restrict im,
1137 const ChannelType channel,
1138 const unsigned int colorspace,
1139 const unsigned int method,
1140 __global uint4 * restrict histogram)
1142 const int x = get_global_id(0);
1143 const int y = get_global_id(1);
1144 const int columns = get_global_size(0);
1145 const int c = x + y * columns;
1146 if ((channel & SyncChannels) != 0)
1148 float red=(float)getRed(im[c]);
1149 float green=(float)getGreen(im[c]);
1150 float blue=(float)getBlue(im[c]);
1152 float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1153 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1154 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1158 // for equalizing, we always need all channels?
1159 // otherwise something more
1167 __kernel void ContrastStretch(__global CLPixelType * restrict im,
1168 const ChannelType channel,
1169 __global CLPixelType * restrict stretch_map,
1170 const float4 white, const float4 black)
1172 const int x = get_global_id(0);
1173 const int y = get_global_id(1);
1174 const int columns = get_global_size(0);
1175 const int c = x + y * columns;
1178 CLPixelType oValue, eValue;
1179 CLQuantum red, green, blue, alpha;
1184 if ((channel & RedChannel) != 0)
1186 if (getRedF4(white) != getRedF4(black))
1188 ePos = ScaleQuantumToMap(getRed(oValue));
1189 eValue = stretch_map[ePos];
1190 red = getRed(eValue);
1194 if ((channel & GreenChannel) != 0)
1196 if (getGreenF4(white) != getGreenF4(black))
1198 ePos = ScaleQuantumToMap(getGreen(oValue));
1199 eValue = stretch_map[ePos];
1200 green = getGreen(eValue);
1204 if ((channel & BlueChannel) != 0)
1206 if (getBlueF4(white) != getBlueF4(black))
1208 ePos = ScaleQuantumToMap(getBlue(oValue));
1209 eValue = stretch_map[ePos];
1210 blue = getBlue(eValue);
1214 if ((channel & AlphaChannel) != 0)
1216 if (getAlphaF4(white) != getAlphaF4(black))
1218 ePos = ScaleQuantumToMap(getAlpha(oValue));
1219 eValue = stretch_map[ePos];
1220 alpha = getAlpha(eValue);
1225 im[c]=(CLPixelType)(blue, green, red, alpha);
1231 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1239 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1244 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1245 const unsigned int imageWidth, const unsigned int imageHeight,
1246 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1247 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1250 blockID.x = get_global_id(0) / get_local_size(0);
1251 blockID.y = get_global_id(1) / get_local_size(1);
1253 // image area processed by this workgroup
1255 imageAreaOrg.x = blockID.x * get_local_size(0);
1256 imageAreaOrg.y = blockID.y * get_local_size(1);
1258 int2 midFilterDimen;
1259 midFilterDimen.x = (filterWidth-1)/2;
1260 midFilterDimen.y = (filterHeight-1)/2;
1262 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1264 // dimension of the local cache
1265 int2 cachedAreaDimen;
1266 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1267 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1269 // cache the pixels accessed by this workgroup in local memory
1270 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1271 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1272 int groupSize = get_local_size(0) * get_local_size(1);
1273 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1275 int2 cachedAreaIndex;
1276 cachedAreaIndex.x = i % cachedAreaDimen.x;
1277 cachedAreaIndex.y = i / cachedAreaDimen.x;
1279 int2 imagePixelIndex;
1280 imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1282 // only support EdgeVirtualPixelMethod through ClampToCanvas
1283 // TODO: implement other virtual pixel method
1284 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1285 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1287 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1291 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1292 filterCache[i] = filter[i];
1294 barrier(CLK_LOCAL_MEM_FENCE);
1298 imageIndex.x = imageAreaOrg.x + get_local_id(0);
1299 imageIndex.y = imageAreaOrg.y + get_local_id(1);
1301 // if out-of-range, stops here and quit
1302 if (imageIndex.x >= imageWidth
1303 || imageIndex.y >= imageHeight) {
1307 int filterIndex = 0;
1308 float4 sum = (float4)0.0f;
1310 if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1311 int cacheIndexY = get_local_id(1);
1312 for (int j = 0; j < filterHeight; j++) {
1313 int cacheIndexX = get_local_id(0);
1314 for (int i = 0; i < filterWidth; i++) {
1315 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1316 float f = filterCache[filterIndex];
1331 int cacheIndexY = get_local_id(1);
1332 for (int j = 0; j < filterHeight; j++) {
1333 int cacheIndexX = get_local_id(0);
1334 for (int i = 0; i < filterWidth; i++) {
1336 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1337 float alpha = QuantumScale*p.w;
1338 float f = filterCache[filterIndex];
1339 float g = alpha * f;
1352 gamma = PerceptibleReciprocal(gamma);
1353 sum.xyz = gamma*sum.xyz;
1355 CLPixelType outputPixel;
1356 outputPixel.x = ClampToQuantum(sum.x);
1357 outputPixel.y = ClampToQuantum(sum.y);
1358 outputPixel.z = ClampToQuantum(sum.z);
1359 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1361 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1367 void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1368 const uint imageWidth, const uint imageHeight,
1369 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1370 const uint matte, const ChannelType channel) {
1373 imageIndex.x = get_global_id(0);
1374 imageIndex.y = get_global_id(1);
1377 unsigned int imageWidth = get_global_size(0);
1378 unsigned int imageHeight = get_global_size(1);
1380 if (imageIndex.x >= imageWidth
1381 || imageIndex.y >= imageHeight)
1384 int2 midFilterDimen;
1385 midFilterDimen.x = (filterWidth-1)/2;
1386 midFilterDimen.y = (filterHeight-1)/2;
1388 int filterIndex = 0;
1389 float4 sum = (float4)0.0f;
1391 if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1392 for (int j = 0; j < filterHeight; j++) {
1393 int2 inputPixelIndex;
1394 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1395 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1396 for (int i = 0; i < filterWidth; i++) {
1397 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1398 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1400 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1401 float f = filter[filterIndex];
1416 for (int j = 0; j < filterHeight; j++) {
1417 int2 inputPixelIndex;
1418 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1419 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1420 for (int i = 0; i < filterWidth; i++) {
1421 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1422 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1424 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1425 float alpha = QuantumScale*p.w;
1426 float f = filter[filterIndex];
1427 float g = alpha * f;
1440 gamma = PerceptibleReciprocal(gamma);
1441 sum.xyz = gamma*sum.xyz;
1444 CLPixelType outputPixel;
1445 outputPixel.x = ClampToQuantum(sum.x);
1446 outputPixel.y = ClampToQuantum(sum.y);
1447 outputPixel.z = ClampToQuantum(sum.z);
1448 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1450 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1455 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1459 % D e s p e c k l e %
1463 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1468 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1469 , const unsigned int imageWidth, const unsigned int imageHeight
1470 , const int2 offset, const int polarity, const int matte) {
1472 int x = get_global_id(0);
1473 int y = get_global_id(1);
1475 CLPixelType v = inputImage[y*imageWidth+x];
1478 neighbor.y = y + offset.y;
1479 neighbor.x = x + offset.x;
1481 int2 clampedNeighbor;
1482 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1483 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1485 CLPixelType r = (clampedNeighbor.x == neighbor.x
1486 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1502 \n #pragma unroll 4\n
1503 for (unsigned int i = 0; i < 4; i++) {
1504 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1508 \n #pragma unroll 4\n
1509 for (unsigned int i = 0; i < 4; i++) {
1510 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1515 v.x = (CLQuantum)sv[0];
1516 v.y = (CLQuantum)sv[1];
1517 v.z = (CLQuantum)sv[2];
1520 v.w = (CLQuantum)sv[3];
1522 outputImage[y*imageWidth+x] = v;
1531 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1532 , const unsigned int imageWidth, const unsigned int imageHeight
1533 , const int2 offset, const int polarity, const int matte) {
1535 int x = get_global_id(0);
1536 int y = get_global_id(1);
1538 CLPixelType v = inputImage[y*imageWidth+x];
1540 int2 neighbor, clampedNeighbor;
1542 neighbor.y = y + offset.y;
1543 neighbor.x = x + offset.x;
1544 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1545 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1547 CLPixelType r = (clampedNeighbor.x == neighbor.x
1548 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1552 neighbor.y = y - offset.y;
1553 neighbor.x = x - offset.x;
1554 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1555 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1557 CLPixelType s = (clampedNeighbor.x == neighbor.x
1558 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1581 \n #pragma unroll 4\n
1582 for (unsigned int i = 0; i < 4; i++) {
1583 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1585 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1586 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1587 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1591 \n #pragma unroll 4\n
1592 for (unsigned int i = 0; i < 4; i++) {
1593 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1595 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1596 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1600 v.x = (CLQuantum)sv[0];
1601 v.y = (CLQuantum)sv[1];
1602 v.z = (CLQuantum)sv[2];
1605 v.w = (CLQuantum)sv[3];
1607 outputImage[y*imageWidth+x] = v;
1613 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1621 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1627 __kernel void Equalize(__global CLPixelType * restrict im,
1628 const ChannelType channel,
1629 __global CLPixelType * restrict equalize_map,
1630 const float4 white, const float4 black)
1632 const int x = get_global_id(0);
1633 const int y = get_global_id(1);
1634 const int columns = get_global_size(0);
1635 const int c = x + y * columns;
1638 CLPixelType oValue, eValue;
1639 CLQuantum red, green, blue, alpha;
1644 if ((channel & SyncChannels) != 0)
1646 if (getRedF4(white) != getRedF4(black))
1648 ePos = ScaleQuantumToMap(getRed(oValue));
1649 eValue = equalize_map[ePos];
1650 red = getRed(eValue);
1651 ePos = ScaleQuantumToMap(getGreen(oValue));
1652 eValue = equalize_map[ePos];
1653 green = getRed(eValue);
1654 ePos = ScaleQuantumToMap(getBlue(oValue));
1655 eValue = equalize_map[ePos];
1656 blue = getRed(eValue);
1657 ePos = ScaleQuantumToMap(getAlpha(oValue));
1658 eValue = equalize_map[ePos];
1659 alpha = getRed(eValue);
1662 im[c]=(CLPixelType)(blue, green, red, alpha);
1667 // for equalizing, we always need all channels?
1668 // otherwise something more
1674 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1682 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1687 apply FunctionImageChannel(braightness-contrast)
1689 CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1690 const unsigned int number_parameters,__constant float *parameters)
1692 float result = 0.0f;
1696 case PolynomialFunction:
1698 for (unsigned int i=0; i < number_parameters; i++)
1699 result = result*QuantumScale*pixel + parameters[i];
1700 result *= QuantumRange;
1703 case SinusoidFunction:
1705 float freq,phase,ampl,bias;
1706 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1707 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1708 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1709 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1710 result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1711 (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1714 case ArcsinFunction:
1716 float width,range,center,bias;
1717 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1718 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1719 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1720 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1722 result = 2.0f/width*(QuantumScale*pixel - center);
1723 result = range/MagickPI*asin(result)+bias;
1724 result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1725 result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1726 result *= QuantumRange;
1729 case ArctanFunction:
1731 float slope,range,center,bias;
1732 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1733 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1734 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1735 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1736 result = MagickPI*slope*(QuantumScale*pixel-center);
1737 result = QuantumRange*(range/MagickPI*atan(result) + bias);
1740 case UndefinedFunction:
1743 return(ClampToQuantum(result));
1749 Improve brightness / contrast of the image
1750 channel : define which channel is improved
1751 function : the function called to enchance the brightness contrast
1752 number_parameters : numbers of parameters
1753 parameters : the parameter
1755 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1756 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1757 __constant float *parameters)
1759 const unsigned int x = get_global_id(0);
1760 const unsigned int y = get_global_id(1);
1761 const unsigned int columns = get_global_size(0);
1762 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1769 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1771 if ((channel & RedChannel) != 0)
1772 red=ApplyFunction(red, function, number_parameters, parameters);
1774 if (number_channels > 2)
1776 if ((channel & GreenChannel) != 0)
1777 green=ApplyFunction(green, function, number_parameters, parameters);
1779 if ((channel & BlueChannel) != 0)
1780 blue=ApplyFunction(blue, function, number_parameters, parameters);
1783 if (((number_channels == 4) || (number_channels == 2)) &&
1784 ((channel & AlphaChannel) != 0))
1785 alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1787 WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1792 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1796 % G r a y s c a l e %
1800 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1804 __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1805 const unsigned int colorspace,const unsigned int method)
1807 const unsigned int x = get_global_id(0);
1808 const unsigned int y = get_global_id(1);
1809 const unsigned int columns = get_global_size(0);
1810 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1818 green=getPixelGreen(p);
1819 blue=getPixelBlue(p);
1821 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1823 setPixelRed(p,intensity);
1824 setPixelGreen(p,intensity);
1825 setPixelBlue(p,intensity);
1830 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1834 % L o c a l C o n t r a s t %
1838 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1843 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1845 const int imageWidth,
1846 const int imageHeight)
1848 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1850 int x = get_local_id(0);
1851 int y = get_global_id(1);
1853 if ((x >= imageWidth) || (y >= imageHeight))
1856 global CLPixelType *src = srcImage + y * imageWidth;
1858 for (int i = x; i < imageWidth; i += get_local_size(0)) {
1860 float weight = 1.0f;
1863 while ((j + 7) < i) {
1864 for (int k = 0; k < 8; ++k) // Unroll 8x
1865 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1870 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1875 while ((j + 7) < radius + i) {
1876 for (int k = 0; k < 8; ++k) // Unroll 8x
1877 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1881 while (j < radius + i) {
1882 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1887 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1893 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1895 const float strength,
1896 const int imageWidth,
1897 const int imageHeight)
1899 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1901 int x = get_global_id(0);
1902 int y = get_global_id(1);
1904 if ((x >= imageWidth) || (y >= imageHeight))
1907 global float *src = blurImage + x;
1910 float weight = 1.0f;
1913 while ((j + 7) < y) {
1914 for (int k = 0; k < 8; ++k) // Unroll 8x
1915 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1920 sum += weight * src[mirrorBottom(j) * imageWidth];
1925 while ((j + 7) < radius + y) {
1926 for (int k = 0; k < 8; ++k) // Unroll 8x
1927 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1931 while (j < radius + y) {
1932 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1937 CLPixelType pixel = srcImage[x + y * imageWidth];
1938 float srcVal = dot(RGB, convert_float4(pixel));
1939 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1940 mult = (srcVal + mult) / srcVal;
1942 pixel.x = ClampToQuantum(pixel.x * mult);
1943 pixel.y = ClampToQuantum(pixel.y * mult);
1944 pixel.z = ClampToQuantum(pixel.z * mult);
1946 dstImage[x + y * imageWidth] = pixel;
1951 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1959 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1964 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1965 float *hue, float *saturation, float *lightness)
1973 Convert RGB to HSL colorspace.
1975 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1976 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1980 *lightness=(tmax+tmin)/2.0;
1988 if (tmax == (QuantumScale*red))
1990 *hue=(QuantumScale*green-QuantumScale*blue)/c;
1991 if ((QuantumScale*green) < (QuantumScale*blue))
1995 if (tmax == (QuantumScale*green))
1996 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1998 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2001 if (*lightness <= 0.5)
2002 *saturation=c/(2.0*(*lightness));
2004 *saturation=c/(2.0-2.0*(*lightness));
2007 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2008 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2020 Convert HSL to RGB colorspace.
2023 if (lightness <= 0.5)
2024 c=2.0*lightness*saturation;
2026 c=(2.0-2.0*lightness)*saturation;
2027 tmin=lightness-0.5*c;
2028 h-=360.0*floor(h/360.0);
2030 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2031 switch ((int) floor(h) % 6)
2082 *red=ClampToQuantum(QuantumRange*r);
2083 *green=ClampToQuantum(QuantumRange*g);
2084 *blue=ClampToQuantum(QuantumRange*b);
2087 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2088 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2096 Increase or decrease color lightness, saturation, or hue.
2098 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2099 hue+=0.5*(0.01*percent_hue-1.0);
2104 saturation*=0.01*percent_saturation;
2105 lightness*=0.01*percent_lightness;
2106 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2109 __kernel void Modulate(__global CLPixelType *im,
2110 const float percent_brightness,
2111 const float percent_hue,
2112 const float percent_saturation,
2113 const int colorspace)
2116 const int x = get_global_id(0);
2117 const int y = get_global_id(1);
2118 const int columns = get_global_size(0);
2119 const int c = x + y * columns;
2121 CLPixelType pixel = im[c];
2129 green=getGreen(pixel);
2130 blue=getBlue(pixel);
2137 ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2138 &red, &green, &blue);
2143 CLPixelType filteredPixel;
2145 setRed(&filteredPixel, red);
2146 setGreen(&filteredPixel, green);
2147 setBlue(&filteredPixel, blue);
2148 filteredPixel.w = pixel.w;
2150 im[c] = filteredPixel;
2155 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2159 % M o t i o n B l u r %
2163 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2168 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2169 const unsigned int imageWidth, const unsigned int imageHeight,
2170 const __global float *filter, const unsigned int width, const __global int2* offset,
2172 const ChannelType channel, const unsigned int matte) {
2175 currentPixel.x = get_global_id(0);
2176 currentPixel.y = get_global_id(1);
2178 if (currentPixel.x >= imageWidth
2179 || currentPixel.y >= imageHeight)
2183 pixel.x = (float)bias.x;
2184 pixel.y = (float)bias.y;
2185 pixel.z = (float)bias.z;
2186 pixel.w = (float)bias.w;
2188 if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2190 for (int i = 0; i < width; i++) {
2191 // only support EdgeVirtualPixelMethod through ClampToCanvas
2192 // TODO: implement other virtual pixel method
2193 int2 samplePixel = currentPixel + offset[i];
2194 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2195 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2196 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2198 pixel.x += (filter[i] * (float)samplePixelValue.x);
2199 pixel.y += (filter[i] * (float)samplePixelValue.y);
2200 pixel.z += (filter[i] * (float)samplePixelValue.z);
2201 pixel.w += (filter[i] * (float)samplePixelValue.w);
2204 CLPixelType outputPixel;
2205 outputPixel.x = ClampToQuantum(pixel.x);
2206 outputPixel.y = ClampToQuantum(pixel.y);
2207 outputPixel.z = ClampToQuantum(pixel.z);
2208 outputPixel.w = ClampToQuantum(pixel.w);
2209 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2214 for (int i = 0; i < width; i++) {
2215 // only support EdgeVirtualPixelMethod through ClampToCanvas
2216 // TODO: implement other virtual pixel method
2217 int2 samplePixel = currentPixel + offset[i];
2218 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2219 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2221 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2223 float alpha = QuantumScale*samplePixelValue.w;
2224 float k = filter[i];
2225 pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2226 pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2227 pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2229 pixel.w += k * alpha * samplePixelValue.w;
2233 gamma = PerceptibleReciprocal(gamma);
2234 pixel.xyz = gamma*pixel.xyz;
2236 CLPixelType outputPixel;
2237 outputPixel.x = ClampToQuantum(pixel.x);
2238 outputPixel.y = ClampToQuantum(pixel.y);
2239 outputPixel.z = ClampToQuantum(pixel.z);
2240 outputPixel.w = ClampToQuantum(pixel.w);
2241 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2247 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2255 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2259 // Based on Box from resize.c
2260 float BoxResizeFilter(const float x)
2267 // Based on CubicBC from resize.c
2268 float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2271 Cubic Filters using B,C determined values:
2272 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2273 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2274 Spline B = 1 C = 0 B-Spline Gaussian approximation
2275 Hermite B = 0 C = 0 B-Spline interpolator
2277 See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2278 Graphics Computer Graphics, Volume 22, Number 4, August 1988
2279 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2282 Coefficents are determined from B,C values:
2283 P0 = ( 6 - 2*B )/6 = coeff[0]
2285 P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2286 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2287 Q0 = ( 8*B +24*C )/6 = coeff[3]
2288 Q1 = ( -12*B -48*C )/6 = coeff[4]
2289 Q2 = ( 6*B +30*C )/6 = coeff[5]
2290 Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2292 which are used to define the filter:
2294 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2295 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2297 which ensures function is continuous in value and derivative (slope).
2300 return(resizeFilterCoefficients[0]+x*(x*
2301 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2303 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2304 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2310 float Sinc(const float x)
2314 const float alpha=(float) (MagickPI*x);
2315 return sinpi(x)/alpha;
2322 float Triangle(const float x)
2325 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2326 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2329 return ((x<1.0f)?(1.0f-x):0.0f);
2335 float Hann(const float x)
2338 Cosine window function:
2341 const float cosine=cos((MagickPI*x));
2342 return(0.5f+0.5f*cosine);
2347 float Hamming(const float x)
2350 Offset cosine window function:
2351 .54 + .46 cos(pi x).
2353 const float cosine=cos((MagickPI*x));
2354 return(0.54f+0.46f*cosine);
2359 float Blackman(const float x)
2362 Blackman: 2nd order cosine windowing function:
2363 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2365 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2368 const float cosine=cos((MagickPI*x));
2369 return(0.34f+cosine*(0.5f+cosine*0.16f));
2374 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2378 /* Call Sinc even for SincFast to get better precision on GPU
2379 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2380 case SincWeightingFunction:
2381 case SincFastWeightingFunction:
2383 case CubicBCWeightingFunction:
2384 return CubicBC(x,filterCoefficients);
2385 case BoxWeightingFunction:
2386 return BoxResizeFilter(x);
2387 case TriangleWeightingFunction:
2389 case HannWeightingFunction:
2391 case HammingWeightingFunction:
2393 case BlackmanWeightingFunction:
2404 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2405 , const ResizeWeightingFunctionType resizeWindowType
2406 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2409 float xBlur = fabs(x/resizeFilterBlur);
2410 if (resizeWindowSupport < MagickEpsilon
2411 || resizeWindowType == BoxWeightingFunction)
2417 scale = resizeFilterScale;
2418 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2420 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2427 const char *accelerateKernels2 =
2431 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2432 return (numWorkItems/pixelPerWorkgroup);
2435 // returns the index of the pixel for the current workitem to compute.
2436 // returns -1 if this workitem doesn't need to participate in any computation
2437 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2438 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2439 int pixelIndex = itemID/numWorkItemsPerPixel;
2440 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2447 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2448 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2449 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2450 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2451 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2452 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2453 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2454 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2455 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2457 // calculate the range of resized image pixels computed by this workgroup
2458 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2459 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2460 const unsigned int actualNumPixelToCompute = stopX - startX;
2462 // calculate the range of input image pixels to cache
2463 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2464 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2465 scale = PerceptibleReciprocal(scale);
2467 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2468 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2470 // cache the input pixels into local memory
2471 const unsigned int y = get_global_id(1);
2472 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2473 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2474 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2475 wait_group_events(1, &e);
2477 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2478 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2479 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2481 const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2482 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2483 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2485 // determine which resized pixel computed by this workitem
2486 const unsigned int itemID = get_local_id(0);
2487 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2489 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2491 float4 filteredPixel = (float4)0.0f;
2492 float density = 0.0f;
2494 // -1 means this workitem doesn't participate in the computation
2495 if (pixelIndex != -1)
2497 // x coordinated of the resized pixel computed by this workitem
2498 const int x = chunkStartX + pixelIndex;
2500 // calculate how many steps required for this pixel
2501 const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2502 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2503 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2504 const unsigned int n = stop - start;
2506 // calculate how many steps this workitem will contribute
2507 unsigned int numStepsPerWorkItem = n / numItems;
2508 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2510 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2513 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2515 unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2516 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2518 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2519 (ResizeWeightingFunctionType) resizeFilterType,
2520 (ResizeWeightingFunctionType) resizeWindowType,
2521 resizeFilterScale, resizeFilterWindowSupport,
2522 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2524 float4 cp = (float4)0.0f;
2526 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2527 cp.x = (float) *(p);
2528 if (number_channels > 2)
2530 cp.y = (float) *(p + 1);
2531 cp.z = (float) *(p + 2);
2534 if (alpha_index != 0)
2536 cp.w = (float) *(p + alpha_index);
2538 float alpha = weight * QuantumScale * cp.w;
2540 filteredPixel.x += alpha * cp.x;
2541 filteredPixel.y += alpha * cp.y;
2542 filteredPixel.z += alpha * cp.z;
2543 filteredPixel.w += weight * cp.w;
2547 filteredPixel += ((float4) weight)*cp;
2554 // initialize the accumulators to zero
2555 if (itemID < actualNumPixelInThisChunk) {
2556 outputPixelCache[itemID] = (float4)0.0f;
2557 densityCache[itemID] = 0.0f;
2558 if (alpha_index != 0)
2559 gammaCache[itemID] = 0.0f;
2561 barrier(CLK_LOCAL_MEM_FENCE);
2563 // accumulatte the filtered pixel value and the density
2564 for (unsigned int i = 0; i < numItems; i++) {
2565 if (pixelIndex != -1) {
2566 if (itemID%numItems == i) {
2567 outputPixelCache[pixelIndex]+=filteredPixel;
2568 densityCache[pixelIndex]+=density;
2569 if (alpha_index != 0)
2570 gammaCache[pixelIndex]+=gamma;
2573 barrier(CLK_LOCAL_MEM_FENCE);
2576 if (itemID < actualNumPixelInThisChunk)
2578 float4 filteredPixel = outputPixelCache[itemID];
2581 if (alpha_index != 0)
2582 gamma = gammaCache[itemID];
2584 float density = densityCache[itemID];
2585 if ((density != 0.0f) && (density != 1.0f))
2587 density = PerceptibleReciprocal(density);
2588 filteredPixel *= (float4) density;
2589 if (alpha_index != 0)
2593 if (alpha_index != 0)
2595 gamma = PerceptibleReciprocal(gamma);
2596 filteredPixel.x *= gamma;
2597 filteredPixel.y *= gamma;
2598 filteredPixel.z *= gamma;
2601 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2609 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2610 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2611 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2612 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2613 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2614 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2615 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2616 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2617 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2619 // calculate the range of resized image pixels computed by this workgroup
2620 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2621 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2622 const unsigned int actualNumPixelToCompute = stopY - startY;
2624 // calculate the range of input image pixels to cache
2625 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2626 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2627 scale = PerceptibleReciprocal(scale);
2629 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2630 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2632 // cache the input pixels into local memory
2633 const unsigned int x = get_global_id(0);
2634 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2635 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2636 unsigned int stride = inputColumns * number_channels;
2637 for (unsigned int i = 0; i < number_channels; i++)
2639 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2640 wait_group_events(1,&e);
2643 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2644 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2645 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2647 const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2648 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2649 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2651 // determine which resized pixel computed by this workitem
2652 const unsigned int itemID = get_local_id(1);
2653 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2655 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2657 float4 filteredPixel = (float4)0.0f;
2658 float density = 0.0f;
2660 // -1 means this workitem doesn't participate in the computation
2661 if (pixelIndex != -1)
2663 // x coordinated of the resized pixel computed by this workitem
2664 const int y = chunkStartY + pixelIndex;
2666 // calculate how many steps required for this pixel
2667 const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2668 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2669 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2670 const unsigned int n = stop - start;
2672 // calculate how many steps this workitem will contribute
2673 unsigned int numStepsPerWorkItem = n / numItems;
2674 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2676 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2679 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2681 unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2682 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2684 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2685 (ResizeWeightingFunctionType) resizeFilterType,
2686 (ResizeWeightingFunctionType) resizeWindowType,
2687 resizeFilterScale, resizeFilterWindowSupport,
2688 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2690 float4 cp = (float4)0.0f;
2692 __local CLQuantum *p = inputImageCache + cacheIndex;
2693 cp.x = (float) *(p);
2694 if (number_channels > 2)
2696 cp.y = (float) *(p + rangeLength);
2697 cp.z = (float) *(p + (rangeLength * 2));
2700 if (alpha_index != 0)
2702 cp.w = (float) *(p + (rangeLength * alpha_index));
2704 float alpha = weight * QuantumScale * cp.w;
2706 filteredPixel.x += alpha * cp.x;
2707 filteredPixel.y += alpha * cp.y;
2708 filteredPixel.z += alpha * cp.z;
2709 filteredPixel.w += weight * cp.w;
2713 filteredPixel += ((float4) weight)*cp;
2720 // initialize the accumulators to zero
2721 if (itemID < actualNumPixelInThisChunk) {
2722 outputPixelCache[itemID] = (float4)0.0f;
2723 densityCache[itemID] = 0.0f;
2724 if (alpha_index != 0)
2725 gammaCache[itemID] = 0.0f;
2727 barrier(CLK_LOCAL_MEM_FENCE);
2729 // accumulatte the filtered pixel value and the density
2730 for (unsigned int i = 0; i < numItems; i++) {
2731 if (pixelIndex != -1) {
2732 if (itemID%numItems == i) {
2733 outputPixelCache[pixelIndex]+=filteredPixel;
2734 densityCache[pixelIndex]+=density;
2735 if (alpha_index != 0)
2736 gammaCache[pixelIndex]+=gamma;
2739 barrier(CLK_LOCAL_MEM_FENCE);
2742 if (itemID < actualNumPixelInThisChunk)
2744 float4 filteredPixel = outputPixelCache[itemID];
2747 if (alpha_index != 0)
2748 gamma = gammaCache[itemID];
2750 float density = densityCache[itemID];
2751 if ((density != 0.0f) && (density != 1.0f))
2753 density = PerceptibleReciprocal(density);
2754 filteredPixel *= (float4) density;
2755 if (alpha_index != 0)
2759 if (alpha_index != 0)
2761 gamma = PerceptibleReciprocal(gamma);
2762 filteredPixel.x *= gamma;
2763 filteredPixel.y *= gamma;
2764 filteredPixel.z *= gamma;
2767 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2774 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2778 % R o t a t i o n a l B l u r %
2782 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2786 __kernel void RotationalBlur(const __global CLQuantum *image,
2787 const unsigned int number_channels,const unsigned int channel,
2788 const float2 blurCenter,__constant float *cos_theta,
2789 __constant float *sin_theta,const unsigned int cossin_theta_size,
2790 __global CLQuantum *filteredImage)
2792 const int x = get_global_id(0);
2793 const int y = get_global_id(1);
2794 const int columns = get_global_size(0);
2795 const int rows = get_global_size(1);
2796 unsigned int step = 1;
2797 float center_x = (float) x - blurCenter.x;
2798 float center_y = (float) y - blurCenter.y;
2799 float radius = hypot(center_x, center_y);
2801 float blur_radius = hypot(blurCenter.x, blurCenter.y);
2803 if (radius > MagickEpsilon)
2805 step = (unsigned int) (blur_radius / radius);
2808 if (step >= cossin_theta_size)
2809 step = cossin_theta_size-1;
2812 float4 result = 0.0f;
2813 float normalize = 0.0f;
2816 for (unsigned int i=0; i<cossin_theta_size; i+=step)
2818 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2819 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2821 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2823 if ((number_channels == 4) || (number_channels == 2))
2825 float alpha = (float)(QuantumScale*pixel.w);
2829 result.x += alpha * pixel.x;
2830 result.y += alpha * pixel.y;
2831 result.z += alpha * pixel.z;
2832 result.w += pixel.w;
2840 normalize = PerceptibleReciprocal(normalize);
2842 if ((number_channels == 4) || (number_channels == 2))
2844 gamma = PerceptibleReciprocal(gamma);
2848 result.w *= normalize;
2851 result *= normalize;
2853 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2858 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2862 % U n s h a r p M a s k %
2866 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2870 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2871 const __global float4 *blurRowData,const unsigned int number_channels,
2872 const ChannelType channel,const unsigned int columns,
2873 const unsigned int rows,__local float4* cachedData,
2874 __local float* cachedFilter,const __global float *filter,
2875 const unsigned int width,const float gain, const float threshold,
2876 __global CLQuantum *filteredImage)
2878 const unsigned int radius = (width-1)/2;
2880 // cache the pixel shared by the workgroup
2881 const int groupX = get_group_id(0);
2882 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2883 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2885 if ((groupStartY >= 0) && (groupStopY < rows))
2887 event_t e = async_work_group_strided_copy(cachedData,
2888 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2889 wait_group_events(1,&e);
2893 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2894 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2896 barrier(CLK_LOCAL_MEM_FENCE);
2898 // cache the filter as well
2899 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2900 wait_group_events(1,&e);
2902 // only do the work if this is not a patched item
2903 const int cy = get_global_id(1);
2907 float4 blurredPixel = (float4) 0.0f;
2911 for ( ; i+7 < width; )
2913 for (int j=0; j < 8; j++, i++)
2914 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2917 for ( ; i < width; i++)
2918 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2920 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2921 float4 outputPixel = inputImagePixel - blurredPixel;
2923 float quantumThreshold = QuantumRange*threshold;
2925 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2926 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2929 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2935 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2936 const ChannelType channel,__constant float *filter,const unsigned int width,
2937 const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2938 const float gain,const float threshold,__global CLQuantum *filteredImage)
2940 const unsigned int x = get_global_id(0);
2941 const unsigned int y = get_global_id(1);
2943 const unsigned int radius = (width - 1) / 2;
2945 int row = y - radius;
2946 int baseRow = get_group_id(1) * get_local_size(1) - radius;
2947 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2949 while (row < endRow) {
2950 int srcy = (row < 0) ? -row : row; // mirror pad
2951 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2953 float4 value = 0.0f;
2955 int ix = x - radius;
2958 while (i + 7 < width) {
2959 for (int j = 0; j < 8; ++j) { // unrolled
2961 srcx = (srcx < 0) ? -srcx : srcx;
2962 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2963 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2970 int srcx = (ix < 0) ? -ix : ix; // mirror pad
2971 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2972 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2976 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2977 row += get_local_size(1);
2980 barrier(CLK_LOCAL_MEM_FENCE);
2982 const int px = get_local_id(0);
2983 const int py = get_local_id(1);
2984 const int prp = get_local_size(0);
2985 float4 value = (float4)(0.0f);
2988 while (i + 7 < width) {
2989 for (int j = 0; j < 8; ++j) // unrolled
2990 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2994 value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2998 if ((x < columns) && (y < rows)) {
2999 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3000 float4 diff = srcPixel - value;
3002 float quantumThreshold = QuantumRange*threshold;
3004 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3005 value = select(srcPixel + diff * gain, srcPixel, mask);
3007 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3013 __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3014 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3015 const unsigned int number_channels,const unsigned int max_channels,
3016 const float threshold,const int passes,const unsigned int imageWidth,
3017 const unsigned int imageHeight)
3019 const int pad = (1 << (passes - 1));
3020 const int tileSize = 64;
3021 const int tileRowPixels = 64;
3022 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3024 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3026 local float buffer[64 * 64];
3028 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3029 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3031 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3032 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3033 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3035 for (int channel = 0; channel < max_channels; ++channel)
3036 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3039 for (int channel = 0; channel < max_channels; ++channel) {
3041 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3042 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3050 for (int i = 0; i < 16; i++)
3053 for (int pass = 0; pass < passes; ++pass) {
3054 const int radius = 1 << pass;
3055 const int x = get_local_id(0);
3056 const float thresh = threshold * noise[pass];
3058 // Apply horizontal hat
3059 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3060 const int offset = i * tileRowPixels;
3062 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3063 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3064 barrier(CLK_LOCAL_MEM_FENCE);
3065 buffer[x + offset] = pixel;
3067 barrier(CLK_LOCAL_MEM_FENCE);
3069 // Apply vertical hat
3070 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3071 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3072 float delta = tmp[i / 4] - pixel;
3073 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3074 if (delta < -thresh)
3076 else if (delta > thresh)
3080 accum[i / 4] += delta;
3082 barrier(CLK_LOCAL_MEM_FENCE);
3084 if (pass < passes - 1)
3085 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3086 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3088 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3089 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3090 barrier(CLK_LOCAL_MEM_FENCE);
3093 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3094 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3096 barrier(CLK_LOCAL_MEM_FENCE);
3099 // Write from stage to output
3101 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3102 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3103 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3104 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3105 for (int channel = 0; channel < max_channels; ++channel) {
3106 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3116 #endif // MAGICKCORE_OPENCL_SUPPORT
3118 #if defined(__cplusplus) || defined(c_plusplus)
3122 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H