From 44fb54c2b1dfacf5cb2c9b20b08c164d84f35ecd Mon Sep 17 00:00:00 2001 From: dirk Date: Sun, 29 May 2016 21:26:07 +0200 Subject: [PATCH] Moved OpenCL kernels to a separate header file. --- MagickCore/accelerate-kernels-private.h | 3263 +++++++++++++++++++++++ MagickCore/accelerate-private.h | 3233 ---------------------- MagickCore/accelerate.c | 1 + 3 files changed, 3264 insertions(+), 3233 deletions(-) create mode 100644 MagickCore/accelerate-kernels-private.h diff --git a/MagickCore/accelerate-kernels-private.h b/MagickCore/accelerate-kernels-private.h new file mode 100644 index 000000000..f94ef401c --- /dev/null +++ b/MagickCore/accelerate-kernels-private.h @@ -0,0 +1,3263 @@ +/* + Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization + dedicated to making software imaging solutions freely available. + + You may not use this file except in compliance with the License. + obtain a copy of the License at + + http://www.imagemagick.org/script/license.php + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + MagickCore private kernels for accelerated functions. +*/ + +#ifndef _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H +#define _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#if defined(MAGICKCORE_OPENCL_SUPPORT) + +/* + Define declarations. +*/ +#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" +#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" +#define OPENCL_ELSE() "\n #""else " " \n" +#define OPENCL_ENDIF() "\n #""endif " " \n" +#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" +#define STRINGIFY(...) #__VA_ARGS__ "\n" + +/* + Typedef declarations. +*/ + +typedef struct _FloatPixelPacket +{ + MagickRealType + red, + green, + blue, + alpha, + black; +} FloatPixelPacket; + +const char *accelerateKernels = + +/* + Define declarations. +*/ + OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) + OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) + OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) + OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) + OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) + OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) + OPENCL_DEFINE(SigmaRandom, (attenuate)) + OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) + OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) + OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) + +/* + Typedef declarations. +*/ + STRINGIFY( + typedef enum + { + UndefinedColorspace, + RGBColorspace, /* Linear RGB colorspace */ + GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ + TransparentColorspace, + OHTAColorspace, + LabColorspace, + XYZColorspace, + YCbCrColorspace, + YCCColorspace, + YIQColorspace, + YPbPrColorspace, + YUVColorspace, + CMYKColorspace, /* negared linear RGB with black separated */ + sRGBColorspace, /* Default: non-lienar sRGB colorspace */ + HSBColorspace, + HSLColorspace, + HWBColorspace, + Rec601LumaColorspace, + Rec601YCbCrColorspace, + Rec709LumaColorspace, + Rec709YCbCrColorspace, + LogColorspace, + CMYColorspace, /* negated linear RGB colorspace */ + LuvColorspace, + HCLColorspace, + LCHColorspace, /* alias for LCHuv */ + LMSColorspace, + LCHabColorspace, /* Cylindrical (Polar) Lab */ + LCHuvColorspace, /* Cylindrical (Polar) Luv */ + scRGBColorspace, + HSIColorspace, + HSVColorspace, /* alias for HSB */ + HCLpColorspace, + YDbDrColorspace + } ColorspaceType; + ) + + STRINGIFY( + typedef enum + { + UndefinedCompositeOp, + NoCompositeOp, + ModulusAddCompositeOp, + AtopCompositeOp, + BlendCompositeOp, + BumpmapCompositeOp, + ChangeMaskCompositeOp, + ClearCompositeOp, + ColorBurnCompositeOp, + ColorDodgeCompositeOp, + ColorizeCompositeOp, + CopyBlackCompositeOp, + CopyBlueCompositeOp, + CopyCompositeOp, + CopyCyanCompositeOp, + CopyGreenCompositeOp, + CopyMagentaCompositeOp, + CopyOpacityCompositeOp, + CopyRedCompositeOp, + CopyYellowCompositeOp, + DarkenCompositeOp, + DstAtopCompositeOp, + DstCompositeOp, + DstInCompositeOp, + DstOutCompositeOp, + DstOverCompositeOp, + DifferenceCompositeOp, + DisplaceCompositeOp, + DissolveCompositeOp, + ExclusionCompositeOp, + HardLightCompositeOp, + HueCompositeOp, + InCompositeOp, + LightenCompositeOp, + LinearLightCompositeOp, + LuminizeCompositeOp, + MinusDstCompositeOp, + ModulateCompositeOp, + MultiplyCompositeOp, + OutCompositeOp, + OverCompositeOp, + OverlayCompositeOp, + PlusCompositeOp, + ReplaceCompositeOp, + SaturateCompositeOp, + ScreenCompositeOp, + SoftLightCompositeOp, + SrcAtopCompositeOp, + SrcCompositeOp, + SrcInCompositeOp, + SrcOutCompositeOp, + SrcOverCompositeOp, + ModulusSubtractCompositeOp, + ThresholdCompositeOp, + XorCompositeOp, + /* These are new operators, added after the above was last sorted. + * The list should be re-sorted only when a new library version is + * created. + */ + DivideDstCompositeOp, + DistortCompositeOp, + BlurCompositeOp, + PegtopLightCompositeOp, + VividLightCompositeOp, + PinLightCompositeOp, + LinearDodgeCompositeOp, + LinearBurnCompositeOp, + MathematicsCompositeOp, + DivideSrcCompositeOp, + MinusSrcCompositeOp, + DarkenIntensityCompositeOp, + LightenIntensityCompositeOp + } CompositeOperator; + ) + + STRINGIFY( + typedef enum + { + UndefinedFunction, + ArcsinFunction, + ArctanFunction, + PolynomialFunction, + SinusoidFunction + } MagickFunction; + ) + + STRINGIFY( + typedef enum + { + UndefinedNoise, + UniformNoise, + GaussianNoise, + MultiplicativeGaussianNoise, + ImpulseNoise, + LaplacianNoise, + PoissonNoise, + RandomNoise + } NoiseType; + ) + + STRINGIFY( + typedef enum + { + UndefinedPixelIntensityMethod = 0, + AveragePixelIntensityMethod, + BrightnessPixelIntensityMethod, + LightnessPixelIntensityMethod, + Rec601LumaPixelIntensityMethod, + Rec601LuminancePixelIntensityMethod, + Rec709LumaPixelIntensityMethod, + Rec709LuminancePixelIntensityMethod, + RMSPixelIntensityMethod, + MSPixelIntensityMethod + } PixelIntensityMethod; + ) + + STRINGIFY( + typedef enum { + BoxWeightingFunction = 0, + TriangleWeightingFunction, + CubicBCWeightingFunction, + HannWeightingFunction, + HammingWeightingFunction, + BlackmanWeightingFunction, + GaussianWeightingFunction, + QuadraticWeightingFunction, + JincWeightingFunction, + SincWeightingFunction, + SincFastWeightingFunction, + KaiserWeightingFunction, + WelshWeightingFunction, + BohmanWeightingFunction, + LagrangeWeightingFunction, + CosineWeightingFunction, + } ResizeWeightingFunctionType; + ) + + STRINGIFY( + typedef enum + { + UndefinedChannel = 0x0000, + RedChannel = 0x0001, + GrayChannel = 0x0001, + CyanChannel = 0x0001, + GreenChannel = 0x0002, + MagentaChannel = 0x0002, + BlueChannel = 0x0004, + YellowChannel = 0x0004, + BlackChannel = 0x0008, + AlphaChannel = 0x0010, + OpacityChannel = 0x0010, + IndexChannel = 0x0020, /* Color Index Table? */ + ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ + WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ + MetaChannel = 0x0100, /* ???? */ + CompositeChannels = 0x002F, + AllChannels = 0x7ffffff, + /* + Special purpose channel types. + FUTURE: are these needed any more - they are more like hacks + SyncChannels for example is NOT a real channel but a 'flag' + It really says -- "User has not defined channels" + Though it does have extra meaning in the "-auto-level" operator + */ + TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ + RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ + GrayChannels = 0x0400, + SyncChannels = 0x20000, /* channels modified as a single unit */ + DefaultChannels = AllChannels + } ChannelType; /* must correspond to PixelChannel */ + ) + +/* + Helper functions. +*/ + +OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) + + STRINGIFY( + inline CLQuantum ScaleCharToQuantum(const unsigned char value) + { + return((CLQuantum) value); + } + ) + +OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) + + STRINGIFY( + inline CLQuantum ScaleCharToQuantum(const unsigned char value) + { + return((CLQuantum) (257.0f*value)); + } + ) + +OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) + + STRINGIFY( + inline CLQuantum ScaleCharToQuantum(const unsigned char value) + { + return((CLQuantum) (16843009.0*value)); + } + ) + +OPENCL_ENDIF() + + STRINGIFY( + inline int ClampToCanvas(const int offset,const int range) + { + return clamp(offset, (int)0, range-1); + } + ) + + STRINGIFY( + inline CLQuantum ClampToQuantum(const float value) + { + return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); + } + ) + + STRINGIFY( + inline uint ScaleQuantumToMap(CLQuantum value) + { + if (value >= (CLQuantum) MaxMap) + return ((uint)MaxMap); + else + return ((uint)value); + } + ) + + STRINGIFY( + inline float PerceptibleReciprocal(const float x) + { + float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; + return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); + } + ) + + STRINGIFY( + inline float RoundToUnity(const float value) + { + return clamp(value,0.0f,1.0f); + } + ) + + STRINGIFY( + + inline unsigned int getPixelIndex(const unsigned int number_channels, + const unsigned int columns, const unsigned int x, const unsigned int y) + { + return (x * number_channels) + (y * columns * number_channels); + } + + inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } + inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } + inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } + inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } + + inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } + inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } + inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } + inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } + + inline CLQuantum getBlue(CLPixelType p) { return p.x; } + inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } + inline float getBlueF4(float4 p) { return p.x; } + inline void setBlueF4(float4* p, float value) { (*p).x = value; } + + inline CLQuantum getGreen(CLPixelType p) { return p.y; } + inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } + inline float getGreenF4(float4 p) { return p.y; } + inline void setGreenF4(float4* p, float value) { (*p).y = value; } + + inline CLQuantum getRed(CLPixelType p) { return p.z; } + inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } + inline float getRedF4(float4 p) { return p.z; } + inline void setRedF4(float4* p, float value) { (*p).z = value; } + + inline CLQuantum getAlpha(CLPixelType p) { return p.w; } + inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } + inline float getAlphaF4(float4 p) { return p.w; } + inline void setAlphaF4(float4* p, float value) { (*p).w = value; } + + inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, + const ChannelType channel, float *red, float *green, float *blue, float *alpha) + { + if ((channel & RedChannel) != 0) + *red=getPixelRed(p); + + if (number_channels > 2) + { + if ((channel & GreenChannel) != 0) + *green=getPixelGreen(p); + + if ((channel & BlueChannel) != 0) + *blue=getPixelBlue(p); + } + + if (((number_channels == 4) || (number_channels == 2)) && + ((channel & AlphaChannel) != 0)) + *alpha=getPixelAlpha(p,number_channels); + } + + inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, + const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) + { + const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); + + float red = 0.0f; + float green = 0.0f; + float blue = 0.0f; + float alpha = 0.0f; + + ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); + return (float4)(red, green, blue, alpha); + } + + inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, + const ChannelType channel, float red, float green, float blue, float alpha) + { + if ((channel & RedChannel) != 0) + setPixelRed(p,ClampToQuantum(red)); + + if (number_channels > 2) + { + if ((channel & GreenChannel) != 0) + setPixelGreen(p,ClampToQuantum(green)); + + if ((channel & BlueChannel) != 0) + setPixelBlue(p,ClampToQuantum(blue)); + } + + if (((number_channels == 4) || (number_channels == 2)) && + ((channel & AlphaChannel) != 0)) + setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); + } + + inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, + const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, + float4 pixel) + { + __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); + WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); + } + + inline float GetPixelIntensity(const unsigned int colorspace, + const unsigned int method,float red,float green,float blue) + { + float intensity; + + if (colorspace == GRAYColorspace) + return red; + + switch (method) + { + case AveragePixelIntensityMethod: + { + intensity=(red+green+blue)/3.0; + break; + } + case BrightnessPixelIntensityMethod: + { + intensity=MagickMax(MagickMax(red,green),blue); + break; + } + case LightnessPixelIntensityMethod: + { + intensity=(MagickMin(MagickMin(red,green),blue)+ + MagickMax(MagickMax(red,green),blue))/2.0; + break; + } + case MSPixelIntensityMethod: + { + intensity=(float) (((float) red*red+green*green+blue*blue)/ + (3.0*QuantumRange)); + break; + } + case Rec601LumaPixelIntensityMethod: + { + /* + if (image->colorspace == RGBColorspace) + { + red=EncodePixelGamma(red); + green=EncodePixelGamma(green); + blue=EncodePixelGamma(blue); + } + */ + intensity=0.298839*red+0.586811*green+0.114350*blue; + break; + } + case Rec601LuminancePixelIntensityMethod: + { + /* + if (image->colorspace == sRGBColorspace) + { + red=DecodePixelGamma(red); + green=DecodePixelGamma(green); + blue=DecodePixelGamma(blue); + } + */ + intensity=0.298839*red+0.586811*green+0.114350*blue; + break; + } + case Rec709LumaPixelIntensityMethod: + default: + { + /* + if (image->colorspace == RGBColorspace) + { + red=EncodePixelGamma(red); + green=EncodePixelGamma(green); + blue=EncodePixelGamma(blue); + } + */ + intensity=0.212656*red+0.715158*green+0.072186*blue; + break; + } + case Rec709LuminancePixelIntensityMethod: + { + /* + if (image->colorspace == sRGBColorspace) + { + red=DecodePixelGamma(red); + green=DecodePixelGamma(green); + blue=DecodePixelGamma(blue); + } + */ + intensity=0.212656*red+0.715158*green+0.072186*blue; + break; + } + case RMSPixelIntensityMethod: + { + intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ + sqrt(3.0)); + break; + } + } + + return intensity; + } + + inline int mirrorBottom(int value) + { + return (value < 0) ? - (value) : value; + } + + inline int mirrorTop(int value, int width) + { + return (value >= width) ? (2 * width - value - 1) : value; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% A d d N o i s e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + /* + Part of MWC64X by David Thomas, dt10@imperial.ac.uk + This is provided under BSD, full license is with the main package. + See http://www.doc.ic.ac.uk/~dt10/research + */ + + // Pre: a=M) || (v=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. + v=v-M; + return v; + } + + // Pre: a>1; + } + return r; + } + + // Pre: a=0 + // Post: r=(a^b) mod M + // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on + // most architectures + ulong MWC_PowMod64(ulong a, ulong e, ulong M) + { + ulong sqr=a, acc=1; + while(e!=0){ + if(e&1) + acc=MWC_MulMod64(acc,sqr,M); + sqr=MWC_MulMod64(sqr,sqr,M); + e=e>>1; + } + return acc; + } + + uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) + { + ulong m=MWC_PowMod64(A, distance, M); + ulong x=curr.x*(ulong)A+curr.y; + x=MWC_MulMod64(x, m, M); + return (uint2)((uint)(x/A), (uint)(x%A)); + } + + uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) + { + // This is an arbitrary constant for starting LCG jumping from. I didn't + // want to start from 1, as then you end up with the two or three first values + // being a bit poor in ones - once you've decided that, one constant is as + // good as any another. There is no deep mathematical reason for it, I just + // generated a random number. + enum{ MWC_BASEID = 4077358422479273989UL }; + + ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; + ulong m=MWC_PowMod64(A, dist, M); + + ulong x=MWC_MulMod64(MWC_BASEID, m, M); + return (uint2)((uint)(x/A), (uint)(x%A)); + } + + //! Represents the state of a particular generator + typedef struct{ uint x; uint c; } mwc64x_state_t; + + enum{ MWC64X_A = 4294883355U }; + enum{ MWC64X_M = 18446383549859758079UL }; + + void MWC64X_Step(mwc64x_state_t *s) + { + uint X=s->x, C=s->c; + + uint Xn=MWC64X_A*X+C; + uint carry=(uint)(Xnx=Xn; + s->c=Cn; + } + + void MWC64X_Skip(mwc64x_state_t *s, ulong distance) + { + uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); + s->x=tmp.x; + s->c=tmp.y; + } + + void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) + { + uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); + s->x=tmp.x; + s->c=tmp.y; + } + + //! Return a 32-bit integer in the range [0..2^32) + uint MWC64X_NextUint(mwc64x_state_t *s) + { + uint res=s->x ^ s->c; + MWC64X_Step(s); + return res; + } + + // + // End of MWC64X excerpt + // + + float mwcReadPseudoRandomValue(mwc64x_state_t* rng) + { + return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 + } + + float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) + { + float + alpha, + beta, + noise, + sigma; + + noise = 0.0f; + alpha=mwcReadPseudoRandomValue(r); + switch(noise_type) + { + case UniformNoise: + default: + { + noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); + break; + } + case GaussianNoise: + { + float + gamma, + tau; + + if (alpha == 0.0f) + alpha=1.0f; + beta=mwcReadPseudoRandomValue(r); + gamma=sqrt(-2.0f*log(alpha)); + sigma=gamma*cospi((2.0f*beta)); + tau=gamma*sinpi((2.0f*beta)); + noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; + break; + } + case ImpulseNoise: + { + if (alpha < (SigmaImpulse/2.0f)) + noise=0.0f; + else + if (alpha >= (1.0f-(SigmaImpulse/2.0f))) + noise=QuantumRange; + else + noise=pixel; + break; + } + case LaplacianNoise: + { + if (alpha <= 0.5f) + { + if (alpha <= MagickEpsilon) + noise=(pixel-QuantumRange); + else + noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ + 0.5f); + break; + } + beta=1.0f-alpha; + if (beta <= (0.5f*MagickEpsilon)) + noise=(pixel+QuantumRange); + else + noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); + break; + } + case MultiplicativeGaussianNoise: + { + sigma=1.0f; + if (alpha > MagickEpsilon) + sigma=sqrt(-2.0f*log(alpha)); + beta=mwcReadPseudoRandomValue(r); + noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* + cospi((2.0f*beta))/2.0f); + break; + } + case PoissonNoise: + { + float + poisson; + unsigned int i; + poisson=exp(-SigmaPoisson*QuantumScale*pixel); + for (i=0; alpha > poisson; i++) + { + beta=mwcReadPseudoRandomValue(r); + alpha*=beta; + } + noise=(QuantumRange*i/SigmaPoisson); + break; + } + case RandomNoise: + { + noise=(QuantumRange*SigmaRandom*alpha); + break; + } + } + return noise; + } + + __kernel + void AddNoise(const __global CLQuantum *image, + const unsigned int number_channels,const ChannelType channel, + const unsigned int length,const unsigned int pixelsPerWorkItem, + const NoiseType noise_type,const float attenuate,const unsigned int seed0, + const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, + __global CLQuantum *filteredImage) + { + mwc64x_state_t rng; + rng.x = seed0; + rng.c = seed1; + + uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use + uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); + MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams + + uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); + uint count = pixelsPerWorkItem; + + while (count > 0 && pos < length) + { + const __global CLQuantum *p = image + pos; + __global CLQuantum *q = filteredImage + pos; + + float red; + float green; + float blue; + float alpha; + + ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); + + if ((channel & RedChannel) != 0) + red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); + + if (number_channels > 2) + { + if ((channel & GreenChannel) != 0) + green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); + + if ((channel & BlueChannel) != 0) + blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); + } + + if (((number_channels == 4) || (number_channels == 2)) && + ((channel & AlphaChannel) != 0)) + alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); + + WriteChannels(q, number_channels, channel, red, green, blue, alpha); + + pos += (get_local_size(0) * number_channels); + count--; + } + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% B l u r % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + /* + Reduce image noise and reduce detail levels by row + */ + __kernel void BlurRow(const __global CLQuantum *image, + const unsigned int number_channels,const ChannelType channel, + __constant float *filter,const unsigned int width, + const unsigned int imageColumns,const unsigned int imageRows, + __local float4 *temp,__global float4 *tempImage) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int columns = imageColumns; + + const unsigned int radius = (width-1)/2; + const int wsize = get_local_size(0); + const unsigned int loadSize = wsize+width; + + //group coordinate + const int groupX=get_local_size(0)*get_group_id(0); + + //parallel load and clamp + for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) + { + int cx = ClampToCanvas(i + groupX - radius, columns); + temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); + } + + // barrier + barrier(CLK_LOCAL_MEM_FENCE); + + // only do the work if this is not a patched item + if (get_global_id(0) < columns) + { + // compute + float4 result = (float4) 0; + + int i = 0; + + for ( ; i+7 < width; ) + { + for (int j=0; j < 8; j++) + result+=filter[i+j]*temp[i+j+get_local_id(0)]; + i+=8; + } + + for ( ; i < width; i++) + result+=filter[i]*temp[i+get_local_id(0)]; + + // write back to global + tempImage[y*columns+x] = result; + } + } + ) + + STRINGIFY( + /* + Reduce image noise and reduce detail levels by line + */ + __kernel void BlurColumn(const __global float4 *blurRowData, + const unsigned int number_channels,const ChannelType channel, + __constant float *filter,const unsigned int width, + const unsigned int imageColumns,const unsigned int imageRows, + __local float4 *temp,__global CLQuantum *filteredImage) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int columns = imageColumns; + const int rows = imageRows; + + unsigned int radius = (width-1)/2; + const int wsize = get_local_size(1); + const unsigned int loadSize = wsize+width; + + //group coordinate + const int groupX=get_local_size(0)*get_group_id(0); + const int groupY=get_local_size(1)*get_group_id(1); + //notice that get_local_size(0) is 1, so + //groupX=get_group_id(0); + + //parallel load and clamp + for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) + temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; + + // barrier + barrier(CLK_LOCAL_MEM_FENCE); + + // only do the work if this is not a patched item + if (get_global_id(1) < rows) + { + // compute + float4 result = (float4) 0; + + int i = 0; + + for ( ; i+7 < width; ) + { + for (int j=0; j < 8; j++) + result+=filter[i+j]*temp[i+j+get_local_id(1)]; + i+=8; + } + + for ( ; i < width; i++) + result+=filter[i]*temp[i+get_local_id(1)]; + + // write back to global + WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); + } + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% C o m p o s i t e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + inline float ColorDodge(const float Sca, + const float Sa,const float Dca,const float Da) + { + /* + Oct 2004 SVG specification. + */ + if ((Sca*Da+Dca*Sa) >= Sa*Da) + return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); + return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); + + + /* + New specification, March 2009 SVG specification. This specification was + also wrong of non-overlap cases. + */ + /* + if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) + return(Sca*(1.0-Da)); + if (fabs(Sca-Sa) < MagickEpsilon) + return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); + return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca))); + */ + + /* + Working from first principles using the original formula: + + f(Sc,Dc) = Dc/(1-Sc) + + This works correctly! Looks like the 2004 model was right but just + required a extra condition for correct handling. + */ + + /* + if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) + return(Sca*(1.0-Da)+Dca*(1.0-Sa)); + if (fabs(Sca-Sa) < MagickEpsilon) + return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); + return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); + */ + } + + inline void CompositeColorDodge(const float4 *p, + const float4 *q,float4 *composite) { + + float + Da, + gamma, + Sa; + + Sa=QuantumScale*getAlphaF4(*p); /* simplify and speed up equations */ + Da=QuantumScale*getAlphaF4(*q); + gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */ + setAlphaF4(composite,QuantumRange*gamma); + gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma); + setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale* + getRedF4(*q)*Da,Da)); + setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale* + getGreenF4(*q)*Da,Da)); + setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale* + getBlueF4(*q)*Da,Da)); + } + ) + + STRINGIFY( + inline void MagickPixelCompositePlus(const float4 *p, + const float alpha,const float4 *q, + const float beta,float4 *composite) + { + float + gamma; + + float + Da, + Sa; + /* + Add two pixels with the given opacities. + */ + Sa=QuantumScale*alpha; + Da=QuantumScale*beta; + gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */ + setAlphaF4(composite,QuantumRange*gamma); + gamma=PerceptibleReciprocal(gamma); + setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q))); + setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q))); + setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q))); + } + ) + + STRINGIFY( + inline void MagickPixelCompositeBlend(const float4 *p, + const float alpha,const float4 *q, + const float beta,float4 *composite) + { + MagickPixelCompositePlus(p,(float) (alpha* + (getAlphaF4(*p))),q,(float) (beta* + (getAlphaF4(*q))),composite); + } + ) + + STRINGIFY( + __kernel + void Composite(__global CLPixelType *image, + const unsigned int imageWidth, + const unsigned int imageHeight, + const __global CLPixelType *compositeImage, + const unsigned int compositeWidth, + const unsigned int compositeHeight, + const unsigned int compose, + const ChannelType channel, + const unsigned int matte, + const float destination_dissolve, + const float source_dissolve) { + + uint2 index; + index.x = get_global_id(0); + index.y = get_global_id(1); + + + if (index.x >= imageWidth + || index.y >= imageHeight) { + return; + } + const CLPixelType inputPixel = image[index.y*imageWidth+index.x]; + float4 destination; + setRedF4(&destination,getRed(inputPixel)); + setGreenF4(&destination,getGreen(inputPixel)); + setBlueF4(&destination,getBlue(inputPixel)); + + + const CLPixelType compositePixel + = compositeImage[index.y*imageWidth+index.x]; + float4 source; + setRedF4(&source,getRed(compositePixel)); + setGreenF4(&source,getGreen(compositePixel)); + setBlueF4(&source,getBlue(compositePixel)); + + if (matte != 0) { + setAlphaF4(&destination,getAlpha(inputPixel)); + setAlphaF4(&source,getAlpha(compositePixel)); + } + else { + setAlphaF4(&destination,1.0f); + setAlphaF4(&source,1.0f); + } + + float4 composite=destination; + + CompositeOperator op = (CompositeOperator)compose; + switch (op) { + case ColorDodgeCompositeOp: + CompositeColorDodge(&source,&destination,&composite); + break; + case BlendCompositeOp: + MagickPixelCompositeBlend(&source,source_dissolve,&destination, + destination_dissolve,&composite); + break; + default: + // unsupported operators + break; + }; + + CLPixelType outputPixel; + setRed(&outputPixel, ClampToQuantum(getRedF4(composite))); + setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite))); + setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite))); + setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite))); + image[index.y*imageWidth+index.x] = outputPixel; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% C o n t r a s t % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + + inline float3 ConvertRGBToHSB(CLPixelType pixel) { + float3 HueSaturationBrightness; + HueSaturationBrightness.x = 0.0f; // Hue + HueSaturationBrightness.y = 0.0f; // Saturation + HueSaturationBrightness.z = 0.0f; // Brightness + + float r=(float) getRed(pixel); + float g=(float) getGreen(pixel); + float b=(float) getBlue(pixel); + + float tmin=MagickMin(MagickMin(r,g),b); + float tmax=MagickMax(MagickMax(r,g),b); + + if (tmax!=0.0f) { + float delta=tmax-tmin; + HueSaturationBrightness.y=delta/tmax; + HueSaturationBrightness.z=QuantumScale*tmax; + + if (delta != 0.0f) { + HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); + HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta; + HueSaturationBrightness.x/=6.0f; + HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; + } + } + return HueSaturationBrightness; + } + + inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { + + float hue = HueSaturationBrightness.x; + float brightness = HueSaturationBrightness.z; + float saturation = HueSaturationBrightness.y; + + CLPixelType rgb; + + if (saturation == 0.0f) { + setRed(&rgb,ClampToQuantum(QuantumRange*brightness)); + setGreen(&rgb,getRed(rgb)); + setBlue(&rgb,getRed(rgb)); + } + else { + + float h=6.0f*(hue-floor(hue)); + float f=h-floor(h); + float p=brightness*(1.0f-saturation); + float q=brightness*(1.0f-saturation*f); + float t=brightness*(1.0f-(saturation*(1.0f-f))); + + float clampedBrightness = ClampToQuantum(QuantumRange*brightness); + float clamped_t = ClampToQuantum(QuantumRange*t); + float clamped_p = ClampToQuantum(QuantumRange*p); + float clamped_q = ClampToQuantum(QuantumRange*q); + int ih = (int)h; + setRed(&rgb, (ih == 1)?clamped_q: + (ih == 2 || ih == 3)?clamped_p: + (ih == 4)?clamped_t: + clampedBrightness); + + setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness: + (ih == 3)?clamped_q: + (ih == 4 || ih == 5)?clamped_p: + clamped_t); + + setBlue(&rgb, (ih == 2)?clamped_t: + (ih == 3 || ih == 4)?clampedBrightness: + (ih == 5)?clamped_q: + clamped_p); + } + return rgb; + } + + __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) + { + + const int sign = sharpen!=0?1:-1; + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int c = x + y * columns; + + CLPixelType pixel = im[c]; + float3 HueSaturationBrightness = ConvertRGBToHSB(pixel); + float brightness = HueSaturationBrightness.z; + brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); + brightness = clamp(brightness,0.0f,1.0f); + HueSaturationBrightness.z = brightness; + + CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness); + filteredPixel.w = pixel.w; + im[c] = filteredPixel; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% C o n t r a s t S t r e t c h % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + /* + */ + __kernel void Histogram(__global CLPixelType * restrict im, + const ChannelType channel, + const unsigned int colorspace, + const unsigned int method, + __global uint4 * restrict histogram) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int c = x + y * columns; + if ((channel & SyncChannels) != 0) + { + float red=(float)getRed(im[c]); + float green=(float)getGreen(im[c]); + float blue=(float)getBlue(im[c]); + + float intensity = GetPixelIntensity(colorspace, method, red, green, blue); + uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); + atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position + } + else + { + // for equalizing, we always need all channels? + // otherwise something more + } + } + ) + + STRINGIFY( + /* + */ + __kernel void ContrastStretch(__global CLPixelType * restrict im, + const ChannelType channel, + __global CLPixelType * restrict stretch_map, + const float4 white, const float4 black) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int c = x + y * columns; + + uint ePos; + CLPixelType oValue, eValue; + CLQuantum red, green, blue, alpha; + + //read from global + oValue=im[c]; + + if ((channel & RedChannel) != 0) + { + if (getRedF4(white) != getRedF4(black)) + { + ePos = ScaleQuantumToMap(getRed(oValue)); + eValue = stretch_map[ePos]; + red = getRed(eValue); + } + } + + if ((channel & GreenChannel) != 0) + { + if (getGreenF4(white) != getGreenF4(black)) + { + ePos = ScaleQuantumToMap(getGreen(oValue)); + eValue = stretch_map[ePos]; + green = getGreen(eValue); + } + } + + if ((channel & BlueChannel) != 0) + { + if (getBlueF4(white) != getBlueF4(black)) + { + ePos = ScaleQuantumToMap(getBlue(oValue)); + eValue = stretch_map[ePos]; + blue = getBlue(eValue); + } + } + + if ((channel & AlphaChannel) != 0) + { + if (getAlphaF4(white) != getAlphaF4(black)) + { + ePos = ScaleQuantumToMap(getAlpha(oValue)); + eValue = stretch_map[ePos]; + alpha = getAlpha(eValue); + } + } + + //write back + im[c]=(CLPixelType)(blue, green, red, alpha); + + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% C o n v o l v e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + __kernel + void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, + const unsigned int imageWidth, const unsigned int imageHeight, + __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, + const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { + + int2 blockID; + blockID.x = get_global_id(0) / get_local_size(0); + blockID.y = get_global_id(1) / get_local_size(1); + + // image area processed by this workgroup + int2 imageAreaOrg; + imageAreaOrg.x = blockID.x * get_local_size(0); + imageAreaOrg.y = blockID.y * get_local_size(1); + + int2 midFilterDimen; + midFilterDimen.x = (filterWidth-1)/2; + midFilterDimen.y = (filterHeight-1)/2; + + int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; + + // dimension of the local cache + int2 cachedAreaDimen; + cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; + cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; + + // cache the pixels accessed by this workgroup in local memory + int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); + int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; + int groupSize = get_local_size(0) * get_local_size(1); + for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { + + int2 cachedAreaIndex; + cachedAreaIndex.x = i % cachedAreaDimen.x; + cachedAreaIndex.y = i / cachedAreaDimen.x; + + int2 imagePixelIndex; + imagePixelIndex = cachedAreaOrg + cachedAreaIndex; + + // only support EdgeVirtualPixelMethod through ClampToCanvas + // TODO: implement other virtual pixel method + imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); + imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); + + pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; + } + + // cache the filter + for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { + filterCache[i] = filter[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + + int2 imageIndex; + imageIndex.x = imageAreaOrg.x + get_local_id(0); + imageIndex.y = imageAreaOrg.y + get_local_id(1); + + // if out-of-range, stops here and quit + if (imageIndex.x >= imageWidth + || imageIndex.y >= imageHeight) { + return; + } + + int filterIndex = 0; + float4 sum = (float4)0.0f; + float gamma = 0.0f; + if (((channel & AlphaChannel) == 0) || (matte == 0)) { + int cacheIndexY = get_local_id(1); + for (int j = 0; j < filterHeight; j++) { + int cacheIndexX = get_local_id(0); + for (int i = 0; i < filterWidth; i++) { + CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; + float f = filterCache[filterIndex]; + + sum.x += f * p.x; + sum.y += f * p.y; + sum.z += f * p.z; + sum.w += f * p.w; + + gamma += f; + filterIndex++; + cacheIndexX++; + } + cacheIndexY++; + } + } + else { + int cacheIndexY = get_local_id(1); + for (int j = 0; j < filterHeight; j++) { + int cacheIndexX = get_local_id(0); + for (int i = 0; i < filterWidth; i++) { + + CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; + float alpha = QuantumScale*p.w; + float f = filterCache[filterIndex]; + float g = alpha * f; + + sum.x += g*p.x; + sum.y += g*p.y; + sum.z += g*p.z; + sum.w += f*p.w; + + gamma += g; + filterIndex++; + cacheIndexX++; + } + cacheIndexY++; + } + gamma = PerceptibleReciprocal(gamma); + sum.xyz = gamma*sum.xyz; + } + CLPixelType outputPixel; + outputPixel.x = ClampToQuantum(sum.x); + outputPixel.y = ClampToQuantum(sum.y); + outputPixel.z = ClampToQuantum(sum.z); + outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; + + output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; + } + ) + + STRINGIFY( + __kernel + void Convolve(const __global CLPixelType *input, __global CLPixelType *output, + const uint imageWidth, const uint imageHeight, + __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, + const uint matte, const ChannelType channel) { + + int2 imageIndex; + imageIndex.x = get_global_id(0); + imageIndex.y = get_global_id(1); + + /* + unsigned int imageWidth = get_global_size(0); + unsigned int imageHeight = get_global_size(1); + */ + if (imageIndex.x >= imageWidth + || imageIndex.y >= imageHeight) + return; + + int2 midFilterDimen; + midFilterDimen.x = (filterWidth-1)/2; + midFilterDimen.y = (filterHeight-1)/2; + + int filterIndex = 0; + float4 sum = (float4)0.0f; + float gamma = 0.0f; + if (((channel & AlphaChannel) == 0) || (matte == 0)) { + for (int j = 0; j < filterHeight; j++) { + int2 inputPixelIndex; + inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; + inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); + for (int i = 0; i < filterWidth; i++) { + inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; + inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); + + CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; + float f = filter[filterIndex]; + + sum.x += f * p.x; + sum.y += f * p.y; + sum.z += f * p.z; + sum.w += f * p.w; + + gamma += f; + + filterIndex++; + } + } + } + else { + + for (int j = 0; j < filterHeight; j++) { + int2 inputPixelIndex; + inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; + inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); + for (int i = 0; i < filterWidth; i++) { + inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; + inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); + + CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; + float alpha = QuantumScale*p.w; + float f = filter[filterIndex]; + float g = alpha * f; + + sum.x += g*p.x; + sum.y += g*p.y; + sum.z += g*p.z; + sum.w += f*p.w; + + gamma += g; + + + filterIndex++; + } + } + gamma = PerceptibleReciprocal(gamma); + sum.xyz = gamma*sum.xyz; + } + + CLPixelType outputPixel; + outputPixel.x = ClampToQuantum(sum.x); + outputPixel.y = ClampToQuantum(sum.y); + outputPixel.z = ClampToQuantum(sum.z); + outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; + + output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% D e s p e c k l e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + + __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage + , const unsigned int imageWidth, const unsigned int imageHeight + , const int2 offset, const int polarity, const int matte) { + + int x = get_global_id(0); + int y = get_global_id(1); + + CLPixelType v = inputImage[y*imageWidth+x]; + + int2 neighbor; + neighbor.y = y + offset.y; + neighbor.x = x + offset.x; + + int2 clampedNeighbor; + clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); + clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); + + CLPixelType r = (clampedNeighbor.x == neighbor.x + && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] + :(CLPixelType)0; + + int sv[4]; + sv[0] = (int)v.x; + sv[1] = (int)v.y; + sv[2] = (int)v.z; + sv[3] = (int)v.w; + + int sr[4]; + sr[0] = (int)r.x; + sr[1] = (int)r.y; + sr[2] = (int)r.z; + sr[3] = (int)r.w; + + if (polarity > 0) { + \n #pragma unroll 4\n + for (unsigned int i = 0; i < 4; i++) { + sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; + } + } + else { + \n #pragma unroll 4\n + for (unsigned int i = 0; i < 4; i++) { + sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; + } + + } + + v.x = (CLQuantum)sv[0]; + v.y = (CLQuantum)sv[1]; + v.z = (CLQuantum)sv[2]; + + if (matte!=0) + v.w = (CLQuantum)sv[3]; + + outputImage[y*imageWidth+x] = v; + + } + + + ) + + STRINGIFY( + + __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage + , const unsigned int imageWidth, const unsigned int imageHeight + , const int2 offset, const int polarity, const int matte) { + + int x = get_global_id(0); + int y = get_global_id(1); + + CLPixelType v = inputImage[y*imageWidth+x]; + + int2 neighbor, clampedNeighbor; + + neighbor.y = y + offset.y; + neighbor.x = x + offset.x; + clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); + clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); + + CLPixelType r = (clampedNeighbor.x == neighbor.x + && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] + :(CLPixelType)0; + + + neighbor.y = y - offset.y; + neighbor.x = x - offset.x; + clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); + clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); + + CLPixelType s = (clampedNeighbor.x == neighbor.x + && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] + :(CLPixelType)0; + + + int sv[4]; + sv[0] = (int)v.x; + sv[1] = (int)v.y; + sv[2] = (int)v.z; + sv[3] = (int)v.w; + + int sr[4]; + sr[0] = (int)r.x; + sr[1] = (int)r.y; + sr[2] = (int)r.z; + sr[3] = (int)r.w; + + int ss[4]; + ss[0] = (int)s.x; + ss[1] = (int)s.y; + ss[2] = (int)s.z; + ss[3] = (int)s.w; + + if (polarity > 0) { + \n #pragma unroll 4\n + for (unsigned int i = 0; i < 4; i++) { + //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; + // + //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); + //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); + sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); + } + } + else { + \n #pragma unroll 4\n + for (unsigned int i = 0; i < 4; i++) { + //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; + // + //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); + sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); + } + } + + v.x = (CLQuantum)sv[0]; + v.y = (CLQuantum)sv[1]; + v.z = (CLQuantum)sv[2]; + + if (matte!=0) + v.w = (CLQuantum)sv[3]; + + outputImage[y*imageWidth+x] = v; + + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% E q u a l i z e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + /* + */ + __kernel void Equalize(__global CLPixelType * restrict im, + const ChannelType channel, + __global CLPixelType * restrict equalize_map, + const float4 white, const float4 black) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int c = x + y * columns; + + uint ePos; + CLPixelType oValue, eValue; + CLQuantum red, green, blue, alpha; + + //read from global + oValue=im[c]; + + if ((channel & SyncChannels) != 0) + { + if (getRedF4(white) != getRedF4(black)) + { + ePos = ScaleQuantumToMap(getRed(oValue)); + eValue = equalize_map[ePos]; + red = getRed(eValue); + ePos = ScaleQuantumToMap(getGreen(oValue)); + eValue = equalize_map[ePos]; + green = getRed(eValue); + ePos = ScaleQuantumToMap(getBlue(oValue)); + eValue = equalize_map[ePos]; + blue = getRed(eValue); + ePos = ScaleQuantumToMap(getAlpha(oValue)); + eValue = equalize_map[ePos]; + alpha = getRed(eValue); + + //write back + im[c]=(CLPixelType)(blue, green, red, alpha); + } + + } + + // for equalizing, we always need all channels? + // otherwise something more + + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% F u n c t i o n % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + /* + apply FunctionImageChannel(braightness-contrast) + */ + CLQuantum ApplyFunction(float pixel,const MagickFunction function, + const unsigned int number_parameters,__constant float *parameters) + { + float result = 0.0f; + + switch (function) + { + case PolynomialFunction: + { + for (unsigned int i=0; i < number_parameters; i++) + result = result*QuantumScale*pixel + parameters[i]; + result *= QuantumRange; + break; + } + case SinusoidFunction: + { + float freq,phase,ampl,bias; + freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; + phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; + ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; + bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; + result = QuantumRange*(ampl*sin(2.0f*MagickPI* + (freq*QuantumScale*pixel + phase/360.0f)) + bias); + break; + } + case ArcsinFunction: + { + float width,range,center,bias; + width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; + center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; + range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; + bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; + + result = 2.0f/width*(QuantumScale*pixel - center); + result = range/MagickPI*asin(result)+bias; + result = ( result <= -1.0f ) ? bias - range/2.0f : result; + result = ( result >= 1.0f ) ? bias + range/2.0f : result; + result *= QuantumRange; + break; + } + case ArctanFunction: + { + float slope,range,center,bias; + slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; + center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; + range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; + bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; + result = MagickPI*slope*(QuantumScale*pixel-center); + result = QuantumRange*(range/MagickPI*atan(result) + bias); + break; + } + case UndefinedFunction: + break; + } + return(ClampToQuantum(result)); + } + ) + + STRINGIFY( + /* + Improve brightness / contrast of the image + channel : define which channel is improved + function : the function called to enchance the brightness contrast + number_parameters : numbers of parameters + parameters : the parameter + */ + __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, + const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, + __constant float *parameters) + { + const unsigned int x = get_global_id(0); + const unsigned int y = get_global_id(1); + const unsigned int columns = get_global_size(0); + __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); + + float red; + float green; + float blue; + float alpha; + + ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); + + if ((channel & RedChannel) != 0) + red=ApplyFunction(red, function, number_parameters, parameters); + + if (number_channels > 2) + { + if ((channel & GreenChannel) != 0) + green=ApplyFunction(green, function, number_parameters, parameters); + + if ((channel & BlueChannel) != 0) + blue=ApplyFunction(blue, function, number_parameters, parameters); + } + + if (((number_channels == 4) || (number_channels == 2)) && + ((channel & AlphaChannel) != 0)) + alpha=ApplyFunction(alpha, function, number_parameters, parameters); + + WriteChannels(p, number_channels, channel, red, green, blue, alpha); + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% G r a y s c a l e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + __kernel void Grayscale(__global CLQuantum *image,const int number_channels, + const unsigned int colorspace,const unsigned int method) + { + const unsigned int x = get_global_id(0); + const unsigned int y = get_global_id(1); + const unsigned int columns = get_global_size(0); + __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); + + float + blue, + green, + red; + + red=getPixelRed(p); + green=getPixelGreen(p); + blue=getPixelBlue(p); + + CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); + + setPixelRed(p,intensity); + setPixelGreen(p,intensity); + setPixelBlue(p,intensity); + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% L o c a l C o n t r a s t % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + + __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, + const int radius, + const int imageWidth, + const int imageHeight) + { + const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); + + int x = get_local_id(0); + int y = get_global_id(1); + + global CLPixelType *src = srcImage + y * imageWidth; + + for (int i = x; i < imageWidth; i += get_local_size(0)) { + float sum = 0.0f; + float weight = 1.0f; + + int j = i - radius; + while ((j + 7) < i) { + for (int k = 0; k < 8; ++k) // Unroll 8x + sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); + weight += 8.0f; + j+=8; + } + while (j < i) { + sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); + weight += 1.0f; + ++j; + } + + while ((j + 7) < radius + i) { + for (int k = 0; k < 8; ++k) // Unroll 8x + sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); + weight -= 8.0f; + j+=8; + } + while (j < radius + i) { + sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); + weight -= 1.0f; + ++j; + } + + tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); + } + } + ) + + STRINGIFY( + __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, + const int radius, + const float strength, + const int imageWidth, + const int imageHeight) + { + const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); + + int x = get_global_id(0); + int y = get_global_id(1); + + if ((x >= imageWidth) || (y >= imageHeight)) + return; + + global float *src = blurImage + x; + + float sum = 0.0f; + float weight = 1.0f; + + int j = y - radius; + while ((j + 7) < y) { + for (int k = 0; k < 8; ++k) // Unroll 8x + sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; + weight += 8.0f; + j+=8; + } + while (j < y) { + sum += weight * src[mirrorBottom(j) * imageWidth]; + weight += 1.0f; + ++j; + } + + while ((j + 7) < radius + y) { + for (int k = 0; k < 8; ++k) // Unroll 8x + sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; + weight -= 8.0f; + j+=8; + } + while (j < radius + y) { + sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; + weight -= 1.0f; + ++j; + } + + CLPixelType pixel = srcImage[x + y * imageWidth]; + float srcVal = dot(RGB, convert_float4(pixel)); + float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); + mult = (srcVal + mult) / srcVal; + + pixel.x = ClampToQuantum(pixel.x * mult); + pixel.y = ClampToQuantum(pixel.y * mult); + pixel.z = ClampToQuantum(pixel.z * mult); + + dstImage[x + y * imageWidth] = pixel; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% M o d u l a t e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + + inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, + float *hue, float *saturation, float *lightness) + { + float + c, + tmax, + tmin; + + /* + Convert RGB to HSL colorspace. + */ + tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); + tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); + + c=tmax-tmin; + + *lightness=(tmax+tmin)/2.0; + if (c <= 0.0) + { + *hue=0.0; + *saturation=0.0; + return; + } + + if (tmax == (QuantumScale*red)) + { + *hue=(QuantumScale*green-QuantumScale*blue)/c; + if ((QuantumScale*green) < (QuantumScale*blue)) + *hue+=6.0; + } + else + if (tmax == (QuantumScale*green)) + *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; + else + *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; + + *hue*=60.0/360.0; + if (*lightness <= 0.5) + *saturation=c/(2.0*(*lightness)); + else + *saturation=c/(2.0-2.0*(*lightness)); + } + + inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, + CLQuantum *red,CLQuantum *green,CLQuantum *blue) + { + float + b, + c, + g, + h, + tmin, + r, + x; + + /* + Convert HSL to RGB colorspace. + */ + h=hue*360.0; + if (lightness <= 0.5) + c=2.0*lightness*saturation; + else + c=(2.0-2.0*lightness)*saturation; + tmin=lightness-0.5*c; + h-=360.0*floor(h/360.0); + h/=60.0; + x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); + switch ((int) floor(h) % 6) + { + case 0: + { + r=tmin+c; + g=tmin+x; + b=tmin; + break; + } + case 1: + { + r=tmin+x; + g=tmin+c; + b=tmin; + break; + } + case 2: + { + r=tmin; + g=tmin+c; + b=tmin+x; + break; + } + case 3: + { + r=tmin; + g=tmin+x; + b=tmin+c; + break; + } + case 4: + { + r=tmin+x; + g=tmin; + b=tmin+c; + break; + } + case 5: + { + r=tmin+c; + g=tmin; + b=tmin+x; + break; + } + default: + { + r=0.0; + g=0.0; + b=0.0; + } + } + *red=ClampToQuantum(QuantumRange*r); + *green=ClampToQuantum(QuantumRange*g); + *blue=ClampToQuantum(QuantumRange*b); + } + + inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, + CLQuantum *red,CLQuantum *green,CLQuantum *blue) + { + float + hue, + lightness, + saturation; + + /* + Increase or decrease color lightness, saturation, or hue. + */ + ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); + hue+=0.5*(0.01*percent_hue-1.0); + while (hue < 0.0) + hue+=1.0; + while (hue >= 1.0) + hue-=1.0; + saturation*=0.01*percent_saturation; + lightness*=0.01*percent_lightness; + ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); + } + + __kernel void Modulate(__global CLPixelType *im, + const float percent_brightness, + const float percent_hue, + const float percent_saturation, + const int colorspace) + { + + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int c = x + y * columns; + + CLPixelType pixel = im[c]; + + CLQuantum + blue, + green, + red; + + red=getRed(pixel); + green=getGreen(pixel); + blue=getBlue(pixel); + + switch (colorspace) + { + case HSLColorspace: + default: + { + ModulateHSL(percent_hue, percent_saturation, percent_brightness, + &red, &green, &blue); + } + + } + + CLPixelType filteredPixel; + + setRed(&filteredPixel, red); + setGreen(&filteredPixel, green); + setBlue(&filteredPixel, blue); + filteredPixel.w = pixel.w; + + im[c] = filteredPixel; + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% M o t i o n B l u r % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + __kernel + void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, + const unsigned int imageWidth, const unsigned int imageHeight, + const __global float *filter, const unsigned int width, const __global int2* offset, + const float4 bias, + const ChannelType channel, const unsigned int matte) { + + int2 currentPixel; + currentPixel.x = get_global_id(0); + currentPixel.y = get_global_id(1); + + if (currentPixel.x >= imageWidth + || currentPixel.y >= imageHeight) + return; + + float4 pixel; + pixel.x = (float)bias.x; + pixel.y = (float)bias.y; + pixel.z = (float)bias.z; + pixel.w = (float)bias.w; + + if (((channel & AlphaChannel) == 0) || (matte == 0)) { + + for (int i = 0; i < width; i++) { + // only support EdgeVirtualPixelMethod through ClampToCanvas + // TODO: implement other virtual pixel method + int2 samplePixel = currentPixel + offset[i]; + samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); + samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); + CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; + + pixel.x += (filter[i] * (float)samplePixelValue.x); + pixel.y += (filter[i] * (float)samplePixelValue.y); + pixel.z += (filter[i] * (float)samplePixelValue.z); + pixel.w += (filter[i] * (float)samplePixelValue.w); + } + + CLPixelType outputPixel; + outputPixel.x = ClampToQuantum(pixel.x); + outputPixel.y = ClampToQuantum(pixel.y); + outputPixel.z = ClampToQuantum(pixel.z); + outputPixel.w = ClampToQuantum(pixel.w); + output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; + } + else { + + float gamma = 0.0f; + for (int i = 0; i < width; i++) { + // only support EdgeVirtualPixelMethod through ClampToCanvas + // TODO: implement other virtual pixel method + int2 samplePixel = currentPixel + offset[i]; + samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); + samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); + + CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; + + float alpha = QuantumScale*samplePixelValue.w; + float k = filter[i]; + pixel.x = pixel.x + k * alpha * samplePixelValue.x; + pixel.y = pixel.y + k * alpha * samplePixelValue.y; + pixel.z = pixel.z + k * alpha * samplePixelValue.z; + + pixel.w += k * alpha * samplePixelValue.w; + + gamma+=k*alpha; + } + gamma = PerceptibleReciprocal(gamma); + pixel.xyz = gamma*pixel.xyz; + + CLPixelType outputPixel; + outputPixel.x = ClampToQuantum(pixel.x); + outputPixel.y = ClampToQuantum(pixel.y); + outputPixel.z = ClampToQuantum(pixel.z); + outputPixel.w = ClampToQuantum(pixel.w); + output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; + } + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% R e s i z e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + // Based on Box from resize.c + float BoxResizeFilter(const float x) + { + return 1.0f; + } + ) + + STRINGIFY( + // Based on CubicBC from resize.c + float CubicBC(const float x,const __global float* resizeFilterCoefficients) + { + /* + Cubic Filters using B,C determined values: + Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter + Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears + Spline B = 1 C = 0 B-Spline Gaussian approximation + Hermite B = 0 C = 0 B-Spline interpolator + + See paper by Mitchell and Netravali, Reconstruction Filters in Computer + Graphics Computer Graphics, Volume 22, Number 4, August 1988 + http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ + Mitchell.pdf. + + Coefficents are determined from B,C values: + P0 = ( 6 - 2*B )/6 = coeff[0] + P1 = 0 + P2 = (-18 +12*B + 6*C )/6 = coeff[1] + P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] + Q0 = ( 8*B +24*C )/6 = coeff[3] + Q1 = ( -12*B -48*C )/6 = coeff[4] + Q2 = ( 6*B +30*C )/6 = coeff[5] + Q3 = ( - 1*B - 6*C )/6 = coeff[6] + + which are used to define the filter: + + P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 + Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 + + which ensures function is continuous in value and derivative (slope). + */ + if (x < 1.0) + return(resizeFilterCoefficients[0]+x*(x* + (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); + if (x < 2.0) + return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* + (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); + return(0.0); + } + ) + + STRINGIFY( + float Sinc(const float x) + { + if (x != 0.0f) + { + const float alpha=(float) (MagickPI*x); + return sinpi(x)/alpha; + } + return(1.0f); + } + ) + + STRINGIFY( + float Triangle(const float x) + { + /* + 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or + a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function + for Sinc(). + */ + return ((x<1.0f)?(1.0f-x):0.0f); + } + ) + + + STRINGIFY( + float Hann(const float x) + { + /* + Cosine window function: + 0.5+0.5*cos(pi*x). + */ + const float cosine=cos((MagickPI*x)); + return(0.5f+0.5f*cosine); + } + ) + + STRINGIFY( + float Hamming(const float x) + { + /* + Offset cosine window function: + .54 + .46 cos(pi x). + */ + const float cosine=cos((MagickPI*x)); + return(0.54f+0.46f*cosine); + } + ) + + STRINGIFY( + float Blackman(const float x) + { + /* + Blackman: 2nd order cosine windowing function: + 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) + + Refactored by Chantal Racette and Nicolas Robidoux to one trig call and + five flops. + */ + const float cosine=cos((MagickPI*x)); + return(0.34f+cosine*(0.5f+cosine*0.16f)); + } + ) + + STRINGIFY( + inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) + { + switch (filterType) + { + /* Call Sinc even for SincFast to get better precision on GPU + and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ + case SincWeightingFunction: + case SincFastWeightingFunction: + return Sinc(x); + case CubicBCWeightingFunction: + return CubicBC(x,filterCoefficients); + case BoxWeightingFunction: + return BoxResizeFilter(x); + case TriangleWeightingFunction: + return Triangle(x); + case HannWeightingFunction: + return Hann(x); + case HammingWeightingFunction: + return Hamming(x); + case BlackmanWeightingFunction: + return Blackman(x); + + default: + return 0.0f; + } + } + ) + + + STRINGIFY( + inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType + , const ResizeWeightingFunctionType resizeWindowType + , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) + { + float scale; + float xBlur = fabs(x/resizeFilterBlur); + if (resizeWindowSupport < MagickEpsilon + || resizeWindowType == BoxWeightingFunction) + { + scale = 1.0f; + } + else + { + scale = resizeFilterScale; + scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); + } + float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); + return weight; + } + + ) + + ; + const char *accelerateKernels2 = + + STRINGIFY( + + inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { + return (numWorkItems/pixelPerWorkgroup); + } + + // returns the index of the pixel for the current workitem to compute. + // returns -1 if this workitem doesn't need to participate in any computation + inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { + const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); + int pixelIndex = itemID/numWorkItemsPerPixel; + pixelIndex = (pixelIndex 2) + { + cp.y = (float) *(p + 1); + cp.z = (float) *(p + 2); + } + + if (alpha_index != 0) + { + cp.w = (float) *(p + alpha_index); + + float alpha = weight * QuantumScale * cp.w; + + filteredPixel.x += alpha * cp.x; + filteredPixel.y += alpha * cp.y; + filteredPixel.z += alpha * cp.z; + filteredPixel.w += weight * cp.w; + gamma += alpha; + } + else + filteredPixel += ((float4) weight)*cp; + + density += weight; + } + } + } + + // initialize the accumulators to zero + if (itemID < actualNumPixelInThisChunk) { + outputPixelCache[itemID] = (float4)0.0f; + densityCache[itemID] = 0.0f; + if (alpha_index != 0) + gammaCache[itemID] = 0.0f; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // accumulatte the filtered pixel value and the density + for (unsigned int i = 0; i < numItems; i++) { + if (pixelIndex != -1) { + if (itemID%numItems == i) { + outputPixelCache[pixelIndex]+=filteredPixel; + densityCache[pixelIndex]+=density; + if (alpha_index != 0) + gammaCache[pixelIndex]+=gamma; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (itemID < actualNumPixelInThisChunk) + { + float4 filteredPixel = outputPixelCache[itemID]; + + float gamma = 0.0f; + if (alpha_index != 0) + gamma = gammaCache[itemID]; + + float density = densityCache[itemID]; + if ((density != 0.0f) && (density != 1.0f)) + { + density = PerceptibleReciprocal(density); + filteredPixel *= (float4) density; + gamma *= density; + } + + if (alpha_index != 0) + { + gamma = PerceptibleReciprocal(gamma); + filteredPixel.x *= gamma; + filteredPixel.y *= gamma; + filteredPixel.z *= gamma; + } + + WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel); + } + } + } + ) + + + STRINGIFY( + __kernel __attribute__((reqd_work_group_size(1, 256, 1))) + void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, + const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, + const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, + const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, + const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, + const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, + const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, + __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) + { + // calculate the range of resized image pixels computed by this workgroup + const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; + const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); + const unsigned int actualNumPixelToCompute = stopY - startY; + + // calculate the range of input image pixels to cache + float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); + const float support = MagickMax(scale*resizeFilterSupport,0.5f); + scale = PerceptibleReciprocal(scale); + + const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); + const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); + + // cache the input pixels into local memory + const unsigned int x = get_global_id(0); + unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); + unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; + unsigned int stride = inputColumns * number_channels; + for (unsigned int i = 0; i < number_channels; i++) + { + event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); + wait_group_events(1,&e); + } + + unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; + unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; + for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) + { + const unsigned int chunkStartY = startY + chunk*pixelChunkSize; + const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); + const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; + + // determine which resized pixel computed by this workitem + const unsigned int itemID = get_local_id(1); + const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); + + const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); + + float4 filteredPixel = (float4)0.0f; + float density = 0.0f; + float gamma = 0.0f; + // -1 means this workitem doesn't participate in the computation + if (pixelIndex != -1) + { + // x coordinated of the resized pixel computed by this workitem + const int y = chunkStartY + pixelIndex; + + // calculate how many steps required for this pixel + const float bisect = (y+0.5)/yFactor+MagickEpsilon; + const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); + const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); + const unsigned int n = stop - start; + + // calculate how many steps this workitem will contribute + unsigned int numStepsPerWorkItem = n / numItems; + numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); + + const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; + if (startStep < n) + { + const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); + + unsigned int cacheIndex = start+startStep-cacheRangeStartY; + for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) + { + float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, + (ResizeWeightingFunctionType) resizeFilterType, + (ResizeWeightingFunctionType) resizeWindowType, + resizeFilterScale, resizeFilterWindowSupport, + resizeFilterBlur, scale*(start + i - bisect + 0.5)); + + float4 cp = (float4)0.0f; + + __local CLQuantum *p = inputImageCache + cacheIndex; + cp.x = (float) *(p); + if (number_channels > 2) + { + cp.y = (float) *(p + rangeLength); + cp.z = (float) *(p + (rangeLength * 2)); + } + + if (alpha_index != 0) + { + cp.w = (float) *(p + (rangeLength * alpha_index)); + + float alpha = weight * QuantumScale * cp.w; + + filteredPixel.x += alpha * cp.x; + filteredPixel.y += alpha * cp.y; + filteredPixel.z += alpha * cp.z; + filteredPixel.w += weight * cp.w; + gamma += alpha; + } + else + filteredPixel += ((float4) weight)*cp; + + density += weight; + } + } + } + + // initialize the accumulators to zero + if (itemID < actualNumPixelInThisChunk) { + outputPixelCache[itemID] = (float4)0.0f; + densityCache[itemID] = 0.0f; + if (alpha_index != 0) + gammaCache[itemID] = 0.0f; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // accumulatte the filtered pixel value and the density + for (unsigned int i = 0; i < numItems; i++) { + if (pixelIndex != -1) { + if (itemID%numItems == i) { + outputPixelCache[pixelIndex]+=filteredPixel; + densityCache[pixelIndex]+=density; + if (alpha_index != 0) + gammaCache[pixelIndex]+=gamma; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (itemID < actualNumPixelInThisChunk) + { + float4 filteredPixel = outputPixelCache[itemID]; + + float gamma = 0.0f; + if (alpha_index != 0) + gamma = gammaCache[itemID]; + + float density = densityCache[itemID]; + if ((density != 0.0f) && (density != 1.0f)) + { + density = PerceptibleReciprocal(density); + filteredPixel *= (float4) density; + gamma *= density; + } + + if (alpha_index != 0) + { + gamma = PerceptibleReciprocal(gamma); + filteredPixel.x *= gamma; + filteredPixel.y *= gamma; + filteredPixel.z *= gamma; + } + + WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel); + } + } + } + ) + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% R o t a t i o n a l B l u r % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +*/ + + STRINGIFY( + __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels, + const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta, + __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int columns = get_global_size(0); + const int rows = get_global_size(1); + unsigned int step = 1; + float center_x = (float) x - blurCenter.x; + float center_y = (float) y - blurCenter.y; + float radius = hypot(center_x, center_y); + + float blur_radius = hypot(blurCenter.x, blurCenter.y); + + if (radius > MagickEpsilon) + { + step = (unsigned int) (blur_radius / radius); + if (step == 0) + step = 1; + if (step >= cossin_theta_size) + step = cossin_theta_size-1; + } + + float4 result = bias; + + float normalize = 0.0f; + float gamma = 0.0f; + + for (unsigned int i=0; i= 0) && (groupStopY < rows)) + { + event_t e = async_work_group_strided_copy(cachedData, + blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); + wait_group_events(1,&e); + } + else + { + for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) + cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; + + barrier(CLK_LOCAL_MEM_FENCE); + } + // cache the filter as well + event_t e = async_work_group_copy(cachedFilter,filter,width,0); + wait_group_events(1,&e); + + // only do the work if this is not a patched item + const int cy = get_global_id(1); + + if (cy < rows) + { + float4 blurredPixel = (float4) 0.0f; + + int i = 0; + + for ( ; i+7 < width; ) + { + for (int j=0; j < 8; j++, i++) + blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; + } + + for ( ; i < width; i++) + blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; + + float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); + float4 outputPixel = inputImagePixel - blurredPixel; + + float quantumThreshold = QuantumRange*threshold; + + int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); + outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); + + //write back + WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); + } + } + ) + + STRINGIFY( + __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, + const ChannelType channel,__constant float *filter,const unsigned int width, + const unsigned int columns,const unsigned int rows,__local float4 *pixels, + const float gain,const float threshold,__global CLQuantum *filteredImage) + { + const unsigned int x = get_global_id(0); + const unsigned int y = get_global_id(1); + + const unsigned int radius = (width - 1) / 2; + + int row = y - radius; + int baseRow = get_group_id(1) * get_local_size(1) - radius; + int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; + + while (row < endRow) { + int srcy = (row < 0) ? -row : row; // mirror pad + srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; + + float4 value = 0.0f; + + int ix = x - radius; + int i = 0; + + while (i + 7 < width) { + for (int j = 0; j < 8; ++j) { // unrolled + int srcx = ix + j; + srcx = (srcx < 0) ? -srcx : srcx; + srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; + value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); + } + ix += 8; + i += 8; + } + + while (i < width) { + int srcx = (ix < 0) ? -ix : ix; // mirror pad + srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; + value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); + ++i; + ++ix; + } + pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; + row += get_local_size(1); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const int px = get_local_id(0); + const int py = get_local_id(1); + const int prp = get_local_size(0); + float4 value = (float4)(0.0f); + + int i = 0; + while (i + 7 < width) { + for (int j = 0; j < 8; ++j) // unrolled + value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; + i += 8; + } + while (i < width) { + value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; + ++i; + } + + float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); + float4 diff = srcPixel - value; + + float quantumThreshold = QuantumRange*threshold; + + int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); + value = select(srcPixel + diff * gain, srcPixel, mask); + + if ((x < columns) && (y < rows)) + WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); + } + ) + + STRINGIFY( + __kernel __attribute__((reqd_work_group_size(64, 4, 1))) + void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, + const unsigned int number_channels,const unsigned int max_channels, + const float threshold,const int passes,const unsigned int imageWidth, + const unsigned int imageHeight) + { + const int pad = (1 << (passes - 1)); + const int tileSize = 64; + const int tileRowPixels = 64; + const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; + + CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) + + local float buffer[64 * 64]; + + int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); + int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; + + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { + int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + + (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; + + for (int channel = 0; channel < max_channels; ++channel) + stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; + } + + for (int channel = 0; channel < max_channels; ++channel) { + // Load LDS + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); + + // Process + + float tmp[16]; + float accum[16]; + float pixel; + + for (int i = 0; i < 16; i++) + accum[i]=0.0f; + + for (int pass = 0; pass < passes; ++pass) { + const int radius = 1 << pass; + const int x = get_local_id(0); + const float thresh = threshold * noise[pass]; + + // Apply horizontal hat + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { + const int offset = i * tileRowPixels; + if (pass == 0) + tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass + pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); + barrier(CLK_LOCAL_MEM_FENCE); + buffer[x + offset] = pixel; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Apply vertical hat + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { + pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); + float delta = tmp[i / 4] - pixel; + tmp[i / 4] = pixel; // hold output in tmp until all workitems are done + if (delta < -thresh) + delta += thresh; + else if (delta > thresh) + delta -= thresh; + else + delta = 0; + accum[i / 4] += delta; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (pass < passes - 1) + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass + else // last pass + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Write from stage to output + + if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { + if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { + int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); + for (int channel = 0; channel < max_channels; ++channel) { + dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; + } + } + } + } + } + ) + + ; + +#endif // MAGICKCORE_OPENCL_SUPPORT + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif // _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H diff --git a/MagickCore/accelerate-private.h b/MagickCore/accelerate-private.h index 95b2eab75..bd2b6c5e3 100644 --- a/MagickCore/accelerate-private.h +++ b/MagickCore/accelerate-private.h @@ -29,3239 +29,6 @@ extern "C" { #endif -#if defined(MAGICKCORE_OPENCL_SUPPORT) - -/* - Define declarations. -*/ -#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" -#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" -#define OPENCL_ELSE() "\n #""else " " \n" -#define OPENCL_ENDIF() "\n #""endif " " \n" -#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" -#define STRINGIFY(...) #__VA_ARGS__ "\n" - -/* - Typedef declarations. -*/ - -typedef struct _FloatPixelPacket -{ - MagickRealType - red, - green, - blue, - alpha, - black; -} FloatPixelPacket; - -const char *accelerateKernels = - -/* - Define declarations. -*/ - OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) - OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) - OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) - OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) - OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) - OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) - OPENCL_DEFINE(SigmaRandom, (attenuate)) - OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) - OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) - OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) - -/* - Typedef declarations. -*/ - STRINGIFY( - typedef enum - { - UndefinedColorspace, - RGBColorspace, /* Linear RGB colorspace */ - GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ - TransparentColorspace, - OHTAColorspace, - LabColorspace, - XYZColorspace, - YCbCrColorspace, - YCCColorspace, - YIQColorspace, - YPbPrColorspace, - YUVColorspace, - CMYKColorspace, /* negared linear RGB with black separated */ - sRGBColorspace, /* Default: non-lienar sRGB colorspace */ - HSBColorspace, - HSLColorspace, - HWBColorspace, - Rec601LumaColorspace, - Rec601YCbCrColorspace, - Rec709LumaColorspace, - Rec709YCbCrColorspace, - LogColorspace, - CMYColorspace, /* negated linear RGB colorspace */ - LuvColorspace, - HCLColorspace, - LCHColorspace, /* alias for LCHuv */ - LMSColorspace, - LCHabColorspace, /* Cylindrical (Polar) Lab */ - LCHuvColorspace, /* Cylindrical (Polar) Luv */ - scRGBColorspace, - HSIColorspace, - HSVColorspace, /* alias for HSB */ - HCLpColorspace, - YDbDrColorspace - } ColorspaceType; - ) - - STRINGIFY( - typedef enum - { - UndefinedCompositeOp, - NoCompositeOp, - ModulusAddCompositeOp, - AtopCompositeOp, - BlendCompositeOp, - BumpmapCompositeOp, - ChangeMaskCompositeOp, - ClearCompositeOp, - ColorBurnCompositeOp, - ColorDodgeCompositeOp, - ColorizeCompositeOp, - CopyBlackCompositeOp, - CopyBlueCompositeOp, - CopyCompositeOp, - CopyCyanCompositeOp, - CopyGreenCompositeOp, - CopyMagentaCompositeOp, - CopyOpacityCompositeOp, - CopyRedCompositeOp, - CopyYellowCompositeOp, - DarkenCompositeOp, - DstAtopCompositeOp, - DstCompositeOp, - DstInCompositeOp, - DstOutCompositeOp, - DstOverCompositeOp, - DifferenceCompositeOp, - DisplaceCompositeOp, - DissolveCompositeOp, - ExclusionCompositeOp, - HardLightCompositeOp, - HueCompositeOp, - InCompositeOp, - LightenCompositeOp, - LinearLightCompositeOp, - LuminizeCompositeOp, - MinusDstCompositeOp, - ModulateCompositeOp, - MultiplyCompositeOp, - OutCompositeOp, - OverCompositeOp, - OverlayCompositeOp, - PlusCompositeOp, - ReplaceCompositeOp, - SaturateCompositeOp, - ScreenCompositeOp, - SoftLightCompositeOp, - SrcAtopCompositeOp, - SrcCompositeOp, - SrcInCompositeOp, - SrcOutCompositeOp, - SrcOverCompositeOp, - ModulusSubtractCompositeOp, - ThresholdCompositeOp, - XorCompositeOp, - /* These are new operators, added after the above was last sorted. - * The list should be re-sorted only when a new library version is - * created. - */ - DivideDstCompositeOp, - DistortCompositeOp, - BlurCompositeOp, - PegtopLightCompositeOp, - VividLightCompositeOp, - PinLightCompositeOp, - LinearDodgeCompositeOp, - LinearBurnCompositeOp, - MathematicsCompositeOp, - DivideSrcCompositeOp, - MinusSrcCompositeOp, - DarkenIntensityCompositeOp, - LightenIntensityCompositeOp - } CompositeOperator; - ) - - STRINGIFY( - typedef enum - { - UndefinedFunction, - ArcsinFunction, - ArctanFunction, - PolynomialFunction, - SinusoidFunction - } MagickFunction; - ) - - STRINGIFY( - typedef enum - { - UndefinedNoise, - UniformNoise, - GaussianNoise, - MultiplicativeGaussianNoise, - ImpulseNoise, - LaplacianNoise, - PoissonNoise, - RandomNoise - } NoiseType; - ) - - STRINGIFY( - typedef enum - { - UndefinedPixelIntensityMethod = 0, - AveragePixelIntensityMethod, - BrightnessPixelIntensityMethod, - LightnessPixelIntensityMethod, - Rec601LumaPixelIntensityMethod, - Rec601LuminancePixelIntensityMethod, - Rec709LumaPixelIntensityMethod, - Rec709LuminancePixelIntensityMethod, - RMSPixelIntensityMethod, - MSPixelIntensityMethod - } PixelIntensityMethod; - ) - - STRINGIFY( - typedef enum { - BoxWeightingFunction = 0, - TriangleWeightingFunction, - CubicBCWeightingFunction, - HannWeightingFunction, - HammingWeightingFunction, - BlackmanWeightingFunction, - GaussianWeightingFunction, - QuadraticWeightingFunction, - JincWeightingFunction, - SincWeightingFunction, - SincFastWeightingFunction, - KaiserWeightingFunction, - WelshWeightingFunction, - BohmanWeightingFunction, - LagrangeWeightingFunction, - CosineWeightingFunction, - } ResizeWeightingFunctionType; - ) - - STRINGIFY( - typedef enum - { - UndefinedChannel = 0x0000, - RedChannel = 0x0001, - GrayChannel = 0x0001, - CyanChannel = 0x0001, - GreenChannel = 0x0002, - MagentaChannel = 0x0002, - BlueChannel = 0x0004, - YellowChannel = 0x0004, - BlackChannel = 0x0008, - AlphaChannel = 0x0010, - OpacityChannel = 0x0010, - IndexChannel = 0x0020, /* Color Index Table? */ - ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ - WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ - MetaChannel = 0x0100, /* ???? */ - CompositeChannels = 0x002F, - AllChannels = 0x7ffffff, - /* - Special purpose channel types. - FUTURE: are these needed any more - they are more like hacks - SyncChannels for example is NOT a real channel but a 'flag' - It really says -- "User has not defined channels" - Though it does have extra meaning in the "-auto-level" operator - */ - TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ - RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ - GrayChannels = 0x0400, - SyncChannels = 0x20000, /* channels modified as a single unit */ - DefaultChannels = AllChannels - } ChannelType; /* must correspond to PixelChannel */ - ) - -/* - Helper functions. -*/ - -OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) - - STRINGIFY( - inline CLQuantum ScaleCharToQuantum(const unsigned char value) - { - return((CLQuantum) value); - } - ) - -OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) - - STRINGIFY( - inline CLQuantum ScaleCharToQuantum(const unsigned char value) - { - return((CLQuantum) (257.0f*value)); - } - ) - -OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) - - STRINGIFY( - inline CLQuantum ScaleCharToQuantum(const unsigned char value) - { - return((CLQuantum) (16843009.0*value)); - } - ) - -OPENCL_ENDIF() - - STRINGIFY( - inline int ClampToCanvas(const int offset,const int range) - { - return clamp(offset, (int)0, range-1); - } - ) - - STRINGIFY( - inline CLQuantum ClampToQuantum(const float value) - { - return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); - } - ) - - STRINGIFY( - inline uint ScaleQuantumToMap(CLQuantum value) - { - if (value >= (CLQuantum) MaxMap) - return ((uint)MaxMap); - else - return ((uint)value); - } - ) - - STRINGIFY( - inline float PerceptibleReciprocal(const float x) - { - float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; - return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); - } - ) - - STRINGIFY( - inline float RoundToUnity(const float value) - { - return clamp(value,0.0f,1.0f); - } - ) - - STRINGIFY( - - inline unsigned int getPixelIndex(const unsigned int number_channels, - const unsigned int columns, const unsigned int x, const unsigned int y) - { - return (x * number_channels) + (y * columns * number_channels); - } - - inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } - inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } - inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } - inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } - - inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } - inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } - inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } - inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } - - inline CLQuantum getBlue(CLPixelType p) { return p.x; } - inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } - inline float getBlueF4(float4 p) { return p.x; } - inline void setBlueF4(float4* p, float value) { (*p).x = value; } - - inline CLQuantum getGreen(CLPixelType p) { return p.y; } - inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } - inline float getGreenF4(float4 p) { return p.y; } - inline void setGreenF4(float4* p, float value) { (*p).y = value; } - - inline CLQuantum getRed(CLPixelType p) { return p.z; } - inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } - inline float getRedF4(float4 p) { return p.z; } - inline void setRedF4(float4* p, float value) { (*p).z = value; } - - inline CLQuantum getAlpha(CLPixelType p) { return p.w; } - inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } - inline float getAlphaF4(float4 p) { return p.w; } - inline void setAlphaF4(float4* p, float value) { (*p).w = value; } - - inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, - const ChannelType channel, float *red, float *green, float *blue, float *alpha) - { - if ((channel & RedChannel) != 0) - *red=getPixelRed(p); - - if (number_channels > 2) - { - if ((channel & GreenChannel) != 0) - *green=getPixelGreen(p); - - if ((channel & BlueChannel) != 0) - *blue=getPixelBlue(p); - } - - if (((number_channels == 4) || (number_channels == 2)) && - ((channel & AlphaChannel) != 0)) - *alpha=getPixelAlpha(p,number_channels); - } - - inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, - const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) - { - const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); - - float red = 0.0f; - float green = 0.0f; - float blue = 0.0f; - float alpha = 0.0f; - - ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); - return (float4)(red, green, blue, alpha); - } - - inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, - const ChannelType channel, float red, float green, float blue, float alpha) - { - if ((channel & RedChannel) != 0) - setPixelRed(p,ClampToQuantum(red)); - - if (number_channels > 2) - { - if ((channel & GreenChannel) != 0) - setPixelGreen(p,ClampToQuantum(green)); - - if ((channel & BlueChannel) != 0) - setPixelBlue(p,ClampToQuantum(blue)); - } - - if (((number_channels == 4) || (number_channels == 2)) && - ((channel & AlphaChannel) != 0)) - setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); - } - - inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, - const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, - float4 pixel) - { - __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); - WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); - } - - inline float GetPixelIntensity(const unsigned int colorspace, - const unsigned int method,float red,float green,float blue) - { - float intensity; - - if (colorspace == GRAYColorspace) - return red; - - switch (method) - { - case AveragePixelIntensityMethod: - { - intensity=(red+green+blue)/3.0; - break; - } - case BrightnessPixelIntensityMethod: - { - intensity=MagickMax(MagickMax(red,green),blue); - break; - } - case LightnessPixelIntensityMethod: - { - intensity=(MagickMin(MagickMin(red,green),blue)+ - MagickMax(MagickMax(red,green),blue))/2.0; - break; - } - case MSPixelIntensityMethod: - { - intensity=(float) (((float) red*red+green*green+blue*blue)/ - (3.0*QuantumRange)); - break; - } - case Rec601LumaPixelIntensityMethod: - { - /* - if (image->colorspace == RGBColorspace) - { - red=EncodePixelGamma(red); - green=EncodePixelGamma(green); - blue=EncodePixelGamma(blue); - } - */ - intensity=0.298839*red+0.586811*green+0.114350*blue; - break; - } - case Rec601LuminancePixelIntensityMethod: - { - /* - if (image->colorspace == sRGBColorspace) - { - red=DecodePixelGamma(red); - green=DecodePixelGamma(green); - blue=DecodePixelGamma(blue); - } - */ - intensity=0.298839*red+0.586811*green+0.114350*blue; - break; - } - case Rec709LumaPixelIntensityMethod: - default: - { - /* - if (image->colorspace == RGBColorspace) - { - red=EncodePixelGamma(red); - green=EncodePixelGamma(green); - blue=EncodePixelGamma(blue); - } - */ - intensity=0.212656*red+0.715158*green+0.072186*blue; - break; - } - case Rec709LuminancePixelIntensityMethod: - { - /* - if (image->colorspace == sRGBColorspace) - { - red=DecodePixelGamma(red); - green=DecodePixelGamma(green); - blue=DecodePixelGamma(blue); - } - */ - intensity=0.212656*red+0.715158*green+0.072186*blue; - break; - } - case RMSPixelIntensityMethod: - { - intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ - sqrt(3.0)); - break; - } - } - - return intensity; - } - - inline int mirrorBottom(int value) - { - return (value < 0) ? - (value) : value; - } - - inline int mirrorTop(int value, int width) - { - return (value >= width) ? (2 * width - value - 1) : value; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% A d d N o i s e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - /* - Part of MWC64X by David Thomas, dt10@imperial.ac.uk - This is provided under BSD, full license is with the main package. - See http://www.doc.ic.ac.uk/~dt10/research - */ - - // Pre: a=M) || (v=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. - v=v-M; - return v; - } - - // Pre: a>1; - } - return r; - } - - // Pre: a=0 - // Post: r=(a^b) mod M - // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on - // most architectures - ulong MWC_PowMod64(ulong a, ulong e, ulong M) - { - ulong sqr=a, acc=1; - while(e!=0){ - if(e&1) - acc=MWC_MulMod64(acc,sqr,M); - sqr=MWC_MulMod64(sqr,sqr,M); - e=e>>1; - } - return acc; - } - - uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) - { - ulong m=MWC_PowMod64(A, distance, M); - ulong x=curr.x*(ulong)A+curr.y; - x=MWC_MulMod64(x, m, M); - return (uint2)((uint)(x/A), (uint)(x%A)); - } - - uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) - { - // This is an arbitrary constant for starting LCG jumping from. I didn't - // want to start from 1, as then you end up with the two or three first values - // being a bit poor in ones - once you've decided that, one constant is as - // good as any another. There is no deep mathematical reason for it, I just - // generated a random number. - enum{ MWC_BASEID = 4077358422479273989UL }; - - ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; - ulong m=MWC_PowMod64(A, dist, M); - - ulong x=MWC_MulMod64(MWC_BASEID, m, M); - return (uint2)((uint)(x/A), (uint)(x%A)); - } - - //! Represents the state of a particular generator - typedef struct{ uint x; uint c; } mwc64x_state_t; - - enum{ MWC64X_A = 4294883355U }; - enum{ MWC64X_M = 18446383549859758079UL }; - - void MWC64X_Step(mwc64x_state_t *s) - { - uint X=s->x, C=s->c; - - uint Xn=MWC64X_A*X+C; - uint carry=(uint)(Xnx=Xn; - s->c=Cn; - } - - void MWC64X_Skip(mwc64x_state_t *s, ulong distance) - { - uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); - s->x=tmp.x; - s->c=tmp.y; - } - - void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) - { - uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); - s->x=tmp.x; - s->c=tmp.y; - } - - //! Return a 32-bit integer in the range [0..2^32) - uint MWC64X_NextUint(mwc64x_state_t *s) - { - uint res=s->x ^ s->c; - MWC64X_Step(s); - return res; - } - - // - // End of MWC64X excerpt - // - - float mwcReadPseudoRandomValue(mwc64x_state_t* rng) - { - return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 - } - - float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) - { - float - alpha, - beta, - noise, - sigma; - - noise = 0.0f; - alpha=mwcReadPseudoRandomValue(r); - switch(noise_type) - { - case UniformNoise: - default: - { - noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); - break; - } - case GaussianNoise: - { - float - gamma, - tau; - - if (alpha == 0.0f) - alpha=1.0f; - beta=mwcReadPseudoRandomValue(r); - gamma=sqrt(-2.0f*log(alpha)); - sigma=gamma*cospi((2.0f*beta)); - tau=gamma*sinpi((2.0f*beta)); - noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; - break; - } - case ImpulseNoise: - { - if (alpha < (SigmaImpulse/2.0f)) - noise=0.0f; - else - if (alpha >= (1.0f-(SigmaImpulse/2.0f))) - noise=QuantumRange; - else - noise=pixel; - break; - } - case LaplacianNoise: - { - if (alpha <= 0.5f) - { - if (alpha <= MagickEpsilon) - noise=(pixel-QuantumRange); - else - noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ - 0.5f); - break; - } - beta=1.0f-alpha; - if (beta <= (0.5f*MagickEpsilon)) - noise=(pixel+QuantumRange); - else - noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); - break; - } - case MultiplicativeGaussianNoise: - { - sigma=1.0f; - if (alpha > MagickEpsilon) - sigma=sqrt(-2.0f*log(alpha)); - beta=mwcReadPseudoRandomValue(r); - noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* - cospi((2.0f*beta))/2.0f); - break; - } - case PoissonNoise: - { - float - poisson; - unsigned int i; - poisson=exp(-SigmaPoisson*QuantumScale*pixel); - for (i=0; alpha > poisson; i++) - { - beta=mwcReadPseudoRandomValue(r); - alpha*=beta; - } - noise=(QuantumRange*i/SigmaPoisson); - break; - } - case RandomNoise: - { - noise=(QuantumRange*SigmaRandom*alpha); - break; - } - } - return noise; - } - - __kernel - void AddNoise(const __global CLQuantum *image, - const unsigned int number_channels,const ChannelType channel, - const unsigned int length,const unsigned int pixelsPerWorkItem, - const NoiseType noise_type,const float attenuate,const unsigned int seed0, - const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, - __global CLQuantum *filteredImage) - { - mwc64x_state_t rng; - rng.x = seed0; - rng.c = seed1; - - uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use - uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); - MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams - - uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); - uint count = pixelsPerWorkItem; - - while (count > 0 && pos < length) - { - const __global CLQuantum *p = image + pos; - __global CLQuantum *q = filteredImage + pos; - - float red; - float green; - float blue; - float alpha; - - ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); - - if ((channel & RedChannel) != 0) - red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); - - if (number_channels > 2) - { - if ((channel & GreenChannel) != 0) - green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); - - if ((channel & BlueChannel) != 0) - blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); - } - - if (((number_channels == 4) || (number_channels == 2)) && - ((channel & AlphaChannel) != 0)) - alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); - - WriteChannels(q, number_channels, channel, red, green, blue, alpha); - - pos += (get_local_size(0) * number_channels); - count--; - } - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% B l u r % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - /* - Reduce image noise and reduce detail levels by row - */ - __kernel void BlurRow(const __global CLQuantum *image, - const unsigned int number_channels,const ChannelType channel, - __constant float *filter,const unsigned int width, - const unsigned int imageColumns,const unsigned int imageRows, - __local float4 *temp,__global float4 *tempImage) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int columns = imageColumns; - - const unsigned int radius = (width-1)/2; - const int wsize = get_local_size(0); - const unsigned int loadSize = wsize+width; - - //group coordinate - const int groupX=get_local_size(0)*get_group_id(0); - - //parallel load and clamp - for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) - { - int cx = ClampToCanvas(i + groupX - radius, columns); - temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); - } - - // barrier - barrier(CLK_LOCAL_MEM_FENCE); - - // only do the work if this is not a patched item - if (get_global_id(0) < columns) - { - // compute - float4 result = (float4) 0; - - int i = 0; - - for ( ; i+7 < width; ) - { - for (int j=0; j < 8; j++) - result+=filter[i+j]*temp[i+j+get_local_id(0)]; - i+=8; - } - - for ( ; i < width; i++) - result+=filter[i]*temp[i+get_local_id(0)]; - - // write back to global - tempImage[y*columns+x] = result; - } - } - ) - - STRINGIFY( - /* - Reduce image noise and reduce detail levels by line - */ - __kernel void BlurColumn(const __global float4 *blurRowData, - const unsigned int number_channels,const ChannelType channel, - __constant float *filter,const unsigned int width, - const unsigned int imageColumns,const unsigned int imageRows, - __local float4 *temp,__global CLQuantum *filteredImage) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int columns = imageColumns; - const int rows = imageRows; - - unsigned int radius = (width-1)/2; - const int wsize = get_local_size(1); - const unsigned int loadSize = wsize+width; - - //group coordinate - const int groupX=get_local_size(0)*get_group_id(0); - const int groupY=get_local_size(1)*get_group_id(1); - //notice that get_local_size(0) is 1, so - //groupX=get_group_id(0); - - //parallel load and clamp - for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) - temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; - - // barrier - barrier(CLK_LOCAL_MEM_FENCE); - - // only do the work if this is not a patched item - if (get_global_id(1) < rows) - { - // compute - float4 result = (float4) 0; - - int i = 0; - - for ( ; i+7 < width; ) - { - for (int j=0; j < 8; j++) - result+=filter[i+j]*temp[i+j+get_local_id(1)]; - i+=8; - } - - for ( ; i < width; i++) - result+=filter[i]*temp[i+get_local_id(1)]; - - // write back to global - WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); - } - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% C o m p o s i t e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - inline float ColorDodge(const float Sca, - const float Sa,const float Dca,const float Da) - { - /* - Oct 2004 SVG specification. - */ - if ((Sca*Da+Dca*Sa) >= Sa*Da) - return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); - return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); - - - /* - New specification, March 2009 SVG specification. This specification was - also wrong of non-overlap cases. - */ - /* - if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) - return(Sca*(1.0-Da)); - if (fabs(Sca-Sa) < MagickEpsilon) - return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); - return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca))); - */ - - /* - Working from first principles using the original formula: - - f(Sc,Dc) = Dc/(1-Sc) - - This works correctly! Looks like the 2004 model was right but just - required a extra condition for correct handling. - */ - - /* - if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) - return(Sca*(1.0-Da)+Dca*(1.0-Sa)); - if (fabs(Sca-Sa) < MagickEpsilon) - return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); - return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); - */ - } - - inline void CompositeColorDodge(const float4 *p, - const float4 *q,float4 *composite) { - - float - Da, - gamma, - Sa; - - Sa=QuantumScale*getAlphaF4(*p); /* simplify and speed up equations */ - Da=QuantumScale*getAlphaF4(*q); - gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */ - setAlphaF4(composite,QuantumRange*gamma); - gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma); - setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale* - getRedF4(*q)*Da,Da)); - setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale* - getGreenF4(*q)*Da,Da)); - setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale* - getBlueF4(*q)*Da,Da)); - } - ) - - STRINGIFY( - inline void MagickPixelCompositePlus(const float4 *p, - const float alpha,const float4 *q, - const float beta,float4 *composite) - { - float - gamma; - - float - Da, - Sa; - /* - Add two pixels with the given opacities. - */ - Sa=QuantumScale*alpha; - Da=QuantumScale*beta; - gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */ - setAlphaF4(composite,QuantumRange*gamma); - gamma=PerceptibleReciprocal(gamma); - setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q))); - setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q))); - setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q))); - } - ) - - STRINGIFY( - inline void MagickPixelCompositeBlend(const float4 *p, - const float alpha,const float4 *q, - const float beta,float4 *composite) - { - MagickPixelCompositePlus(p,(float) (alpha* - (getAlphaF4(*p))),q,(float) (beta* - (getAlphaF4(*q))),composite); - } - ) - - STRINGIFY( - __kernel - void Composite(__global CLPixelType *image, - const unsigned int imageWidth, - const unsigned int imageHeight, - const __global CLPixelType *compositeImage, - const unsigned int compositeWidth, - const unsigned int compositeHeight, - const unsigned int compose, - const ChannelType channel, - const unsigned int matte, - const float destination_dissolve, - const float source_dissolve) { - - uint2 index; - index.x = get_global_id(0); - index.y = get_global_id(1); - - - if (index.x >= imageWidth - || index.y >= imageHeight) { - return; - } - const CLPixelType inputPixel = image[index.y*imageWidth+index.x]; - float4 destination; - setRedF4(&destination,getRed(inputPixel)); - setGreenF4(&destination,getGreen(inputPixel)); - setBlueF4(&destination,getBlue(inputPixel)); - - - const CLPixelType compositePixel - = compositeImage[index.y*imageWidth+index.x]; - float4 source; - setRedF4(&source,getRed(compositePixel)); - setGreenF4(&source,getGreen(compositePixel)); - setBlueF4(&source,getBlue(compositePixel)); - - if (matte != 0) { - setAlphaF4(&destination,getAlpha(inputPixel)); - setAlphaF4(&source,getAlpha(compositePixel)); - } - else { - setAlphaF4(&destination,1.0f); - setAlphaF4(&source,1.0f); - } - - float4 composite=destination; - - CompositeOperator op = (CompositeOperator)compose; - switch (op) { - case ColorDodgeCompositeOp: - CompositeColorDodge(&source,&destination,&composite); - break; - case BlendCompositeOp: - MagickPixelCompositeBlend(&source,source_dissolve,&destination, - destination_dissolve,&composite); - break; - default: - // unsupported operators - break; - }; - - CLPixelType outputPixel; - setRed(&outputPixel, ClampToQuantum(getRedF4(composite))); - setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite))); - setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite))); - setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite))); - image[index.y*imageWidth+index.x] = outputPixel; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% C o n t r a s t % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - - inline float3 ConvertRGBToHSB(CLPixelType pixel) { - float3 HueSaturationBrightness; - HueSaturationBrightness.x = 0.0f; // Hue - HueSaturationBrightness.y = 0.0f; // Saturation - HueSaturationBrightness.z = 0.0f; // Brightness - - float r=(float) getRed(pixel); - float g=(float) getGreen(pixel); - float b=(float) getBlue(pixel); - - float tmin=MagickMin(MagickMin(r,g),b); - float tmax=MagickMax(MagickMax(r,g),b); - - if (tmax!=0.0f) { - float delta=tmax-tmin; - HueSaturationBrightness.y=delta/tmax; - HueSaturationBrightness.z=QuantumScale*tmax; - - if (delta != 0.0f) { - HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); - HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta; - HueSaturationBrightness.x/=6.0f; - HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; - } - } - return HueSaturationBrightness; - } - - inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { - - float hue = HueSaturationBrightness.x; - float brightness = HueSaturationBrightness.z; - float saturation = HueSaturationBrightness.y; - - CLPixelType rgb; - - if (saturation == 0.0f) { - setRed(&rgb,ClampToQuantum(QuantumRange*brightness)); - setGreen(&rgb,getRed(rgb)); - setBlue(&rgb,getRed(rgb)); - } - else { - - float h=6.0f*(hue-floor(hue)); - float f=h-floor(h); - float p=brightness*(1.0f-saturation); - float q=brightness*(1.0f-saturation*f); - float t=brightness*(1.0f-(saturation*(1.0f-f))); - - float clampedBrightness = ClampToQuantum(QuantumRange*brightness); - float clamped_t = ClampToQuantum(QuantumRange*t); - float clamped_p = ClampToQuantum(QuantumRange*p); - float clamped_q = ClampToQuantum(QuantumRange*q); - int ih = (int)h; - setRed(&rgb, (ih == 1)?clamped_q: - (ih == 2 || ih == 3)?clamped_p: - (ih == 4)?clamped_t: - clampedBrightness); - - setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness: - (ih == 3)?clamped_q: - (ih == 4 || ih == 5)?clamped_p: - clamped_t); - - setBlue(&rgb, (ih == 2)?clamped_t: - (ih == 3 || ih == 4)?clampedBrightness: - (ih == 5)?clamped_q: - clamped_p); - } - return rgb; - } - - __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) - { - - const int sign = sharpen!=0?1:-1; - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int c = x + y * columns; - - CLPixelType pixel = im[c]; - float3 HueSaturationBrightness = ConvertRGBToHSB(pixel); - float brightness = HueSaturationBrightness.z; - brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); - brightness = clamp(brightness,0.0f,1.0f); - HueSaturationBrightness.z = brightness; - - CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness); - filteredPixel.w = pixel.w; - im[c] = filteredPixel; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% C o n t r a s t S t r e t c h % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - /* - */ - __kernel void Histogram(__global CLPixelType * restrict im, - const ChannelType channel, - const unsigned int colorspace, - const unsigned int method, - __global uint4 * restrict histogram) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int c = x + y * columns; - if ((channel & SyncChannels) != 0) - { - float red=(float)getRed(im[c]); - float green=(float)getGreen(im[c]); - float blue=(float)getBlue(im[c]); - - float intensity = GetPixelIntensity(colorspace, method, red, green, blue); - uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); - atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position - } - else - { - // for equalizing, we always need all channels? - // otherwise something more - } - } - ) - - STRINGIFY( - /* - */ - __kernel void ContrastStretch(__global CLPixelType * restrict im, - const ChannelType channel, - __global CLPixelType * restrict stretch_map, - const float4 white, const float4 black) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int c = x + y * columns; - - uint ePos; - CLPixelType oValue, eValue; - CLQuantum red, green, blue, alpha; - - //read from global - oValue=im[c]; - - if ((channel & RedChannel) != 0) - { - if (getRedF4(white) != getRedF4(black)) - { - ePos = ScaleQuantumToMap(getRed(oValue)); - eValue = stretch_map[ePos]; - red = getRed(eValue); - } - } - - if ((channel & GreenChannel) != 0) - { - if (getGreenF4(white) != getGreenF4(black)) - { - ePos = ScaleQuantumToMap(getGreen(oValue)); - eValue = stretch_map[ePos]; - green = getGreen(eValue); - } - } - - if ((channel & BlueChannel) != 0) - { - if (getBlueF4(white) != getBlueF4(black)) - { - ePos = ScaleQuantumToMap(getBlue(oValue)); - eValue = stretch_map[ePos]; - blue = getBlue(eValue); - } - } - - if ((channel & AlphaChannel) != 0) - { - if (getAlphaF4(white) != getAlphaF4(black)) - { - ePos = ScaleQuantumToMap(getAlpha(oValue)); - eValue = stretch_map[ePos]; - alpha = getAlpha(eValue); - } - } - - //write back - im[c]=(CLPixelType)(blue, green, red, alpha); - - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% C o n v o l v e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - __kernel - void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, - const unsigned int imageWidth, const unsigned int imageHeight, - __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, - const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { - - int2 blockID; - blockID.x = get_global_id(0) / get_local_size(0); - blockID.y = get_global_id(1) / get_local_size(1); - - // image area processed by this workgroup - int2 imageAreaOrg; - imageAreaOrg.x = blockID.x * get_local_size(0); - imageAreaOrg.y = blockID.y * get_local_size(1); - - int2 midFilterDimen; - midFilterDimen.x = (filterWidth-1)/2; - midFilterDimen.y = (filterHeight-1)/2; - - int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; - - // dimension of the local cache - int2 cachedAreaDimen; - cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; - cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; - - // cache the pixels accessed by this workgroup in local memory - int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); - int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; - int groupSize = get_local_size(0) * get_local_size(1); - for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { - - int2 cachedAreaIndex; - cachedAreaIndex.x = i % cachedAreaDimen.x; - cachedAreaIndex.y = i / cachedAreaDimen.x; - - int2 imagePixelIndex; - imagePixelIndex = cachedAreaOrg + cachedAreaIndex; - - // only support EdgeVirtualPixelMethod through ClampToCanvas - // TODO: implement other virtual pixel method - imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); - imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); - - pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; - } - - // cache the filter - for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { - filterCache[i] = filter[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - - int2 imageIndex; - imageIndex.x = imageAreaOrg.x + get_local_id(0); - imageIndex.y = imageAreaOrg.y + get_local_id(1); - - // if out-of-range, stops here and quit - if (imageIndex.x >= imageWidth - || imageIndex.y >= imageHeight) { - return; - } - - int filterIndex = 0; - float4 sum = (float4)0.0f; - float gamma = 0.0f; - if (((channel & AlphaChannel) == 0) || (matte == 0)) { - int cacheIndexY = get_local_id(1); - for (int j = 0; j < filterHeight; j++) { - int cacheIndexX = get_local_id(0); - for (int i = 0; i < filterWidth; i++) { - CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; - float f = filterCache[filterIndex]; - - sum.x += f * p.x; - sum.y += f * p.y; - sum.z += f * p.z; - sum.w += f * p.w; - - gamma += f; - filterIndex++; - cacheIndexX++; - } - cacheIndexY++; - } - } - else { - int cacheIndexY = get_local_id(1); - for (int j = 0; j < filterHeight; j++) { - int cacheIndexX = get_local_id(0); - for (int i = 0; i < filterWidth; i++) { - - CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; - float alpha = QuantumScale*p.w; - float f = filterCache[filterIndex]; - float g = alpha * f; - - sum.x += g*p.x; - sum.y += g*p.y; - sum.z += g*p.z; - sum.w += f*p.w; - - gamma += g; - filterIndex++; - cacheIndexX++; - } - cacheIndexY++; - } - gamma = PerceptibleReciprocal(gamma); - sum.xyz = gamma*sum.xyz; - } - CLPixelType outputPixel; - outputPixel.x = ClampToQuantum(sum.x); - outputPixel.y = ClampToQuantum(sum.y); - outputPixel.z = ClampToQuantum(sum.z); - outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; - - output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; - } - ) - - STRINGIFY( - __kernel - void Convolve(const __global CLPixelType *input, __global CLPixelType *output, - const uint imageWidth, const uint imageHeight, - __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, - const uint matte, const ChannelType channel) { - - int2 imageIndex; - imageIndex.x = get_global_id(0); - imageIndex.y = get_global_id(1); - - /* - unsigned int imageWidth = get_global_size(0); - unsigned int imageHeight = get_global_size(1); - */ - if (imageIndex.x >= imageWidth - || imageIndex.y >= imageHeight) - return; - - int2 midFilterDimen; - midFilterDimen.x = (filterWidth-1)/2; - midFilterDimen.y = (filterHeight-1)/2; - - int filterIndex = 0; - float4 sum = (float4)0.0f; - float gamma = 0.0f; - if (((channel & AlphaChannel) == 0) || (matte == 0)) { - for (int j = 0; j < filterHeight; j++) { - int2 inputPixelIndex; - inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; - inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); - for (int i = 0; i < filterWidth; i++) { - inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; - inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); - - CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; - float f = filter[filterIndex]; - - sum.x += f * p.x; - sum.y += f * p.y; - sum.z += f * p.z; - sum.w += f * p.w; - - gamma += f; - - filterIndex++; - } - } - } - else { - - for (int j = 0; j < filterHeight; j++) { - int2 inputPixelIndex; - inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; - inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); - for (int i = 0; i < filterWidth; i++) { - inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; - inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); - - CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; - float alpha = QuantumScale*p.w; - float f = filter[filterIndex]; - float g = alpha * f; - - sum.x += g*p.x; - sum.y += g*p.y; - sum.z += g*p.z; - sum.w += f*p.w; - - gamma += g; - - - filterIndex++; - } - } - gamma = PerceptibleReciprocal(gamma); - sum.xyz = gamma*sum.xyz; - } - - CLPixelType outputPixel; - outputPixel.x = ClampToQuantum(sum.x); - outputPixel.y = ClampToQuantum(sum.y); - outputPixel.z = ClampToQuantum(sum.z); - outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; - - output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% D e s p e c k l e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - - __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage - , const unsigned int imageWidth, const unsigned int imageHeight - , const int2 offset, const int polarity, const int matte) { - - int x = get_global_id(0); - int y = get_global_id(1); - - CLPixelType v = inputImage[y*imageWidth+x]; - - int2 neighbor; - neighbor.y = y + offset.y; - neighbor.x = x + offset.x; - - int2 clampedNeighbor; - clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); - clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); - - CLPixelType r = (clampedNeighbor.x == neighbor.x - && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] - :(CLPixelType)0; - - int sv[4]; - sv[0] = (int)v.x; - sv[1] = (int)v.y; - sv[2] = (int)v.z; - sv[3] = (int)v.w; - - int sr[4]; - sr[0] = (int)r.x; - sr[1] = (int)r.y; - sr[2] = (int)r.z; - sr[3] = (int)r.w; - - if (polarity > 0) { - \n #pragma unroll 4\n - for (unsigned int i = 0; i < 4; i++) { - sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; - } - } - else { - \n #pragma unroll 4\n - for (unsigned int i = 0; i < 4; i++) { - sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; - } - - } - - v.x = (CLQuantum)sv[0]; - v.y = (CLQuantum)sv[1]; - v.z = (CLQuantum)sv[2]; - - if (matte!=0) - v.w = (CLQuantum)sv[3]; - - outputImage[y*imageWidth+x] = v; - - } - - - ) - - STRINGIFY( - - __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage - , const unsigned int imageWidth, const unsigned int imageHeight - , const int2 offset, const int polarity, const int matte) { - - int x = get_global_id(0); - int y = get_global_id(1); - - CLPixelType v = inputImage[y*imageWidth+x]; - - int2 neighbor, clampedNeighbor; - - neighbor.y = y + offset.y; - neighbor.x = x + offset.x; - clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); - clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); - - CLPixelType r = (clampedNeighbor.x == neighbor.x - && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] - :(CLPixelType)0; - - - neighbor.y = y - offset.y; - neighbor.x = x - offset.x; - clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); - clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); - - CLPixelType s = (clampedNeighbor.x == neighbor.x - && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] - :(CLPixelType)0; - - - int sv[4]; - sv[0] = (int)v.x; - sv[1] = (int)v.y; - sv[2] = (int)v.z; - sv[3] = (int)v.w; - - int sr[4]; - sr[0] = (int)r.x; - sr[1] = (int)r.y; - sr[2] = (int)r.z; - sr[3] = (int)r.w; - - int ss[4]; - ss[0] = (int)s.x; - ss[1] = (int)s.y; - ss[2] = (int)s.z; - ss[3] = (int)s.w; - - if (polarity > 0) { - \n #pragma unroll 4\n - for (unsigned int i = 0; i < 4; i++) { - //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; - // - //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); - //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); - sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); - } - } - else { - \n #pragma unroll 4\n - for (unsigned int i = 0; i < 4; i++) { - //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; - // - //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); - sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); - } - } - - v.x = (CLQuantum)sv[0]; - v.y = (CLQuantum)sv[1]; - v.z = (CLQuantum)sv[2]; - - if (matte!=0) - v.w = (CLQuantum)sv[3]; - - outputImage[y*imageWidth+x] = v; - - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% E q u a l i z e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - /* - */ - __kernel void Equalize(__global CLPixelType * restrict im, - const ChannelType channel, - __global CLPixelType * restrict equalize_map, - const float4 white, const float4 black) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int c = x + y * columns; - - uint ePos; - CLPixelType oValue, eValue; - CLQuantum red, green, blue, alpha; - - //read from global - oValue=im[c]; - - if ((channel & SyncChannels) != 0) - { - if (getRedF4(white) != getRedF4(black)) - { - ePos = ScaleQuantumToMap(getRed(oValue)); - eValue = equalize_map[ePos]; - red = getRed(eValue); - ePos = ScaleQuantumToMap(getGreen(oValue)); - eValue = equalize_map[ePos]; - green = getRed(eValue); - ePos = ScaleQuantumToMap(getBlue(oValue)); - eValue = equalize_map[ePos]; - blue = getRed(eValue); - ePos = ScaleQuantumToMap(getAlpha(oValue)); - eValue = equalize_map[ePos]; - alpha = getRed(eValue); - - //write back - im[c]=(CLPixelType)(blue, green, red, alpha); - } - - } - - // for equalizing, we always need all channels? - // otherwise something more - - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% F u n c t i o n % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - /* - apply FunctionImageChannel(braightness-contrast) - */ - CLQuantum ApplyFunction(float pixel,const MagickFunction function, - const unsigned int number_parameters,__constant float *parameters) - { - float result = 0.0f; - - switch (function) - { - case PolynomialFunction: - { - for (unsigned int i=0; i < number_parameters; i++) - result = result*QuantumScale*pixel + parameters[i]; - result *= QuantumRange; - break; - } - case SinusoidFunction: - { - float freq,phase,ampl,bias; - freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; - phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; - ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; - bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; - result = QuantumRange*(ampl*sin(2.0f*MagickPI* - (freq*QuantumScale*pixel + phase/360.0f)) + bias); - break; - } - case ArcsinFunction: - { - float width,range,center,bias; - width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; - center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; - range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; - bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; - - result = 2.0f/width*(QuantumScale*pixel - center); - result = range/MagickPI*asin(result)+bias; - result = ( result <= -1.0f ) ? bias - range/2.0f : result; - result = ( result >= 1.0f ) ? bias + range/2.0f : result; - result *= QuantumRange; - break; - } - case ArctanFunction: - { - float slope,range,center,bias; - slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; - center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; - range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; - bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; - result = MagickPI*slope*(QuantumScale*pixel-center); - result = QuantumRange*(range/MagickPI*atan(result) + bias); - break; - } - case UndefinedFunction: - break; - } - return(ClampToQuantum(result)); - } - ) - - STRINGIFY( - /* - Improve brightness / contrast of the image - channel : define which channel is improved - function : the function called to enchance the brightness contrast - number_parameters : numbers of parameters - parameters : the parameter - */ - __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, - const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, - __constant float *parameters) - { - const unsigned int x = get_global_id(0); - const unsigned int y = get_global_id(1); - const unsigned int columns = get_global_size(0); - __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); - - float red; - float green; - float blue; - float alpha; - - ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); - - if ((channel & RedChannel) != 0) - red=ApplyFunction(red, function, number_parameters, parameters); - - if (number_channels > 2) - { - if ((channel & GreenChannel) != 0) - green=ApplyFunction(green, function, number_parameters, parameters); - - if ((channel & BlueChannel) != 0) - blue=ApplyFunction(blue, function, number_parameters, parameters); - } - - if (((number_channels == 4) || (number_channels == 2)) && - ((channel & AlphaChannel) != 0)) - alpha=ApplyFunction(alpha, function, number_parameters, parameters); - - WriteChannels(p, number_channels, channel, red, green, blue, alpha); - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% G r a y s c a l e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - __kernel void Grayscale(__global CLQuantum *image,const int number_channels, - const unsigned int colorspace,const unsigned int method) - { - const unsigned int x = get_global_id(0); - const unsigned int y = get_global_id(1); - const unsigned int columns = get_global_size(0); - __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); - - float - blue, - green, - red; - - red=getPixelRed(p); - green=getPixelGreen(p); - blue=getPixelBlue(p); - - CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); - - setPixelRed(p,intensity); - setPixelGreen(p,intensity); - setPixelBlue(p,intensity); - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% L o c a l C o n t r a s t % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - - __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, - const int radius, - const int imageWidth, - const int imageHeight) - { - const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); - - int x = get_local_id(0); - int y = get_global_id(1); - - global CLPixelType *src = srcImage + y * imageWidth; - - for (int i = x; i < imageWidth; i += get_local_size(0)) { - float sum = 0.0f; - float weight = 1.0f; - - int j = i - radius; - while ((j + 7) < i) { - for (int k = 0; k < 8; ++k) // Unroll 8x - sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); - weight += 8.0f; - j+=8; - } - while (j < i) { - sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); - weight += 1.0f; - ++j; - } - - while ((j + 7) < radius + i) { - for (int k = 0; k < 8; ++k) // Unroll 8x - sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); - weight -= 8.0f; - j+=8; - } - while (j < radius + i) { - sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); - weight -= 1.0f; - ++j; - } - - tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); - } - } - ) - - STRINGIFY( - __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, - const int radius, - const float strength, - const int imageWidth, - const int imageHeight) - { - const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); - - int x = get_global_id(0); - int y = get_global_id(1); - - if ((x >= imageWidth) || (y >= imageHeight)) - return; - - global float *src = blurImage + x; - - float sum = 0.0f; - float weight = 1.0f; - - int j = y - radius; - while ((j + 7) < y) { - for (int k = 0; k < 8; ++k) // Unroll 8x - sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; - weight += 8.0f; - j+=8; - } - while (j < y) { - sum += weight * src[mirrorBottom(j) * imageWidth]; - weight += 1.0f; - ++j; - } - - while ((j + 7) < radius + y) { - for (int k = 0; k < 8; ++k) // Unroll 8x - sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; - weight -= 8.0f; - j+=8; - } - while (j < radius + y) { - sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; - weight -= 1.0f; - ++j; - } - - CLPixelType pixel = srcImage[x + y * imageWidth]; - float srcVal = dot(RGB, convert_float4(pixel)); - float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); - mult = (srcVal + mult) / srcVal; - - pixel.x = ClampToQuantum(pixel.x * mult); - pixel.y = ClampToQuantum(pixel.y * mult); - pixel.z = ClampToQuantum(pixel.z * mult); - - dstImage[x + y * imageWidth] = pixel; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% M o d u l a t e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - - inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, - float *hue, float *saturation, float *lightness) - { - float - c, - tmax, - tmin; - - /* - Convert RGB to HSL colorspace. - */ - tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); - tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); - - c=tmax-tmin; - - *lightness=(tmax+tmin)/2.0; - if (c <= 0.0) - { - *hue=0.0; - *saturation=0.0; - return; - } - - if (tmax == (QuantumScale*red)) - { - *hue=(QuantumScale*green-QuantumScale*blue)/c; - if ((QuantumScale*green) < (QuantumScale*blue)) - *hue+=6.0; - } - else - if (tmax == (QuantumScale*green)) - *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; - else - *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; - - *hue*=60.0/360.0; - if (*lightness <= 0.5) - *saturation=c/(2.0*(*lightness)); - else - *saturation=c/(2.0-2.0*(*lightness)); - } - - inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, - CLQuantum *red,CLQuantum *green,CLQuantum *blue) - { - float - b, - c, - g, - h, - tmin, - r, - x; - - /* - Convert HSL to RGB colorspace. - */ - h=hue*360.0; - if (lightness <= 0.5) - c=2.0*lightness*saturation; - else - c=(2.0-2.0*lightness)*saturation; - tmin=lightness-0.5*c; - h-=360.0*floor(h/360.0); - h/=60.0; - x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); - switch ((int) floor(h) % 6) - { - case 0: - { - r=tmin+c; - g=tmin+x; - b=tmin; - break; - } - case 1: - { - r=tmin+x; - g=tmin+c; - b=tmin; - break; - } - case 2: - { - r=tmin; - g=tmin+c; - b=tmin+x; - break; - } - case 3: - { - r=tmin; - g=tmin+x; - b=tmin+c; - break; - } - case 4: - { - r=tmin+x; - g=tmin; - b=tmin+c; - break; - } - case 5: - { - r=tmin+c; - g=tmin; - b=tmin+x; - break; - } - default: - { - r=0.0; - g=0.0; - b=0.0; - } - } - *red=ClampToQuantum(QuantumRange*r); - *green=ClampToQuantum(QuantumRange*g); - *blue=ClampToQuantum(QuantumRange*b); - } - - inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, - CLQuantum *red,CLQuantum *green,CLQuantum *blue) - { - float - hue, - lightness, - saturation; - - /* - Increase or decrease color lightness, saturation, or hue. - */ - ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); - hue+=0.5*(0.01*percent_hue-1.0); - while (hue < 0.0) - hue+=1.0; - while (hue >= 1.0) - hue-=1.0; - saturation*=0.01*percent_saturation; - lightness*=0.01*percent_lightness; - ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); - } - - __kernel void Modulate(__global CLPixelType *im, - const float percent_brightness, - const float percent_hue, - const float percent_saturation, - const int colorspace) - { - - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int c = x + y * columns; - - CLPixelType pixel = im[c]; - - CLQuantum - blue, - green, - red; - - red=getRed(pixel); - green=getGreen(pixel); - blue=getBlue(pixel); - - switch (colorspace) - { - case HSLColorspace: - default: - { - ModulateHSL(percent_hue, percent_saturation, percent_brightness, - &red, &green, &blue); - } - - } - - CLPixelType filteredPixel; - - setRed(&filteredPixel, red); - setGreen(&filteredPixel, green); - setBlue(&filteredPixel, blue); - filteredPixel.w = pixel.w; - - im[c] = filteredPixel; - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% M o t i o n B l u r % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - __kernel - void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, - const unsigned int imageWidth, const unsigned int imageHeight, - const __global float *filter, const unsigned int width, const __global int2* offset, - const float4 bias, - const ChannelType channel, const unsigned int matte) { - - int2 currentPixel; - currentPixel.x = get_global_id(0); - currentPixel.y = get_global_id(1); - - if (currentPixel.x >= imageWidth - || currentPixel.y >= imageHeight) - return; - - float4 pixel; - pixel.x = (float)bias.x; - pixel.y = (float)bias.y; - pixel.z = (float)bias.z; - pixel.w = (float)bias.w; - - if (((channel & AlphaChannel) == 0) || (matte == 0)) { - - for (int i = 0; i < width; i++) { - // only support EdgeVirtualPixelMethod through ClampToCanvas - // TODO: implement other virtual pixel method - int2 samplePixel = currentPixel + offset[i]; - samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); - samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); - CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; - - pixel.x += (filter[i] * (float)samplePixelValue.x); - pixel.y += (filter[i] * (float)samplePixelValue.y); - pixel.z += (filter[i] * (float)samplePixelValue.z); - pixel.w += (filter[i] * (float)samplePixelValue.w); - } - - CLPixelType outputPixel; - outputPixel.x = ClampToQuantum(pixel.x); - outputPixel.y = ClampToQuantum(pixel.y); - outputPixel.z = ClampToQuantum(pixel.z); - outputPixel.w = ClampToQuantum(pixel.w); - output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; - } - else { - - float gamma = 0.0f; - for (int i = 0; i < width; i++) { - // only support EdgeVirtualPixelMethod through ClampToCanvas - // TODO: implement other virtual pixel method - int2 samplePixel = currentPixel + offset[i]; - samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); - samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); - - CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; - - float alpha = QuantumScale*samplePixelValue.w; - float k = filter[i]; - pixel.x = pixel.x + k * alpha * samplePixelValue.x; - pixel.y = pixel.y + k * alpha * samplePixelValue.y; - pixel.z = pixel.z + k * alpha * samplePixelValue.z; - - pixel.w += k * alpha * samplePixelValue.w; - - gamma+=k*alpha; - } - gamma = PerceptibleReciprocal(gamma); - pixel.xyz = gamma*pixel.xyz; - - CLPixelType outputPixel; - outputPixel.x = ClampToQuantum(pixel.x); - outputPixel.y = ClampToQuantum(pixel.y); - outputPixel.z = ClampToQuantum(pixel.z); - outputPixel.w = ClampToQuantum(pixel.w); - output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; - } - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% R e s i z e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - // Based on Box from resize.c - float BoxResizeFilter(const float x) - { - return 1.0f; - } - ) - - STRINGIFY( - // Based on CubicBC from resize.c - float CubicBC(const float x,const __global float* resizeFilterCoefficients) - { - /* - Cubic Filters using B,C determined values: - Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter - Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears - Spline B = 1 C = 0 B-Spline Gaussian approximation - Hermite B = 0 C = 0 B-Spline interpolator - - See paper by Mitchell and Netravali, Reconstruction Filters in Computer - Graphics Computer Graphics, Volume 22, Number 4, August 1988 - http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ - Mitchell.pdf. - - Coefficents are determined from B,C values: - P0 = ( 6 - 2*B )/6 = coeff[0] - P1 = 0 - P2 = (-18 +12*B + 6*C )/6 = coeff[1] - P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] - Q0 = ( 8*B +24*C )/6 = coeff[3] - Q1 = ( -12*B -48*C )/6 = coeff[4] - Q2 = ( 6*B +30*C )/6 = coeff[5] - Q3 = ( - 1*B - 6*C )/6 = coeff[6] - - which are used to define the filter: - - P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 - Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 - - which ensures function is continuous in value and derivative (slope). - */ - if (x < 1.0) - return(resizeFilterCoefficients[0]+x*(x* - (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); - if (x < 2.0) - return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* - (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); - return(0.0); - } - ) - - STRINGIFY( - float Sinc(const float x) - { - if (x != 0.0f) - { - const float alpha=(float) (MagickPI*x); - return sinpi(x)/alpha; - } - return(1.0f); - } - ) - - STRINGIFY( - float Triangle(const float x) - { - /* - 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or - a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function - for Sinc(). - */ - return ((x<1.0f)?(1.0f-x):0.0f); - } - ) - - - STRINGIFY( - float Hann(const float x) - { - /* - Cosine window function: - 0.5+0.5*cos(pi*x). - */ - const float cosine=cos((MagickPI*x)); - return(0.5f+0.5f*cosine); - } - ) - - STRINGIFY( - float Hamming(const float x) - { - /* - Offset cosine window function: - .54 + .46 cos(pi x). - */ - const float cosine=cos((MagickPI*x)); - return(0.54f+0.46f*cosine); - } - ) - - STRINGIFY( - float Blackman(const float x) - { - /* - Blackman: 2nd order cosine windowing function: - 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) - - Refactored by Chantal Racette and Nicolas Robidoux to one trig call and - five flops. - */ - const float cosine=cos((MagickPI*x)); - return(0.34f+cosine*(0.5f+cosine*0.16f)); - } - ) - - STRINGIFY( - inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) - { - switch (filterType) - { - /* Call Sinc even for SincFast to get better precision on GPU - and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ - case SincWeightingFunction: - case SincFastWeightingFunction: - return Sinc(x); - case CubicBCWeightingFunction: - return CubicBC(x,filterCoefficients); - case BoxWeightingFunction: - return BoxResizeFilter(x); - case TriangleWeightingFunction: - return Triangle(x); - case HannWeightingFunction: - return Hann(x); - case HammingWeightingFunction: - return Hamming(x); - case BlackmanWeightingFunction: - return Blackman(x); - - default: - return 0.0f; - } - } - ) - - - STRINGIFY( - inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType - , const ResizeWeightingFunctionType resizeWindowType - , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) - { - float scale; - float xBlur = fabs(x/resizeFilterBlur); - if (resizeWindowSupport < MagickEpsilon - || resizeWindowType == BoxWeightingFunction) - { - scale = 1.0f; - } - else - { - scale = resizeFilterScale; - scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); - } - float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); - return weight; - } - - ) - - ; - const char *accelerateKernels2 = - - STRINGIFY( - - inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { - return (numWorkItems/pixelPerWorkgroup); - } - - // returns the index of the pixel for the current workitem to compute. - // returns -1 if this workitem doesn't need to participate in any computation - inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { - const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); - int pixelIndex = itemID/numWorkItemsPerPixel; - pixelIndex = (pixelIndex 2) - { - cp.y = (float) *(p + 1); - cp.z = (float) *(p + 2); - } - - if (alpha_index != 0) - { - cp.w = (float) *(p + alpha_index); - - float alpha = weight * QuantumScale * cp.w; - - filteredPixel.x += alpha * cp.x; - filteredPixel.y += alpha * cp.y; - filteredPixel.z += alpha * cp.z; - filteredPixel.w += weight * cp.w; - gamma += alpha; - } - else - filteredPixel += ((float4) weight)*cp; - - density += weight; - } - } - } - - // initialize the accumulators to zero - if (itemID < actualNumPixelInThisChunk) { - outputPixelCache[itemID] = (float4)0.0f; - densityCache[itemID] = 0.0f; - if (alpha_index != 0) - gammaCache[itemID] = 0.0f; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // accumulatte the filtered pixel value and the density - for (unsigned int i = 0; i < numItems; i++) { - if (pixelIndex != -1) { - if (itemID%numItems == i) { - outputPixelCache[pixelIndex]+=filteredPixel; - densityCache[pixelIndex]+=density; - if (alpha_index != 0) - gammaCache[pixelIndex]+=gamma; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemID < actualNumPixelInThisChunk) - { - float4 filteredPixel = outputPixelCache[itemID]; - - float gamma = 0.0f; - if (alpha_index != 0) - gamma = gammaCache[itemID]; - - float density = densityCache[itemID]; - if ((density != 0.0f) && (density != 1.0f)) - { - density = PerceptibleReciprocal(density); - filteredPixel *= (float4) density; - gamma *= density; - } - - if (alpha_index != 0) - { - gamma = PerceptibleReciprocal(gamma); - filteredPixel.x *= gamma; - filteredPixel.y *= gamma; - filteredPixel.z *= gamma; - } - - WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel); - } - } - } - ) - - - STRINGIFY( - __kernel __attribute__((reqd_work_group_size(1, 256, 1))) - void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, - const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, - const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, - const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, - const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, - const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, - const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, - __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) - { - // calculate the range of resized image pixels computed by this workgroup - const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; - const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); - const unsigned int actualNumPixelToCompute = stopY - startY; - - // calculate the range of input image pixels to cache - float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); - const float support = MagickMax(scale*resizeFilterSupport,0.5f); - scale = PerceptibleReciprocal(scale); - - const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); - const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); - - // cache the input pixels into local memory - const unsigned int x = get_global_id(0); - unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); - unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; - unsigned int stride = inputColumns * number_channels; - for (unsigned int i = 0; i < number_channels; i++) - { - event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); - wait_group_events(1,&e); - } - - unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; - unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; - for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) - { - const unsigned int chunkStartY = startY + chunk*pixelChunkSize; - const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); - const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; - - // determine which resized pixel computed by this workitem - const unsigned int itemID = get_local_id(1); - const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); - - const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); - - float4 filteredPixel = (float4)0.0f; - float density = 0.0f; - float gamma = 0.0f; - // -1 means this workitem doesn't participate in the computation - if (pixelIndex != -1) - { - // x coordinated of the resized pixel computed by this workitem - const int y = chunkStartY + pixelIndex; - - // calculate how many steps required for this pixel - const float bisect = (y+0.5)/yFactor+MagickEpsilon; - const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); - const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); - const unsigned int n = stop - start; - - // calculate how many steps this workitem will contribute - unsigned int numStepsPerWorkItem = n / numItems; - numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); - - const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; - if (startStep < n) - { - const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); - - unsigned int cacheIndex = start+startStep-cacheRangeStartY; - for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) - { - float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, - (ResizeWeightingFunctionType) resizeFilterType, - (ResizeWeightingFunctionType) resizeWindowType, - resizeFilterScale, resizeFilterWindowSupport, - resizeFilterBlur, scale*(start + i - bisect + 0.5)); - - float4 cp = (float4)0.0f; - - __local CLQuantum *p = inputImageCache + cacheIndex; - cp.x = (float) *(p); - if (number_channels > 2) - { - cp.y = (float) *(p + rangeLength); - cp.z = (float) *(p + (rangeLength * 2)); - } - - if (alpha_index != 0) - { - cp.w = (float) *(p + (rangeLength * alpha_index)); - - float alpha = weight * QuantumScale * cp.w; - - filteredPixel.x += alpha * cp.x; - filteredPixel.y += alpha * cp.y; - filteredPixel.z += alpha * cp.z; - filteredPixel.w += weight * cp.w; - gamma += alpha; - } - else - filteredPixel += ((float4) weight)*cp; - - density += weight; - } - } - } - - // initialize the accumulators to zero - if (itemID < actualNumPixelInThisChunk) { - outputPixelCache[itemID] = (float4)0.0f; - densityCache[itemID] = 0.0f; - if (alpha_index != 0) - gammaCache[itemID] = 0.0f; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // accumulatte the filtered pixel value and the density - for (unsigned int i = 0; i < numItems; i++) { - if (pixelIndex != -1) { - if (itemID%numItems == i) { - outputPixelCache[pixelIndex]+=filteredPixel; - densityCache[pixelIndex]+=density; - if (alpha_index != 0) - gammaCache[pixelIndex]+=gamma; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemID < actualNumPixelInThisChunk) - { - float4 filteredPixel = outputPixelCache[itemID]; - - float gamma = 0.0f; - if (alpha_index != 0) - gamma = gammaCache[itemID]; - - float density = densityCache[itemID]; - if ((density != 0.0f) && (density != 1.0f)) - { - density = PerceptibleReciprocal(density); - filteredPixel *= (float4) density; - gamma *= density; - } - - if (alpha_index != 0) - { - gamma = PerceptibleReciprocal(gamma); - filteredPixel.x *= gamma; - filteredPixel.y *= gamma; - filteredPixel.z *= gamma; - } - - WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel); - } - } - } - ) - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% R o t a t i o n a l B l u r % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -*/ - - STRINGIFY( - __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels, - const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta, - __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage) - { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int columns = get_global_size(0); - const int rows = get_global_size(1); - unsigned int step = 1; - float center_x = (float) x - blurCenter.x; - float center_y = (float) y - blurCenter.y; - float radius = hypot(center_x, center_y); - - float blur_radius = hypot(blurCenter.x, blurCenter.y); - - if (radius > MagickEpsilon) - { - step = (unsigned int) (blur_radius / radius); - if (step == 0) - step = 1; - if (step >= cossin_theta_size) - step = cossin_theta_size-1; - } - - float4 result = bias; - - float normalize = 0.0f; - float gamma = 0.0f; - - for (unsigned int i=0; i= 0) && (groupStopY < rows)) - { - event_t e = async_work_group_strided_copy(cachedData, - blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); - wait_group_events(1,&e); - } - else - { - for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) - cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; - - barrier(CLK_LOCAL_MEM_FENCE); - } - // cache the filter as well - event_t e = async_work_group_copy(cachedFilter,filter,width,0); - wait_group_events(1,&e); - - // only do the work if this is not a patched item - const int cy = get_global_id(1); - - if (cy < rows) - { - float4 blurredPixel = (float4) 0.0f; - - int i = 0; - - for ( ; i+7 < width; ) - { - for (int j=0; j < 8; j++, i++) - blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; - } - - for ( ; i < width; i++) - blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; - - float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); - float4 outputPixel = inputImagePixel - blurredPixel; - - float quantumThreshold = QuantumRange*threshold; - - int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); - outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); - - //write back - WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); - } - } - ) - - STRINGIFY( - __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, - const ChannelType channel,__constant float *filter,const unsigned int width, - const unsigned int columns,const unsigned int rows,__local float4 *pixels, - const float gain,const float threshold,__global CLQuantum *filteredImage) - { - const unsigned int x = get_global_id(0); - const unsigned int y = get_global_id(1); - - const unsigned int radius = (width - 1) / 2; - - int row = y - radius; - int baseRow = get_group_id(1) * get_local_size(1) - radius; - int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; - - while (row < endRow) { - int srcy = (row < 0) ? -row : row; // mirror pad - srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; - - float4 value = 0.0f; - - int ix = x - radius; - int i = 0; - - while (i + 7 < width) { - for (int j = 0; j < 8; ++j) { // unrolled - int srcx = ix + j; - srcx = (srcx < 0) ? -srcx : srcx; - srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; - value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); - } - ix += 8; - i += 8; - } - - while (i < width) { - int srcx = (ix < 0) ? -ix : ix; // mirror pad - srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; - value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); - ++i; - ++ix; - } - pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; - row += get_local_size(1); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - const int px = get_local_id(0); - const int py = get_local_id(1); - const int prp = get_local_size(0); - float4 value = (float4)(0.0f); - - int i = 0; - while (i + 7 < width) { - for (int j = 0; j < 8; ++j) // unrolled - value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; - i += 8; - } - while (i < width) { - value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; - ++i; - } - - float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); - float4 diff = srcPixel - value; - - float quantumThreshold = QuantumRange*threshold; - - int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); - value = select(srcPixel + diff * gain, srcPixel, mask); - - if ((x < columns) && (y < rows)) - WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); - } - ) - - STRINGIFY( - __kernel __attribute__((reqd_work_group_size(64, 4, 1))) - void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, - const unsigned int number_channels,const unsigned int max_channels, - const float threshold,const int passes,const unsigned int imageWidth, - const unsigned int imageHeight) - { - const int pad = (1 << (passes - 1)); - const int tileSize = 64; - const int tileRowPixels = 64; - const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; - - CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) - - local float buffer[64 * 64]; - - int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); - int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; - - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + - (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; - - for (int channel = 0; channel < max_channels; ++channel) - stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; - } - - for (int channel = 0; channel < max_channels; ++channel) { - // Load LDS - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); - - // Process - - float tmp[16]; - float accum[16]; - float pixel; - - for (int i = 0; i < 16; i++) - accum[i]=0.0f; - - for (int pass = 0; pass < passes; ++pass) { - const int radius = 1 << pass; - const int x = get_local_id(0); - const float thresh = threshold * noise[pass]; - - // Apply horizontal hat - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - const int offset = i * tileRowPixels; - if (pass == 0) - tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass - pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); - barrier(CLK_LOCAL_MEM_FENCE); - buffer[x + offset] = pixel; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Apply vertical hat - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); - float delta = tmp[i / 4] - pixel; - tmp[i / 4] = pixel; // hold output in tmp until all workitems are done - if (delta < -thresh) - delta += thresh; - else if (delta > thresh) - delta -= thresh; - else - delta = 0; - accum[i / 4] += delta; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (pass < passes - 1) - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass - else // last pass - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output - barrier(CLK_LOCAL_MEM_FENCE); - } - - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); - - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Write from stage to output - - if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { - int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); - for (int channel = 0; channel < max_channels; ++channel) { - dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; - } - } - } - } - } - ) - - ; - -#endif // MAGICKCORE_OPENCL_SUPPORT - extern MagickPrivate Image *AccelerateAddNoiseImage(const Image*,const NoiseType,ExceptionInfo *), *AccelerateBlurImage(const Image *,const double,const double,ExceptionInfo *), diff --git a/MagickCore/accelerate.c b/MagickCore/accelerate.c index 507012867..92daf1e8b 100644 --- a/MagickCore/accelerate.c +++ b/MagickCore/accelerate.c @@ -41,6 +41,7 @@ Include declarations. */ #include "MagickCore/studio.h" #include "MagickCore/accelerate-private.h" +#include "MagickCore/accelerate-kernels-private.h" #include "MagickCore/artifact.h" #include "MagickCore/cache.h" #include "MagickCore/cache-private.h" -- 2.40.0