From: dirk Date: Wed, 13 Aug 2014 20:39:42 +0000 (+0000) Subject: OpenCL performance improvements. X-Git-Tag: 7.0.1-0~2101 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=859503ca8fec1bf5c406af69ddee90fa05043379;p=imagemagick OpenCL performance improvements. --- diff --git a/MagickCore/accelerate-private.h b/MagickCore/accelerate-private.h index 726bbd362..d5afd755f 100644 --- a/MagickCore/accelerate-private.h +++ b/MagickCore/accelerate-private.h @@ -1325,6 +1325,96 @@ const char* accelerateKernels = ) + STRINGIFY( + __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im, + __constant float *filter, + const unsigned int width, + const unsigned int imageColumns, const unsigned int imageRows, + __local float4 *pixels, + const float gain, const float threshold, const unsigned int justBlur) + { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const unsigned int radius = (width - 1) / 2; + + int row = y - radius; + int baseRow = get_group_id(1) * get_local_size(1) - radius; + int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; + + while (row < endRow) { + int srcy = (row < 0) ? -row : row; // mirror pad + srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy; + + float4 value = 0.0f; + + int ix = x - radius; + int i = 0; + + while (i + 7 < width) { + for (int j = 0; j < 8; ++j) { // unrolled + int srcx = ix + j; + srcx = (srcx < 0) ? -srcx : srcx; + srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx; + value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]); + } + ix += 8; + i += 8; + } + + while (i < width) { + int srcx = (ix < 0) ? -ix : ix; // mirror pad + srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx; + value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]); + ++i; + ++ix; + } + pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; + row += get_local_size(1); + } + + + barrier(CLK_LOCAL_MEM_FENCE); + + + const int px = get_local_id(0); + const int py = get_local_id(1); + const int prp = get_local_size(0); + float4 value = (float4)(0.0f); + + int i = 0; + while (i + 7 < width) { // unrolled + value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp]; + value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp]; + i += 8; + } + while (i < width) { + value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; + ++i; + } + + if (justBlur == 0) { // apply sharpening + float4 srcPixel = convert_float4(im[x + y * imageColumns]); + float4 diff = srcPixel - value; + + float quantumThreshold = QuantumRange*threshold; + + int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); + value = select(srcPixel + diff * gain, srcPixel, mask); + } + + if ((x < imageColumns) && (y < imageRows)) + filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3)); + } + ) + + STRINGIFY( @@ -2654,220 +2744,308 @@ const char* accelerateKernels = ) - STRINGIFY( - - typedef enum - { - UndefinedNoise, - UniformNoise, - GaussianNoise, - MultiplicativeGaussianNoise, - ImpulseNoise, - LaplacianNoise, - PoissonNoise, - RandomNoise - } NoiseType; - - typedef struct { - const global float* rns; - } RandomNumbers; - - - float ReadPseudoRandomValue(RandomNumbers* r) { - float v = *r->rns; - r->rns++; - return v; - } - ) - - OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) - OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f)) - OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) - OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) - OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) - OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) - OPENCL_DEFINE(SigmaRandom, (attenuate)) - OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) - - STRINGIFY( - float GenerateDifferentialNoise(RandomNumbers* r, CLQuantum pixel, NoiseType noise_type, float attenuate) { - - float - alpha, - beta, - noise, - sigma; - - noise = 0.0f; - alpha=ReadPseudoRandomValue(r); - switch(noise_type) { - case UniformNoise: - default: - { - noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); - break; - } - case GaussianNoise: - { - float - gamma, - tau; - - if (alpha == 0.0f) - alpha=1.0f; - beta=ReadPseudoRandomValue(r); - gamma=sqrt(-2.0f*log(alpha)); - sigma=gamma*cospi((2.0f*beta)); - tau=gamma*sinpi((2.0f*beta)); - noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+ - QuantumRange*TauGaussian*tau); - break; - } +OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) +OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f)) +OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) +OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) +OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) +OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) +OPENCL_DEFINE(SigmaRandom, (attenuate)) +OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) +STRINGIFY( - case ImpulseNoise: - { - if (alpha < (SigmaImpulse/2.0f)) - noise=0.0f; - else - if (alpha >= (1.0f-(SigmaImpulse/2.0f))) - noise=(float)QuantumRange; - else - noise=(float)pixel; - break; - } - case LaplacianNoise: - { - if (alpha <= 0.5f) - { - if (alpha <= MagickEpsilon) - noise=(float) (pixel-QuantumRange); - else - noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ - 0.5f); - break; - } - beta=1.0f-alpha; - if (beta <= (0.5f*MagickEpsilon)) - noise=(float) (pixel+QuantumRange); - else - noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); - break; - } - case MultiplicativeGaussianNoise: - { - sigma=1.0f; - if (alpha > MagickEpsilon) - sigma=sqrt(-2.0f*log(alpha)); - beta=ReadPseudoRandomValue(r); - noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma* - cospi((float) (2.0f*beta))/2.0f); - break; - } - case PoissonNoise: - { - float - poisson; - unsigned int i; - poisson=exp(-SigmaPoisson*QuantumScale*pixel); - for (i=0; alpha > poisson; i++) - { - beta=ReadPseudoRandomValue(r); - alpha*=beta; - } - noise=(float) (QuantumRange*i/SigmaPoisson); - break; - } - case RandomNoise: - { - noise=(float) (QuantumRange*SigmaRandom*alpha); - break; - } - - }; - return noise; - } +/* +Part of MWC64X by David Thomas, dt10@imperial.ac.uk +This is provided under BSD, full license is with the main package. +See http://www.doc.ic.ac.uk/~dt10/research +*/ + +// Pre: a=M) || (v=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. + v=v-M; + return v; +} - __kernel - void AddNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage - ,const unsigned int inputColumns, const unsigned int inputRows - ,const ChannelType channel - ,const NoiseType noise_type, const float attenuate - ,const __global float* randomNumbers, const unsigned int numRandomNumbersPerPixel - ,const unsigned int rowOffset) { +// Pre: a>1; + } + return r; +} - unsigned int x = get_global_id(0); - unsigned int y = get_global_id(1) + rowOffset; - RandomNumbers r; - r.rns = randomNumbers + (get_global_id(1) * inputColumns + get_global_id(0))*numRandomNumbersPerPixel; - CLPixelType p = inputImage[y*inputColumns+x]; - CLPixelType q = filteredImage[y*inputColumns+x]; +// Pre: a=0 +// Post: r=(a^b) mod M +// This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on +// most architectures +ulong MWC_PowMod64(ulong a, ulong e, ulong M) +{ + ulong sqr=a, acc=1; + while(e!=0){ + if(e&1) + acc=MWC_MulMod64(acc,sqr,M); + sqr=MWC_MulMod64(sqr,sqr,M); + e=e>>1; + } + return acc; +} - if ((channel&RedChannel)!=0) { - setRed(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getRed(p),noise_type,attenuate))); - } - - if ((channel&GreenChannel)!=0) { - setGreen(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getGreen(p),noise_type,attenuate))); - } +uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) +{ + ulong m=MWC_PowMod64(A, distance, M); + ulong x=curr.x*(ulong)A+curr.y; + x=MWC_MulMod64(x, m, M); + return (uint2)((uint)(x/A), (uint)(x%A)); +} - if ((channel&BlueChannel)!=0) { - setBlue(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getBlue(p),noise_type,attenuate))); - } +uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) +{ + // This is an arbitrary constant for starting LCG jumping from. I didn't + // want to start from 1, as then you end up with the two or three first values + // being a bit poor in ones - once you've decided that, one constant is as + // good as any another. There is no deep mathematical reason for it, I just + // generated a random number. + enum{ MWC_BASEID = 4077358422479273989UL }; + + ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; + ulong m=MWC_PowMod64(A, dist, M); + + ulong x=MWC_MulMod64(MWC_BASEID, m, M); + return (uint2)((uint)(x/A), (uint)(x%A)); +} + +//! Represents the state of a particular generator +typedef struct{ uint x; uint c; } mwc64x_state_t; + +enum{ MWC64X_A = 4294883355U }; +enum{ MWC64X_M = 18446383549859758079UL }; + +void MWC64X_Step(mwc64x_state_t *s) +{ + uint X=s->x, C=s->c; + + uint Xn=MWC64X_A*X+C; + uint carry=(uint)(Xnx=Xn; + s->c=Cn; +} - if ((channel & OpacityChannel) != 0) { - setOpacity(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getOpacity(p),noise_type,attenuate))); - } +void MWC64X_Skip(mwc64x_state_t *s, ulong distance) +{ + uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); + s->x=tmp.x; + s->c=tmp.y; +} - filteredImage[y*inputColumns+x] = q; - } +void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) +{ + uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); + s->x=tmp.x; + s->c=tmp.y; +} +//! Return a 32-bit integer in the range [0..2^32) +uint MWC64X_NextUint(mwc64x_state_t *s) +{ + uint res=s->x ^ s->c; + MWC64X_Step(s); + return res; +} + +// +// End of MWC64X excerpt +// + + + typedef enum + { + UndefinedNoise, + UniformNoise, + GaussianNoise, + MultiplicativeGaussianNoise, + ImpulseNoise, + LaplacianNoise, + PoissonNoise, + RandomNoise + } NoiseType; + + + float mwcReadPseudoRandomValue(mwc64x_state_t* rng) { + return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 + } + + + float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) { + + float + alpha, + beta, + noise, + sigma; + + noise = 0.0f; + alpha=mwcReadPseudoRandomValue(r); + switch(noise_type) { + case UniformNoise: + default: + { + noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); + break; + } + case GaussianNoise: + { + float + gamma, + tau; + + if (alpha == 0.0f) + alpha=1.0f; + beta=mwcReadPseudoRandomValue(r); + gamma=sqrt(-2.0f*log(alpha)); + sigma=gamma*cospi((2.0f*beta)); + tau=gamma*sinpi((2.0f*beta)); + noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+ + QuantumRange*TauGaussian*tau); + break; + } + + + case ImpulseNoise: + { + if (alpha < (SigmaImpulse/2.0f)) + noise=0.0f; + else + if (alpha >= (1.0f-(SigmaImpulse/2.0f))) + noise=(float)QuantumRange; + else + noise=(float)pixel; + break; + } + case LaplacianNoise: + { + if (alpha <= 0.5f) + { + if (alpha <= MagickEpsilon) + noise=(float) (pixel-QuantumRange); + else + noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ + 0.5f); + break; + } + beta=1.0f-alpha; + if (beta <= (0.5f*MagickEpsilon)) + noise=(float) (pixel+QuantumRange); + else + noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); + break; + } + case MultiplicativeGaussianNoise: + { + sigma=1.0f; + if (alpha > MagickEpsilon) + sigma=sqrt(-2.0f*log(alpha)); + beta=mwcReadPseudoRandomValue(r); + noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma* + cospi((float) (2.0f*beta))/2.0f); + break; + } + case PoissonNoise: + { + float + poisson; + unsigned int i; + poisson=exp(-SigmaPoisson*QuantumScale*pixel); + for (i=0; alpha > poisson; i++) + { + beta=mwcReadPseudoRandomValue(r); + alpha*=beta; + } + noise=(float) (QuantumRange*i/SigmaPoisson); + break; + } + case RandomNoise: + { + noise=(float) (QuantumRange*SigmaRandom*alpha); + break; + } + + }; + return noise; + } + + + + + + __kernel + void GenerateNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage + ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem + ,const ChannelType channel + ,const NoiseType noise_type, const float attenuate + ,const unsigned int seed0, const unsigned int seed1 + ,const unsigned int numRandomNumbersPerPixel) { + + mwc64x_state_t rng; + rng.x = seed0; + rng.c = seed1; + + uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use + uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); + + MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams + + uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process + + uint count = pixelsPerWorkItem; + + while (count > 0) { + if (pos < inputPixelCount) { + CLPixelType p = inputImage[pos]; + + if ((channel&RedChannel)!=0) { + setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate))); + } + + if ((channel&GreenChannel)!=0) { + setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate))); + } + + if ((channel&BlueChannel)!=0) { + setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate))); + } + + if ((channel & OpacityChannel) != 0) { + setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate))); + } + + filteredImage[pos] = p; + //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255); + } + pos += get_local_size(0); + --count; + } + } ) - STRINGIFY( - __kernel - void RandomImage(__global CLPixelType* inputImage, - const uint imageColumns, const uint imageRows, - __global uint* seeds, - const float randNormNumerator, - const uint randNormDenominator) { - - unsigned int numGenerators = get_global_size(0); - unsigned numRandPixelsPerWorkItem = ((imageColumns*imageRows) + (numGenerators-1)) - / numGenerators; - - uint4 s; - s.x = seeds[get_global_id(0)*4]; - s.y = seeds[get_global_id(0)*4+1]; - s.z = seeds[get_global_id(0)*4+2]; - s.w = seeds[get_global_id(0)*4+3]; - - unsigned int offset = get_group_id(0) * get_local_size(0) * numRandPixelsPerWorkItem; - for (unsigned int n = 0; n < numRandPixelsPerWorkItem; n++) - { - int i = offset + n*get_local_size(0) + get_local_id(0); - if (i >= imageColumns*imageRows) - break; - - float rand = GetPseudoRandomValue(&s,randNormNumerator/randNormDenominator); - CLQuantum v = ClampToQuantum(QuantumRange*rand); - - CLPixelType p; - setRed(&p,v); - setGreen(&p,v); - setBlue(&p,v); - setOpacity(&p,0); - - inputImage[i] = p; - } - - seeds[get_global_id(0)*4] = s.x; - seeds[get_global_id(0)*4+1] = s.y; - seeds[get_global_id(0)*4+2] = s.z; - seeds[get_global_id(0)*4+3] = s.w; - } - ) STRINGIFY( __kernel diff --git a/MagickCore/accelerate.c b/MagickCore/accelerate.c index 246386278..2d4ac3587 100644 --- a/MagickCore/accelerate.c +++ b/MagickCore/accelerate.c @@ -1535,6 +1535,17 @@ cleanup: return filteredImage; } +static Image *ComputeUnsharpMaskImageSingle(const Image *image, + const ChannelType channel,const double radius,const double sigma, + const double gain,const double threshold,int blurOnly, ExceptionInfo *exception); + +static Image* ComputeBlurImageSingle(const Image* image, + const ChannelType channel,const double radius,const double sigma, + ExceptionInfo *exception) +{ + return ComputeUnsharpMaskImageSingle(image, channel, radius, sigma, 0.0, 0.0, 1, exception); +} + MagickExport Image* AccelerateBlurImage(const Image *image, const ChannelType channel,const double radius,const double sigma, ExceptionInfo *exception) @@ -1549,7 +1560,9 @@ MagickExport Image* AccelerateBlurImage(const Image *image, (checkAccelerateCondition(image, channel) == MagickFalse)) return NULL; - if (splitImage(image) && (image->rows / 2 > radius)) + if (radius < 12.1) + filteredImage=ComputeBlurImageSingle(image, channel, radius, sigma, exception); + else if (splitImage(image) && (image->rows / 2 > radius)) filteredImage=ComputeBlurImageSection(image, channel, radius, sigma, exception); else filteredImage=ComputeBlurImage(image, channel, radius, sigma, exception); @@ -2688,284 +2701,365 @@ cleanup: return filteredImage; } -MagickExport Image *AccelerateUnsharpMaskImage(const Image *image, +static Image *ComputeUnsharpMaskImageSingle(const Image *image, const ChannelType channel,const double radius,const double sigma, - const double gain,const double threshold,ExceptionInfo *exception) + const double gain,const double threshold,int blurOnly, ExceptionInfo *exception) { - Image - *filteredImage; + CacheView + *filteredImage_view, + *image_view; - assert(image != NULL); - assert(exception != (ExceptionInfo *) NULL); + char + geometry[MaxTextExtent]; - if ((checkOpenCLEnvironment(exception) == MagickFalse) || - (checkAccelerateCondition(image, channel) == MagickFalse)) - return NULL; + cl_command_queue + queue; - if (splitImage(image) && (image->rows / 2 > radius)) - filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception); - else - filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception); - return(filteredImage); -} + cl_context + context; -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% A c c e l e r a t e R e s i z e I m a g e % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -% AccelerateResizeImage() is an OpenCL implementation of ResizeImage() -% -% AccelerateResizeImage() scales an image to the desired dimensions, using the given -% filter (see AcquireFilterInfo()). -% -% If an undefined filter is given the filter defaults to Mitchell for a -% colormapped image, a image with a matte channel, or if the image is -% enlarged. Otherwise the filter defaults to a Lanczos. -% -% AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program. -% -% The format of the AccelerateResizeImage method is: -% -% Image *ResizeImage(Image *image,const size_t columns, -% const size_t rows, const ResizeFilter* filter, -% ExceptionInfo *exception) -% -% A description of each parameter follows: -% -% o image: the image. -% -% o columns: the number of columns in the scaled image. -% -% o rows: the number of rows in the scaled image. -% -% o filter: Image filter to use. -% -% o exception: return any errors or warnings in this structure. -% -*/ + cl_int + justBlur, + clStatus; -static MagickBooleanType resizeHorizontalFilter(cl_mem image, - const unsigned int imageColumns,const unsigned int imageRows, - const unsigned int matte,cl_mem resizedImage, - const unsigned int resizedColumns,const unsigned int resizedRows, - const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients, - const float xFactor,MagickCLEnv clEnv,cl_command_queue queue, - ExceptionInfo *exception) -{ cl_kernel - horizontalKernel; + unsharpMaskKernel; - cl_int clStatus; + cl_mem + filteredImageBuffer, + imageBuffer, + imageKernelBuffer; - const unsigned int - workgroupSize = 256; + cl_mem_flags + mem_flags; + + const void + *inputPixels; float - resizeFilterScale, - resizeFilterSupport, - resizeFilterWindowSupport, - resizeFilterBlur, - scale, - support; + fGain, + fThreshold, + *kernelBufferPtr; - int - cacheRangeStart, - cacheRangeEnd, - numCachedPixels, - resizeFilterType, - resizeWindowType; + Image + *filteredImage; + + KernelInfo + *kernel; MagickBooleanType - status = MagickFalse; + outputReady; - size_t - deviceLocalMemorySize, - gammaAccumulatorLocalMemorySize, - global_work_size[2], - imageCacheLocalMemorySize, - pixelAccumulatorLocalMemorySize, - local_work_size[2], - totalLocalMemorySize, - weightAccumulatorLocalMemorySize; + MagickCLEnv + clEnv; - unsigned int - chunkSize, - i, - pixelPerWorkgroup; + MagickSizeType + length; - horizontalKernel = NULL; - status = MagickFalse; + void + *filteredPixels, + *hostPtr; - /* - Apply filter to resize vertically from image to resize image. - */ - scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0); - support=scale*GetResizeFilterSupport(resizeFilter); - if (support < 0.5) - { - /* - Support too small even for nearest neighbour: Reduce to point - sampling. - */ - support=(MagickRealType) 0.5; - scale=1.0; - } - scale=PerceptibleReciprocal(scale); + unsigned int + i, + imageColumns, + imageRows, + kernelWidth; - if (resizedColumns < workgroupSize) - { - chunkSize = 32; - pixelPerWorkgroup = 32; - } - else - { - chunkSize = workgroupSize; - pixelPerWorkgroup = workgroupSize; - } + clEnv = NULL; + filteredImage = NULL; + filteredImage_view = NULL; + kernel = NULL; + context = NULL; + imageBuffer = NULL; + filteredImageBuffer = NULL; + imageKernelBuffer = NULL; + unsharpMaskKernel = NULL; + queue = NULL; + outputReady = MagickFalse; - /* get the local memory size supported by the device */ - deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv); + clEnv = GetDefaultOpenCLEnv(); + context = GetOpenCLContext(clEnv); + queue = AcquireOpenCLCommandQueue(clEnv); -DisableMSCWarning(4127) - while(1) -RestoreMSCWarning + /* Create and initialize OpenCL buffers. */ { - /* calculate the local memory size needed per workgroup */ - cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5); - cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5); - numCachedPixels = cacheRangeEnd - cacheRangeStart + 1; - imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket); - totalLocalMemorySize = imageCacheLocalMemorySize; - - /* local size for the pixel accumulator */ - pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4); - totalLocalMemorySize+=pixelAccumulatorLocalMemorySize; - - /* local memory size for the weight accumulator */ - weightAccumulatorLocalMemorySize = chunkSize * sizeof(float); - totalLocalMemorySize+=weightAccumulatorLocalMemorySize; - - /* local memory size for the gamma accumulator */ - if (matte == 0) - gammaAccumulatorLocalMemorySize = sizeof(float); - else - gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float); - totalLocalMemorySize+=gammaAccumulatorLocalMemorySize; + image_view=AcquireVirtualCacheView(image,exception); + inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception); + if (inputPixels == (const void *) NULL) + { + (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename); + goto cleanup; + } - if (totalLocalMemorySize <= deviceLocalMemorySize) - break; - else + /* If the host pointer is aligned to the size of CLPixelPacket, + then use the host buffer directly from the GPU; otherwise, + create a buffer on the GPU and copy the data over */ + if (ALIGNED(inputPixels,CLPixelPacket)) { - pixelPerWorkgroup = pixelPerWorkgroup/2; - chunkSize = chunkSize/2; - if (pixelPerWorkgroup == 0 - || chunkSize == 0) - { - /* quit, fallback to CPU */ - goto cleanup; - } + mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR; + } + else + { + mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR; + } + /* create a CL buffer from image pixel buffer */ + length = image->columns * image->rows; + imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); + goto cleanup; } } - resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter); - resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter); - - - if (resizeFilterType == SincFastWeightingFunction - && resizeWindowType == SincFastWeightingFunction) - { - horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc"); - } - else + /* create output */ { - horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter"); - } - if (horizontalKernel == NULL) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); - goto cleanup; + filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception); + assert(filteredImage != NULL); + if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", "."); + goto cleanup; + } + filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception); + filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception); + if (filteredPixels == (void *) NULL) + { + (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename); + goto cleanup; + } + + if (ALIGNED(filteredPixels,CLPixelPacket)) + { + mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR; + hostPtr = filteredPixels; + } + else + { + mem_flags = CL_MEM_WRITE_ONLY; + hostPtr = NULL; + } + + /* create a CL buffer from image pixel buffer */ + length = image->columns * image->rows; + filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); + goto cleanup; + } } - i = 0; - clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage); + /* create the blur kernel */ + { + (void) FormatLocaleString(geometry,MaxTextExtent,"blur:%.20gx%.20g;blur:%.20gx%.20g+90",radius,sigma,radius,sigma); + kernel=AcquireKernelInfo(geometry); + if (kernel == (KernelInfo *) NULL) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireKernelInfo failed.","."); + goto cleanup; + } - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows); + imageKernelBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY, kernel->width * sizeof(float), NULL, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); + goto cleanup; + } - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients); - resizeFilterScale = (float) GetResizeFilterScale(resizeFilter); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale); + kernelBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, imageKernelBuffer, CL_TRUE, CL_MAP_WRITE, 0, kernel->width * sizeof(float), 0, NULL, NULL, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.","."); + goto cleanup; + } + for (i = 0; i < kernel->width; i++) + { + kernelBufferPtr[i] = (float) kernel->values[i]; + } + clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, imageKernelBuffer, kernelBufferPtr, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.", "'%s'", "."); + goto cleanup; + } + } - resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport); + { + /* get the opencl kernel */ + { + unsharpMaskKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "UnsharpMask"); + if (unsharpMaskKernel == NULL) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); + goto cleanup; + }; + } - resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport); + { + imageColumns = image->columns; + imageRows = image->rows; + kernelWidth = kernel->width; + fGain = (float)gain; + fThreshold = (float)threshold; + justBlur = blurOnly; - resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur); + /* set the kernel arguments */ + i = 0; + clStatus=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageBuffer); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&filteredImageBuffer); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_mem),(void *)&imageKernelBuffer); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&kernelWidth); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageColumns); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(unsigned int),(void *)&imageRows); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_float4)*(8 * (32 + kernel->width)),(void *)NULL); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fGain); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(float),(void *)&fThreshold); + clStatus|=clEnv->library->clSetKernelArg(unsharpMaskKernel,i++,sizeof(cl_uint),(void *)&justBlur); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } + } + /* launch the kernel */ + { + size_t gsize[2]; + size_t wsize[2]; - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize); - + gsize[0] = ((image->columns + 7) / 8) * 8; + gsize[1] = ((image->rows + 31) / 32) * 32; + wsize[0] = 8; + wsize[1] = 32; - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL); - clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL); + clStatus = clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, unsharpMaskKernel, 2, NULL, gsize, wsize, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + clEnv->library->clFlush(queue); + } + } - if (clStatus != CL_SUCCESS) + /* get result */ + if (ALIGNED(filteredPixels,CLPixelPacket)) { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; + length = image->columns * image->rows; + clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); + } + else + { + length = image->columns * image->rows; + clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL); } - - global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize; - global_work_size[1] = resizedRows; - - local_work_size[0] = workgroupSize; - local_work_size[1] = 1; - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (clStatus != CL_SUCCESS) { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", "."); goto cleanup; } - clEnv->library->clFlush(queue); - status = MagickTrue; + outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception); cleanup: OpenCLLogException(__FUNCTION__,__LINE__,exception); - if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel); + image_view=DestroyCacheView(image_view); + if (filteredImage_view != NULL) + filteredImage_view=DestroyCacheView(filteredImage_view); - return(status); + if (kernel != NULL) kernel=DestroyKernelInfo(kernel); + if (imageBuffer!=NULL) clEnv->library->clReleaseMemObject(imageBuffer); + if (filteredImageBuffer!=NULL) clEnv->library->clReleaseMemObject(filteredImageBuffer); + if (imageKernelBuffer!=NULL) clEnv->library->clReleaseMemObject(imageKernelBuffer); + if (unsharpMaskKernel!=NULL) RelinquishOpenCLKernel(clEnv, unsharpMaskKernel); + if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue); + if (outputReady == MagickFalse) + { + if (filteredImage != NULL) + { + DestroyImage(filteredImage); + filteredImage = NULL; + } + } + return(filteredImage); } -static MagickBooleanType resizeVerticalFilter(cl_mem image, + +MagickExport Image *AccelerateUnsharpMaskImage(const Image *image, + const ChannelType channel,const double radius,const double sigma, + const double gain,const double threshold,ExceptionInfo *exception) +{ + Image + *filteredImage; + + assert(image != NULL); + assert(exception != (ExceptionInfo *) NULL); + + if ((checkOpenCLEnvironment(exception) == MagickFalse) || + (checkAccelerateCondition(image, channel) == MagickFalse)) + return NULL; + + if (radius < 12.1) + filteredImage = ComputeUnsharpMaskImageSingle(image,channel,radius,sigma,gain,threshold, 0, exception); + else if (splitImage(image) && (image->rows / 2 > radius)) + filteredImage = ComputeUnsharpMaskImageSection(image,channel,radius,sigma,gain,threshold,exception); + else + filteredImage = ComputeUnsharpMaskImage(image,channel,radius,sigma,gain,threshold,exception); + return(filteredImage); +} + +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% A c c e l e r a t e R e s i z e I m a g e % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% AccelerateResizeImage() is an OpenCL implementation of ResizeImage() +% +% AccelerateResizeImage() scales an image to the desired dimensions, using the given +% filter (see AcquireFilterInfo()). +% +% If an undefined filter is given the filter defaults to Mitchell for a +% colormapped image, a image with a matte channel, or if the image is +% enlarged. Otherwise the filter defaults to a Lanczos. +% +% AccelerateResizeImage() was inspired by Paul Heckbert's "zoom" program. +% +% The format of the AccelerateResizeImage method is: +% +% Image *ResizeImage(Image *image,const size_t columns, +% const size_t rows, const ResizeFilter* filter, +% ExceptionInfo *exception) +% +% A description of each parameter follows: +% +% o image: the image. +% +% o columns: the number of columns in the scaled image. +% +% o rows: the number of rows in the scaled image. +% +% o filter: Image filter to use. +% +% o exception: return any errors or warnings in this structure. +% +*/ + +static MagickBooleanType resizeHorizontalFilter(cl_mem image, const unsigned int imageColumns,const unsigned int imageRows, const unsigned int matte,cl_mem resizedImage, const unsigned int resizedColumns,const unsigned int resizedRows, const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients, - const float yFactor,MagickCLEnv clEnv,cl_command_queue queue, + const float xFactor,MagickCLEnv clEnv,cl_command_queue queue, ExceptionInfo *exception) { cl_kernel @@ -3015,7 +3109,7 @@ static MagickBooleanType resizeVerticalFilter(cl_mem image, /* Apply filter to resize vertically from image to resize image. */ - scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0); + scale=MAGICK_MAX(1.0/xFactor+MagickEpsilon,1.0); support=scale*GetResizeFilterSupport(resizeFilter); if (support < 0.5) { @@ -3028,7 +3122,7 @@ static MagickBooleanType resizeVerticalFilter(cl_mem image, } scale=PerceptibleReciprocal(scale); - if (resizedRows < workgroupSize) + if (resizedColumns < workgroupSize) { chunkSize = 32; pixelPerWorkgroup = 32; @@ -3047,8 +3141,8 @@ DisableMSCWarning(4127) RestoreMSCWarning { /* calculate the local memory size needed per workgroup */ - cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5); - cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5); + cacheRangeStart = (int) (((0 + 0.5)/xFactor+MagickEpsilon)-support+0.5); + cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/xFactor+MagickEpsilon)+support+0.5); numCachedPixels = cacheRangeEnd - cacheRangeStart + 1; imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket); totalLocalMemorySize = imageCacheLocalMemorySize; @@ -3086,13 +3180,222 @@ RestoreMSCWarning resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter); resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter); + if (resizeFilterType == SincFastWeightingFunction && resizeWindowType == SincFastWeightingFunction) - horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc"); - else - horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter"); - - if (horizontalKernel == NULL) + { + horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilterSinc"); + } + else + { + horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeHorizontalFilter"); + } + if (horizontalKernel == NULL) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); + goto cleanup; + } + + i = 0; + clStatus = clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&image); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageColumns); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&imageRows); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&matte); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&xFactor); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizedImage); + + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedColumns); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), (void*)&resizedRows); + + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeFilterType); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), (void*)&resizeWindowType); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(cl_mem), (void*)&resizeFilterCubicCoefficients); + + resizeFilterScale = (float) GetResizeFilterScale(resizeFilter); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterScale); + + resizeFilterSupport = (float) GetResizeFilterSupport(resizeFilter); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterSupport); + + resizeFilterWindowSupport = (float) GetResizeFilterWindowSupport(resizeFilter); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterWindowSupport); + + resizeFilterBlur = (float) GetResizeFilterBlur(resizeFilter); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(float), (void*)&resizeFilterBlur); + + + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, imageCacheLocalMemorySize, NULL); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(int), &numCachedPixels); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &pixelPerWorkgroup); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, sizeof(unsigned int), &chunkSize); + + + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, pixelAccumulatorLocalMemorySize, NULL); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, weightAccumulatorLocalMemorySize, NULL); + clStatus |= clEnv->library->clSetKernelArg(horizontalKernel, i++, gammaAccumulatorLocalMemorySize, NULL); + + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } + + global_work_size[0] = (resizedColumns+pixelPerWorkgroup-1)/pixelPerWorkgroup*workgroupSize; + global_work_size[1] = resizedRows; + + local_work_size[0] = workgroupSize; + local_work_size[1] = 1; + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, horizontalKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + clEnv->library->clFlush(queue); + status = MagickTrue; + + +cleanup: + OpenCLLogException(__FUNCTION__,__LINE__,exception); + + if (horizontalKernel != NULL) RelinquishOpenCLKernel(clEnv, horizontalKernel); + + return(status); +} + +static MagickBooleanType resizeVerticalFilter(cl_mem image, + const unsigned int imageColumns,const unsigned int imageRows, + const unsigned int matte,cl_mem resizedImage, + const unsigned int resizedColumns,const unsigned int resizedRows, + const ResizeFilter *resizeFilter,cl_mem resizeFilterCubicCoefficients, + const float yFactor,MagickCLEnv clEnv,cl_command_queue queue, + ExceptionInfo *exception) +{ + cl_kernel + horizontalKernel; + + cl_int clStatus; + + const unsigned int + workgroupSize = 256; + + float + resizeFilterScale, + resizeFilterSupport, + resizeFilterWindowSupport, + resizeFilterBlur, + scale, + support; + + int + cacheRangeStart, + cacheRangeEnd, + numCachedPixels, + resizeFilterType, + resizeWindowType; + + MagickBooleanType + status = MagickFalse; + + size_t + deviceLocalMemorySize, + gammaAccumulatorLocalMemorySize, + global_work_size[2], + imageCacheLocalMemorySize, + pixelAccumulatorLocalMemorySize, + local_work_size[2], + totalLocalMemorySize, + weightAccumulatorLocalMemorySize; + + unsigned int + chunkSize, + i, + pixelPerWorkgroup; + + horizontalKernel = NULL; + status = MagickFalse; + + /* + Apply filter to resize vertically from image to resize image. + */ + scale=MAGICK_MAX(1.0/yFactor+MagickEpsilon,1.0); + support=scale*GetResizeFilterSupport(resizeFilter); + if (support < 0.5) + { + /* + Support too small even for nearest neighbour: Reduce to point + sampling. + */ + support=(MagickRealType) 0.5; + scale=1.0; + } + scale=PerceptibleReciprocal(scale); + + if (resizedRows < workgroupSize) + { + chunkSize = 32; + pixelPerWorkgroup = 32; + } + else + { + chunkSize = workgroupSize; + pixelPerWorkgroup = workgroupSize; + } + + /* get the local memory size supported by the device */ + deviceLocalMemorySize = GetOpenCLDeviceLocalMemorySize(clEnv); + +DisableMSCWarning(4127) + while(1) +RestoreMSCWarning + { + /* calculate the local memory size needed per workgroup */ + cacheRangeStart = (int) (((0 + 0.5)/yFactor+MagickEpsilon)-support+0.5); + cacheRangeEnd = (int) ((((pixelPerWorkgroup-1) + 0.5)/yFactor+MagickEpsilon)+support+0.5); + numCachedPixels = cacheRangeEnd - cacheRangeStart + 1; + imageCacheLocalMemorySize = numCachedPixels * sizeof(CLPixelPacket); + totalLocalMemorySize = imageCacheLocalMemorySize; + + /* local size for the pixel accumulator */ + pixelAccumulatorLocalMemorySize = chunkSize * sizeof(cl_float4); + totalLocalMemorySize+=pixelAccumulatorLocalMemorySize; + + /* local memory size for the weight accumulator */ + weightAccumulatorLocalMemorySize = chunkSize * sizeof(float); + totalLocalMemorySize+=weightAccumulatorLocalMemorySize; + + /* local memory size for the gamma accumulator */ + if (matte == 0) + gammaAccumulatorLocalMemorySize = sizeof(float); + else + gammaAccumulatorLocalMemorySize = chunkSize * sizeof(float); + totalLocalMemorySize+=gammaAccumulatorLocalMemorySize; + + if (totalLocalMemorySize <= deviceLocalMemorySize) + break; + else + { + pixelPerWorkgroup = pixelPerWorkgroup/2; + chunkSize = chunkSize/2; + if (pixelPerWorkgroup == 0 + || chunkSize == 0) + { + /* quit, fallback to CPU */ + goto cleanup; + } + } + } + + resizeFilterType = (int)GetResizeFilterWeightingType(resizeFilter); + resizeWindowType = (int)GetResizeFilterWindowWeightingType(resizeFilter); + + if (resizeFilterType == SincFastWeightingFunction + && resizeWindowType == SincFastWeightingFunction) + horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilterSinc"); + else + horizontalKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "ResizeVerticalFilter"); + + if (horizontalKernel == NULL) { (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); goto cleanup; @@ -5343,468 +5646,62 @@ MagickExport MagickBooleanType ComputeContrastStretchImageChannel(Image *image, } else { - mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR; - hostPtr = stretch_map; - } - /* create a CL buffer for stretch_map */ - length = (MaxMap+1); - stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); - goto cleanup; - } - - /* get the OpenCL kernel */ - stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Stretch"); - if (stretchKernel == NULL) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); - goto cleanup; - } - - /* set the kernel arguments */ - i = 0; - clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer); - clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel); - clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer); - clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white); - clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - - /* launch the kernel */ - global_work_size[0] = image->columns; - global_work_size[1] = image->rows; - - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - clEnv->library->clFlush(queue); - - /* read the data back */ - if (ALIGNED(inputPixels,CLPixelPacket)) - { - length = image->columns * image->rows; - clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); - } - else - { - length = image->columns * image->rows; - clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL); - } - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", "."); - goto cleanup; - } - - outputReady=SyncCacheViewAuthenticPixels(image_view,exception); - -cleanup: - OpenCLLogException(__FUNCTION__,__LINE__,exception); - - image_view=DestroyCacheView(image_view); - - if (imageBuffer!=NULL) - clEnv->library->clReleaseMemObject(imageBuffer); - - if (stretchMapBuffer!=NULL) - clEnv->library->clReleaseMemObject(stretchMapBuffer); - if (stretch_map!=NULL) - stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map); - - - if (histogramBuffer!=NULL) - clEnv->library->clReleaseMemObject(histogramBuffer); - if (histogram!=NULL) - histogram=(cl_uint4 *) RelinquishMagickMemory(histogram); - - - if (histogramKernel!=NULL) - RelinquishOpenCLKernel(clEnv, histogramKernel); - if (stretchKernel!=NULL) - RelinquishOpenCLKernel(clEnv, stretchKernel); - - if (queue != NULL) - RelinquishOpenCLCommandQueue(clEnv, queue); - - return(outputReady); -} - -MagickExport MagickBooleanType AccelerateContrastStretchImageChannel( - Image *image,const ChannelType channel,const double black_point, - const double white_point,ExceptionInfo *exception) -{ - MagickBooleanType - status; - - assert(image != NULL); - assert(exception != (ExceptionInfo *) NULL); - - if ((checkOpenCLEnvironment(exception) == MagickFalse) || - (checkAccelerateCondition(image, channel) == MagickFalse) || - (checkHistogramCondition(image, channel) == MagickFalse)) - return(MagickFalse); - - status=ComputeContrastStretchImageChannel(image,channel, black_point, white_point, exception); - return(status); -} - -/* -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% % -% % -% % -% D e s p e c k l e I m a g e w i t h O p e n C L % -% % -% % -% % -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% -% DespeckleImage() reduces the speckle noise in an image while perserving the -% edges of the original image. A speckle removing filter uses a complementary -% hulling technique (raising pixels that are darker than their surrounding -% neighbors, then complementarily lowering pixels that are brighter than their -% surrounding neighbors) to reduce the speckle index of that image (reference -% Crimmins speckle removal). -% -% The format of the DespeckleImage method is: -% -% Image *DespeckleImage(const Image *image,ExceptionInfo *exception) -% -% A description of each parameter follows: -% -% o image: the image. -% -% o exception: return any errors or warnings in this structure. -% -*/ - -static Image *ComputeDespeckleImage(const Image *image, - ExceptionInfo*exception) -{ - static const int - X[4] = {0, 1, 1,-1}, - Y[4] = {1, 0, 1, 1}; - - CacheView - *filteredImage_view, - *image_view; - - cl_command_queue - queue; - - cl_context - context; - - cl_int - clStatus; - - cl_kernel - hullPass1, - hullPass2; - - cl_mem_flags - mem_flags; - - cl_mem - filteredImageBuffer, - imageBuffer, - tempImageBuffer[2]; - - const void - *inputPixels; - - Image - *filteredImage; - - int - k, - matte; - - MagickBooleanType - outputReady; - - MagickCLEnv - clEnv; - - MagickSizeType - length; - - size_t - global_work_size[2]; - - unsigned int - imageHeight, - imageWidth; - - void - *filteredPixels, - *hostPtr; - - outputReady = MagickFalse; - clEnv = NULL; - inputPixels = NULL; - filteredImage = NULL; - filteredImage_view = NULL; - filteredPixels = NULL; - context = NULL; - imageBuffer = NULL; - filteredImageBuffer = NULL; - hullPass1 = NULL; - hullPass2 = NULL; - queue = NULL; - tempImageBuffer[0] = tempImageBuffer[1] = NULL; - clEnv = GetDefaultOpenCLEnv(); - context = GetOpenCLContext(clEnv); - queue = AcquireOpenCLCommandQueue(clEnv); - - image_view=AcquireVirtualCacheView(image,exception); - inputPixels=GetCacheViewVirtualPixels(image_view,0,0,image->columns,image->rows,exception); - if (inputPixels == (void *) NULL) - { - (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning,"UnableToReadPixelCache.","`%s'",image->filename); - goto cleanup; - } - - if (ALIGNED(inputPixels,CLPixelPacket)) - { - mem_flags = CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR; - } - else - { - mem_flags = CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR; - } - /* create a CL buffer from image pixel buffer */ - length = image->columns * image->rows; - imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); - goto cleanup; - } - - mem_flags = CL_MEM_READ_WRITE; - length = image->columns * image->rows; - for (k = 0; k < 2; k++) - { - tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); - goto cleanup; - } - } - - filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception); - assert(filteredImage != NULL); - if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", "."); - goto cleanup; - } - filteredImage_view=AcquireAuthenticCacheView(filteredImage,exception); - filteredPixels=GetCacheViewAuthenticPixels(filteredImage_view,0,0,filteredImage->columns,filteredImage->rows,exception); - if (filteredPixels == (void *) NULL) - { - (void) OpenCLThrowMagickException(exception,GetMagickModule(),CacheWarning, "UnableToReadPixelCache.","`%s'",filteredImage->filename); - goto cleanup; - } - - if (ALIGNED(filteredPixels,CLPixelPacket)) - { - mem_flags = CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR; - hostPtr = filteredPixels; - } - else - { - mem_flags = CL_MEM_WRITE_ONLY; - hostPtr = NULL; - } - /* create a CL buffer from image pixel buffer */ - length = image->columns * image->rows; - filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); - goto cleanup; - } - - hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1"); - hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2"); - - clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer); - clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1)); - imageWidth = image->columns; - clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth); - imageHeight = image->rows; - clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight); - matte = (image->alpha_trait==BlendPixelTrait)?0:1; - clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - - clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1)); - clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer); - imageWidth = image->columns; - clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth); - imageHeight = image->rows; - clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight); - matte = (image->alpha_trait==BlendPixelTrait)?0:1; - clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - - - global_work_size[0] = image->columns; - global_work_size[1] = image->rows; - - - for (k = 0; k < 4; k++) - { - cl_int2 offset; - int polarity; - - - offset.s[0] = X[k]; - offset.s[1] = Y[k]; - polarity = 1; - clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); - clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - - - if (k == 0) - clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer)); - offset.s[0] = -X[k]; - offset.s[1] = -Y[k]; - polarity = 1; - clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); - clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - - offset.s[0] = -X[k]; - offset.s[1] = -Y[k]; - polarity = -1; - clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); - clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } + mem_flags = CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR; + hostPtr = stretch_map; + } + /* create a CL buffer for stretch_map */ + length = (MaxMap+1); + stretchMapBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(PixelPacket), hostPtr, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); + goto cleanup; + } - offset.s[0] = X[k]; - offset.s[1] = Y[k]; - polarity = -1; - clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); - clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); - clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); + /* get the OpenCL kernel */ + stretchKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "Stretch"); + if (stretchKernel == NULL) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); + goto cleanup; + } - if (k == 3) - clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer); + /* set the kernel arguments */ + i = 0; + clStatus=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&imageBuffer); + clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(ChannelType),&channel); + clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(cl_mem),(void *)&stretchMapBuffer); + clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&white); + clStatus|=clEnv->library->clSetKernelArg(stretchKernel,i++,sizeof(FloatPixelPacket),&black); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } - /* launch the kernel */ - clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); - goto cleanup; - } + /* launch the kernel */ + global_work_size[0] = image->columns; + global_work_size[1] = image->rows; + + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, stretchKernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; } + clEnv->library->clFlush(queue); - if (ALIGNED(filteredPixels,CLPixelPacket)) + /* read the data back */ + if (ALIGNED(inputPixels,CLPixelPacket)) { length = image->columns * image->rows; - clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); + clEnv->library->clEnqueueMapBuffer(queue, imageBuffer, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); } else { length = image->columns * image->rows; - clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL); + clStatus = clEnv->library->clEnqueueReadBuffer(queue, imageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), inputPixels, 0, NULL, NULL); } if (clStatus != CL_SUCCESS) { @@ -5812,50 +5709,95 @@ static Image *ComputeDespeckleImage(const Image *image, goto cleanup; } - outputReady=SyncCacheViewAuthenticPixels(filteredImage_view,exception); + outputReady=SyncCacheViewAuthenticPixels(image_view,exception); cleanup: OpenCLLogException(__FUNCTION__,__LINE__,exception); image_view=DestroyCacheView(image_view); - if (filteredImage_view != NULL) - filteredImage_view=DestroyCacheView(filteredImage_view); - if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue); - if (imageBuffer!=NULL) clEnv->library->clReleaseMemObject(imageBuffer); - for (k = 0; k < 2; k++) - { - if (tempImageBuffer[k]!=NULL) clEnv->library->clReleaseMemObject(tempImageBuffer[k]); - } - if (filteredImageBuffer!=NULL) clEnv->library->clReleaseMemObject(filteredImageBuffer); - if (hullPass1!=NULL) RelinquishOpenCLKernel(clEnv, hullPass1); - if (hullPass2!=NULL) RelinquishOpenCLKernel(clEnv, hullPass2); - if (outputReady == MagickFalse && filteredImage != NULL) - filteredImage=DestroyImage(filteredImage); - return(filteredImage); + if (imageBuffer!=NULL) + clEnv->library->clReleaseMemObject(imageBuffer); + + if (stretchMapBuffer!=NULL) + clEnv->library->clReleaseMemObject(stretchMapBuffer); + if (stretch_map!=NULL) + stretch_map=(PixelPacket *) RelinquishMagickMemory(stretch_map); + + + if (histogramBuffer!=NULL) + clEnv->library->clReleaseMemObject(histogramBuffer); + if (histogram!=NULL) + histogram=(cl_uint4 *) RelinquishMagickMemory(histogram); + + + if (histogramKernel!=NULL) + RelinquishOpenCLKernel(clEnv, histogramKernel); + if (stretchKernel!=NULL) + RelinquishOpenCLKernel(clEnv, stretchKernel); + + if (queue != NULL) + RelinquishOpenCLCommandQueue(clEnv, queue); + + return(outputReady); } -MagickExport Image *AccelerateDespeckleImage(const Image* image, - ExceptionInfo* exception) +MagickExport MagickBooleanType AccelerateContrastStretchImageChannel( + Image *image,const ChannelType channel,const double black_point, + const double white_point,ExceptionInfo *exception) { - Image - *filteredImage; + MagickBooleanType + status; assert(image != NULL); assert(exception != (ExceptionInfo *) NULL); if ((checkOpenCLEnvironment(exception) == MagickFalse) || - (checkAccelerateCondition(image, AllChannels) == MagickFalse)) - return NULL; + (checkAccelerateCondition(image, channel) == MagickFalse) || + (checkHistogramCondition(image, channel) == MagickFalse)) + return(MagickFalse); - filteredImage=ComputeDespeckleImage(image,exception); - return(filteredImage); + status=ComputeContrastStretchImageChannel(image,channel, black_point, white_point, exception); + return(status); } -static Image *ComputeAddNoiseImage(const Image *image, - const ChannelType channel,const NoiseType noise_type, - ExceptionInfo *exception) +/* +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% D e s p e c k l e I m a g e w i t h O p e n C L % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% DespeckleImage() reduces the speckle noise in an image while perserving the +% edges of the original image. A speckle removing filter uses a complementary +% hulling technique (raising pixels that are darker than their surrounding +% neighbors, then complementarily lowering pixels that are brighter than their +% surrounding neighbors) to reduce the speckle index of that image (reference +% Crimmins speckle removal). +% +% The format of the DespeckleImage method is: +% +% Image *DespeckleImage(const Image *image,ExceptionInfo *exception) +% +% A description of each parameter follows: +% +% o image: the image. +% +% o exception: return any errors or warnings in this structure. +% +*/ + +static Image *ComputeDespeckleImage(const Image *image, + ExceptionInfo*exception) { + static const int + X[4] = {0, 1, 1,-1}, + Y[4] = {1, 0, 1, 1}; + CacheView *filteredImage_view, *image_view; @@ -5870,7 +5812,8 @@ static Image *ComputeAddNoiseImage(const Image *image, clStatus; cl_kernel - addNoiseKernel; + hullPass1, + hullPass2; cl_mem_flags mem_flags; @@ -5878,17 +5821,17 @@ static Image *ComputeAddNoiseImage(const Image *image, cl_mem filteredImageBuffer, imageBuffer, - randomNumberBuffer; - - const char - *option; + tempImageBuffer[2]; const void *inputPixels; - float - attenuate, - *randomNumberBufferPtr; + Image + *filteredImage; + + int + k, + matte; MagickBooleanType outputReady; @@ -5899,31 +5842,12 @@ static Image *ComputeAddNoiseImage(const Image *image, MagickSizeType length; - Image - *filteredImage; - - int - i; - - RandomInfo - **restrict random_info; - size_t global_work_size[2]; unsigned int - inputColumns, - inputRows, - k, - numRandomNumberPerBuffer, - numRandomNumberPerPixel, - numRowsPerKernelLaunch, - r; - -#if defined(MAGICKCORE_OPENMP_SUPPORT) - unsigned long - key; -#endif + imageHeight, + imageWidth; void *filteredPixels, @@ -5935,14 +5859,13 @@ static Image *ComputeAddNoiseImage(const Image *image, filteredImage = NULL; filteredImage_view = NULL; filteredPixels = NULL; - randomNumberBufferPtr = NULL; context = NULL; imageBuffer = NULL; - randomNumberBuffer = NULL; filteredImageBuffer = NULL; + hullPass1 = NULL; + hullPass2 = NULL; queue = NULL; - addNoiseKernel = NULL; - + tempImageBuffer[0] = tempImageBuffer[1] = NULL; clEnv = GetDefaultOpenCLEnv(); context = GetOpenCLContext(clEnv); queue = AcquireOpenCLCommandQueue(clEnv); @@ -5972,6 +5895,17 @@ static Image *ComputeAddNoiseImage(const Image *image, goto cleanup; } + mem_flags = CL_MEM_READ_WRITE; + length = image->columns * image->rows; + for (k = 0; k < 2; k++) + { + tempImageBuffer[k] = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), NULL, &clStatus); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); + goto cleanup; + } + } filteredImage = CloneImage(image,image->columns,image->rows,MagickTrue,exception); assert(filteredImage != NULL); @@ -6007,105 +5941,162 @@ static Image *ComputeAddNoiseImage(const Image *image, goto cleanup; } - /* find out how many random numbers needed by pixel */ - numRandomNumberPerPixel = 0; - { - unsigned int numRandPerChannel = 0; - switch (noise_type) - { - case UniformNoise: - case ImpulseNoise: - case LaplacianNoise: - case RandomNoise: - default: - numRandPerChannel = 1; - break; - case GaussianNoise: - case MultiplicativeGaussianNoise: - case PoissonNoise: - numRandPerChannel = 2; - break; - }; + hullPass1 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass1"); + hullPass2 = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "HullPass2"); - if ((channel & RedChannel) != 0) - numRandomNumberPerPixel+=numRandPerChannel; - if ((channel & GreenChannel) != 0) - numRandomNumberPerPixel+=numRandPerChannel; - if ((channel & BlueChannel) != 0) - numRandomNumberPerPixel+=numRandPerChannel; - if ((channel & OpacityChannel) != 0) - numRandomNumberPerPixel+=numRandPerChannel; + clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)&imageBuffer); + clStatus |=clEnv->library->clSetKernelArg(hullPass1,1,sizeof(cl_mem),(void *)(tempImageBuffer+1)); + imageWidth = image->columns; + clStatus |=clEnv->library->clSetKernelArg(hullPass1,2,sizeof(unsigned int),(void *)&imageWidth); + imageHeight = image->rows; + clStatus |=clEnv->library->clSetKernelArg(hullPass1,3,sizeof(unsigned int),(void *)&imageHeight); + matte = (image->alpha_trait==BlendPixelTrait)?0:1; + clStatus |=clEnv->library->clSetKernelArg(hullPass1,6,sizeof(int),(void *)&matte); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; } - numRowsPerKernelLaunch = 512; - /* create a buffer for random numbers */ - numRandomNumberPerBuffer = (image->columns*numRowsPerKernelLaunch)*numRandomNumberPerPixel; - randomNumberBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, numRandomNumberPerBuffer*sizeof(float) - , NULL, &clStatus); - - - /* set up the random number generators */ - attenuate=1.0; - option=GetImageArtifact(image,"attenuate"); - if (option != (char *) NULL) - attenuate=StringToDouble(option,(char **) NULL); - random_info=AcquireRandomInfoThreadSet(); -#if defined(MAGICKCORE_OPENMP_SUPPORT) - key=GetRandomSecretKey(random_info[0]); -#endif + clStatus = clEnv->library->clSetKernelArg(hullPass2,0,sizeof(cl_mem),(void *)(tempImageBuffer+1)); + clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)tempImageBuffer); + imageWidth = image->columns; + clStatus |=clEnv->library->clSetKernelArg(hullPass2,2,sizeof(unsigned int),(void *)&imageWidth); + imageHeight = image->rows; + clStatus |=clEnv->library->clSetKernelArg(hullPass2,3,sizeof(unsigned int),(void *)&imageHeight); + matte = (image->alpha_trait==BlendPixelTrait)?0:1; + clStatus |=clEnv->library->clSetKernelArg(hullPass2,6,sizeof(int),(void *)&matte); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } - addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoiseImage"); - k = 0; - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer); - inputColumns = image->columns; - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputColumns); - inputRows = image->rows; - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputRows); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type); - attenuate=1.0f; - option=GetImageArtifact(image,"attenuate"); - if (option != (char *) NULL) - attenuate=(float)StringToDouble(option,(char **) NULL); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel); + global_work_size[0] = image->columns; + global_work_size[1] = image->rows; - global_work_size[0] = inputColumns; - for (r = 0; r < inputRows; r+=numRowsPerKernelLaunch) + + for (k = 0; k < 4; k++) { - /* Generate random numbers in the buffer */ - randomNumberBufferPtr = (float*)clEnv->library->clEnqueueMapBuffer(queue, randomNumberBuffer, CL_TRUE, CL_MAP_WRITE, 0 - , numRandomNumberPerBuffer*sizeof(float), 0, NULL, NULL, &clStatus); + cl_int2 offset; + int polarity; + + + offset.s[0] = X[k]; + offset.s[1] = Y[k]; + polarity = 1; + clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); + clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); if (clStatus != CL_SUCCESS) { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.","."); + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); goto cleanup; } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } -#if defined(MAGICKCORE_OPENMP_SUPPORT) - #pragma omp parallel for schedule(static,4) \ - num_threads((key == ~0UL) == 0 ? 1 : (size_t) GetMagickResourceLimit(ThreadResource)) -#endif - for (i = 0; i < numRandomNumberPerBuffer; i++) + + if (k == 0) + clStatus =clEnv->library->clSetKernelArg(hullPass1,0,sizeof(cl_mem),(void *)(tempImageBuffer)); + offset.s[0] = -X[k]; + offset.s[1] = -Y[k]; + polarity = 1; + clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); + clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); + if (clStatus != CL_SUCCESS) { - const int id = GetOpenMPThreadId(); - randomNumberBufferPtr[i] = (float)GetPseudoRandomValue(random_info[id]); + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } - clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, randomNumberBuffer, randomNumberBufferPtr, 0, NULL, NULL); + offset.s[0] = -X[k]; + offset.s[1] = -Y[k]; + polarity = -1; + clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); + clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); if (clStatus != CL_SUCCESS) { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.","."); + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); goto cleanup; } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + + offset.s[0] = X[k]; + offset.s[1] = Y[k]; + polarity = -1; + clStatus = clEnv->library->clSetKernelArg(hullPass1,4,sizeof(cl_int2),(void *)&offset); + clStatus|= clEnv->library->clSetKernelArg(hullPass1,5,sizeof(int),(void *)&polarity); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,4,sizeof(cl_int2),(void *)&offset); + clStatus|=clEnv->library->clSetKernelArg(hullPass2,5,sizeof(int),(void *)&polarity); + + if (k == 3) + clStatus |=clEnv->library->clSetKernelArg(hullPass2,1,sizeof(cl_mem),(void *)&filteredImageBuffer); - /* set the row offset */ - clEnv->library->clSetKernelArg(addNoiseKernel,k,sizeof(unsigned int),(void *)&r); - global_work_size[1] = MAGICK_MIN(numRowsPerKernelLaunch, inputRows - r); - clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,2,NULL,global_work_size,NULL,0,NULL,NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass1, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } + /* launch the kernel */ + clStatus = clEnv->library->clEnqueueNDRangeKernel(queue, hullPass2, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueNDRangeKernel failed.", "'%s'", "."); + goto cleanup; + } } if (ALIGNED(filteredPixels,CLPixelPacket)) @@ -6133,20 +6124,40 @@ cleanup: if (filteredImage_view != NULL) filteredImage_view=DestroyCacheView(filteredImage_view); - if (queue!=NULL) RelinquishOpenCLCommandQueue(clEnv, queue); - if (addNoiseKernel!=NULL) RelinquishOpenCLKernel(clEnv, addNoiseKernel); - if (imageBuffer!=NULL) clEnv->library->clReleaseMemObject(imageBuffer); - if (randomNumberBuffer!=NULL) clEnv->library->clReleaseMemObject(randomNumberBuffer); - if (filteredImageBuffer!=NULL) clEnv->library->clReleaseMemObject(filteredImageBuffer); - if (outputReady == MagickFalse && filteredImage != NULL) + if (queue != NULL) RelinquishOpenCLCommandQueue(clEnv, queue); + if (imageBuffer!=NULL) clEnv->library->clReleaseMemObject(imageBuffer); + for (k = 0; k < 2; k++) + { + if (tempImageBuffer[k]!=NULL) clEnv->library->clReleaseMemObject(tempImageBuffer[k]); + } + if (filteredImageBuffer!=NULL) clEnv->library->clReleaseMemObject(filteredImageBuffer); + if (hullPass1!=NULL) RelinquishOpenCLKernel(clEnv, hullPass1); + if (hullPass2!=NULL) RelinquishOpenCLKernel(clEnv, hullPass2); + if (outputReady == MagickFalse && filteredImage != NULL) filteredImage=DestroyImage(filteredImage); + return(filteredImage); +} + +MagickExport Image *AccelerateDespeckleImage(const Image* image, + ExceptionInfo* exception) +{ + Image + *filteredImage; + + assert(image != NULL); + assert(exception != (ExceptionInfo *) NULL); + + if ((checkOpenCLEnvironment(exception) == MagickFalse) || + (checkAccelerateCondition(image, AllChannels) == MagickFalse)) + return NULL; + filteredImage=ComputeDespeckleImage(image,exception); return(filteredImage); } -static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, +static Image *ComputeAddNoiseImage(const Image *image, const ChannelType channel,const NoiseType noise_type, - ExceptionInfo *exception) + ExceptionInfo *exception) { CacheView *filteredImage_view, @@ -6159,21 +6170,24 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, context; cl_int + inputPixelCount, + pixelsPerWorkitem, clStatus; - cl_kernel - addNoiseKernel, - randomNumberGeneratorKernel; + cl_uint + seed0, + seed1; - cl_mem - filteredImageBuffer, - imageBuffer, - randomNumberBuffer, - randomNumberSeedsBuffer; + cl_kernel + addNoiseKernel; cl_mem_flags mem_flags; + cl_mem + filteredImageBuffer, + imageBuffer; + const char *option; @@ -6181,14 +6195,7 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, *inputPixels; float - attenuate, - fNormalize; - - Image - *filteredImage; - - int - i; + attenuate; MagickBooleanType outputReady; @@ -6199,20 +6206,24 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, MagickSizeType length; + Image + *filteredImage; + + RandomInfo + **restrict random_info; + size_t - global_work_size[2], - random_work_size; + global_work_size[1], + local_work_size[1]; unsigned int - initRandom, - inputColumns, - inputRows, k, - numRandomNumberGenerators, - numRandomNumberPerBuffer, - numRandomNumberPerPixel, - numRowsPerKernelLaunch, - r; + numRandomNumberPerPixel; + +#if defined(MAGICKCORE_OPENMP_SUPPORT) + unsigned long + key; +#endif void *filteredPixels, @@ -6226,12 +6237,9 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, filteredPixels = NULL; context = NULL; imageBuffer = NULL; - randomNumberBuffer = NULL; filteredImageBuffer = NULL; - randomNumberSeedsBuffer = NULL; queue = NULL; addNoiseKernel = NULL; - randomNumberGeneratorKernel = NULL; clEnv = GetDefaultOpenCLEnv(); context = GetOpenCLContext(clEnv); @@ -6327,72 +6335,44 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, numRandomNumberPerPixel+=numRandPerChannel; } - numRowsPerKernelLaunch = 512; + /* set up the random number generators */ + attenuate=1.0; + option=GetImageArtifact(image,"attenuate"); + if (option != (char *) NULL) + attenuate=StringToDouble(option,(char **) NULL); + random_info=AcquireRandomInfoThreadSet(); +#if defined(MAGICKCORE_OPENMP_SUPPORT) + key=GetRandomSecretKey(random_info[0]); +#endif - /* create a buffer for random numbers */ - numRandomNumberPerBuffer = (image->columns*numRowsPerKernelLaunch)*numRandomNumberPerPixel; - randomNumberBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_READ_WRITE, numRandomNumberPerBuffer*sizeof(float) - , NULL, &clStatus); + addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"GenerateNoiseImage"); { - /* setup the random number generators */ - unsigned long* seeds; - numRandomNumberGenerators = 512; - randomNumberSeedsBuffer = clEnv->library->clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR|CL_MEM_READ_WRITE - , numRandomNumberGenerators * 4 * sizeof(unsigned long), NULL, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.","."); - goto cleanup; - } - seeds = (unsigned long*) clEnv->library->clEnqueueMapBuffer(queue, randomNumberSeedsBuffer, CL_TRUE, CL_MAP_WRITE, 0 - , numRandomNumberGenerators*4*sizeof(unsigned long), 0, NULL, NULL, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueMapBuffer failed.","."); - goto cleanup; - } - - for (i = 0; i < numRandomNumberGenerators; i++) { - RandomInfo* randomInfo = AcquireRandomInfo(); - const unsigned long* s = GetRandomInfoSeed(randomInfo); - - if (i == 0) - fNormalize = GetRandomInfoNormalize(randomInfo); - - seeds[i*4] = s[0]; - randomInfo = DestroyRandomInfo(randomInfo); - } - - clStatus = clEnv->library->clEnqueueUnmapMemObject(queue, randomNumberSeedsBuffer, seeds, 0, NULL, NULL); - if (clStatus != CL_SUCCESS) - { - (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clEnqueueUnmapMemObject failed.","."); - goto cleanup; - } - - randomNumberGeneratorKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE - ,"randomNumberGeneratorKernel"); - - k = 0; - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(cl_mem),(void *)&randomNumberSeedsBuffer); - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(float),(void *)&fNormalize); - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer); - initRandom = 1; - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(unsigned int),(void *)&initRandom); - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerBuffer); + cl_uint computeUnitCount; + cl_uint workItemCount; + clEnv->library->clGetDeviceInfo(clEnv->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &computeUnitCount, NULL); + workItemCount = computeUnitCount * 2 * 256; // 256 work items per group, 2 groups per CU + inputPixelCount = image->columns * image->rows; + pixelsPerWorkitem = (inputPixelCount + workItemCount - 1) / workItemCount; + pixelsPerWorkitem = ((pixelsPerWorkitem + 3) / 4) * 4; - random_work_size = numRandomNumberGenerators; + local_work_size[0] = 256; + global_work_size[0] = workItemCount; + } + { + RandomInfo* randomInfo = AcquireRandomInfo(); + const unsigned long* s = GetRandomInfoSeed(randomInfo); + seed0 = s[0]; + GetPseudoRandomValue(randomInfo); + seed1 = s[0]; + randomInfo = DestroyRandomInfo(randomInfo); } - addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoiseImage"); k = 0; clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&imageBuffer); clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&filteredImageBuffer); - inputColumns = image->columns; - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputColumns); - inputRows = image->rows; - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&inputRows); + clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&inputPixelCount); + clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&pixelsPerWorkitem); clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(ChannelType),(void *)&channel); clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(NoiseType),(void *)&noise_type); attenuate=1.0f; @@ -6400,28 +6380,11 @@ static Image *ComputeAddNoiseImageOptRandomNum(const Image*image, if (option != (char *) NULL) attenuate=(float)StringToDouble(option,(char **) NULL); clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(float),(void *)&attenuate); - clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_mem),(void *)&randomNumberBuffer); + clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed0); + clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(cl_uint),(void *)&seed1); clEnv->library->clSetKernelArg(addNoiseKernel,k++,sizeof(unsigned int),(void *)&numRandomNumberPerPixel); - global_work_size[0] = inputColumns; - for (r = 0; r < inputRows; r+=numRowsPerKernelLaunch) - { - size_t generator_local_size = 64; - /* Generate random numbers in the buffer */ - clEnv->library->clEnqueueNDRangeKernel(queue,randomNumberGeneratorKernel,1,NULL - ,&random_work_size,&generator_local_size,0,NULL,NULL); - if (initRandom != 0) - { - /* make sure we only do init once */ - initRandom = 0; - clEnv->library->clSetKernelArg(randomNumberGeneratorKernel,3,sizeof(unsigned int),(void *)&initRandom); - } - - /* set the row offset */ - clEnv->library->clSetKernelArg(addNoiseKernel,k,sizeof(unsigned int),(void *)&r); - global_work_size[1] = MAGICK_MIN(numRowsPerKernelLaunch, inputRows - r); - clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,2,NULL,global_work_size,NULL,0,NULL,NULL); - } + clEnv->library->clEnqueueNDRangeKernel(queue,addNoiseKernel,1,NULL,global_work_size,NULL,0,NULL,NULL); if (ALIGNED(filteredPixels,CLPixelPacket)) { @@ -6450,17 +6413,15 @@ cleanup: if (queue!=NULL) RelinquishOpenCLCommandQueue(clEnv, queue); if (addNoiseKernel!=NULL) RelinquishOpenCLKernel(clEnv, addNoiseKernel); - if (randomNumberGeneratorKernel!=NULL) RelinquishOpenCLKernel(clEnv, randomNumberGeneratorKernel); if (imageBuffer!=NULL) clEnv->library->clReleaseMemObject(imageBuffer); - if (randomNumberBuffer!=NULL) clEnv->library->clReleaseMemObject(randomNumberBuffer); if (filteredImageBuffer!=NULL) clEnv->library->clReleaseMemObject(filteredImageBuffer); - if (randomNumberSeedsBuffer!=NULL) clEnv->library->clReleaseMemObject(randomNumberSeedsBuffer); if (outputReady == MagickFalse && filteredImage != NULL) filteredImage=DestroyImage(filteredImage); return(filteredImage); } + MagickExport Image *AccelerateAddNoiseImage(const Image *image, const ChannelType channel,const NoiseType noise_type, ExceptionInfo *exception) @@ -6475,12 +6436,7 @@ MagickExport Image *AccelerateAddNoiseImage(const Image *image, (checkAccelerateCondition(image, channel) == MagickFalse)) return NULL; -DisableMSCWarning(4127) - if (sizeof(unsigned long) == 4) -RestoreMSCWarning - filteredImage = ComputeAddNoiseImageOptRandomNum(image,channel,noise_type,exception); - else - filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception); + filteredImage = ComputeAddNoiseImage(image,channel,noise_type,exception); return(filteredImage); }