return intensity;
}
+
+ inline int mirrorBottom(int value)
+ {
+ return (value < 0) ? - (value) : value;
+ }
+
+ inline int mirrorTop(int value, int width)
+ {
+ return (value >= width) ? (2 * width - value - 1) : value;
+ }
)
/*
*/
STRINGIFY(
- inline int mirrorBottom(int value)
- {
- return (value < 0) ? - (value) : value;
- }
- inline int mirrorTop(int value, int width)
- {
- return (value >= width) ? (2 * width - value - 1) : value;
- }
__kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
const int radius,
STRINGIFY(
__kernel __attribute__((reqd_work_group_size(64, 4, 1)))
- void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
- const float threshold,const int passes,const int imageWidth,const int imageHeight)
+ void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
+ const unsigned int number_channels,const unsigned int max_channels,
+ const float threshold,const int passes,const unsigned int imageWidth,
+ const unsigned int imageHeight)
{
- const int pad = (1 << (passes - 1));;
+ const int pad = (1 << (passes - 1));
const int tileSize = 64;
const int tileRowPixels = 64;
const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
- CLPixelType stage[16];
+ CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
local float buffer[64 * 64];
int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
- stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
+ int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
+ (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
+
+ for (int channel = 0; channel < max_channels; ++channel)
+ stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
}
-
- for (int channel = 0; channel < 3; ++channel) {
+ for (int channel = 0; channel < max_channels; ++channel) {
// Load LDS
- switch (channel) {
- case 0:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
- break;
- case 1:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
- break;
- case 2:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
- break;
- }
-
+ for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+ buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
// Process
float accum[16];
float pixel;
+ for (int i = 0; i < 16; i++)
+ accum[i]=0.0f;
+
for (int pass = 0; pass < passes; ++pass) {
const int radius = 1 << pass;
const int x = get_local_id(0);
const float thresh = threshold * noise[pass];
- if (pass == 0)
- accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
-
- // Snapshot input
-
// Apply horizontal hat
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
const int offset = i * tileRowPixels;
buffer[x + offset] = pixel;
}
barrier(CLK_LOCAL_MEM_FENCE);
+
// Apply vertical hat
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
else
delta = 0;
accum[i / 4] += delta;
-
}
barrier(CLK_LOCAL_MEM_FENCE);
+
if (pass < passes - 1)
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
+ buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
else // last pass
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
+ accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
barrier(CLK_LOCAL_MEM_FENCE);
}
- switch (channel) {
- case 0:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
- break;
- case 1:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
- break;
- case 2:
- for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
- stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
- break;
- }
+ for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
+ stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write from stage to output
- if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
- //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
+ if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
- if ((i >= pad) && (i < tileSize - pad) && (srcy + i > 0) && (srcy + i < imageHeight)) {
- dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
+ if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
+ int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
+ for (int channel = 0; channel < max_channels; ++channel) {
+ dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
+ }
}
}
}
goto cleanup;
filteredImage=CloneImage(image,image->columns,image->rows,MagickTrue,exception);
- assert(filteredImage != (Image *) NULL);
+ if (filteredImage == (Image *) NULL)
+ goto cleanup;
if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue)
{
(void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
}
addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoise");
+ if (addNoiseKernel == NULL)
+ {
+ (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
+ goto cleanup;
+ }
{
cl_uint computeUnitCount;
queue;
cl_context
- context;
+ context;
cl_int
clStatus;
filteredImageBuffer,
imageBuffer;
- cl_mem_flags
- mem_flags;
-
- const void
- *inputPixels;
-
Image
*filteredImage;
MagickCLEnv
clEnv;
- MagickSizeType
- length;
-
void
- *filteredPixels,
- *hostPtr;
+ *filteredPixels;
unsigned int
i;
- clEnv = NULL;
filteredImage = NULL;
filteredImage_view = NULL;
- context = NULL;
- imageBuffer = NULL;
filteredImageBuffer = NULL;
+ filteredPixels = NULL;
denoiseKernel = NULL;
- queue = NULL;
outputReady = MagickFalse;
clEnv = GetDefaultOpenCLEnv();
/* Create and initialize OpenCL buffers. */
image_view = AcquireVirtualCacheView(image, exception);
- inputPixels = GetCacheViewVirtualPixels(image_view, 0, 0, image->columns, image->rows, exception);
- if (inputPixels == (const void *)NULL)
- {
- (void)OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning, "UnableToReadPixelCache.", "`%s'", image->filename);
- goto cleanup;
- }
-
- /* If the host pointer is aligned to the size of CLPixelPacket,
- then use the host buffer directly from the GPU; otherwise,
- create a buffer on the GPU and copy the data over */
- if (ALIGNED(inputPixels, CLPixelPacket))
- {
- mem_flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
- }
- else
- {
- mem_flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
- }
- /* create a CL buffer from image pixel buffer */
- length = image->columns * image->rows;
- imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus);
- if (clStatus != CL_SUCCESS)
- {
- (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.", ".");
+ imageBuffer=createReadBuffer(image,image_view,clEnv,context,exception);
+ if (imageBuffer == (cl_mem) NULL)
goto cleanup;
- }
/* create output */
- filteredImage = CloneImage(image, image->columns, image->rows, MagickTrue, exception);
- assert(filteredImage != NULL);
+ filteredImage=CloneImage(image,0,0,MagickTrue,exception);
+ if (filteredImage == (Image *) NULL)
+ goto cleanup;
if (SetImageStorageClass(filteredImage, DirectClass, exception) != MagickTrue)
{
(void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", ".");
goto cleanup;
}
filteredImage_view = AcquireAuthenticCacheView(filteredImage, exception);
- filteredPixels = GetCacheViewAuthenticPixels(filteredImage_view, 0, 0, filteredImage->columns, filteredImage->rows, exception);
- if (filteredPixels == (void *)NULL)
- {
- (void)OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning, "UnableToReadPixelCache.", "`%s'", filteredImage->filename);
- goto cleanup;
- }
-
- if (ALIGNED(filteredPixels, CLPixelPacket))
- {
- mem_flags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;
- hostPtr = filteredPixels;
- }
- else
- {
- mem_flags = CL_MEM_WRITE_ONLY;
- hostPtr = NULL;
- }
-
- /* create a CL buffer from image pixel buffer */
- length = image->columns * image->rows;
- filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus);
- if (clStatus != CL_SUCCESS)
- {
- (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.", ".");
+ filteredImageBuffer=createWriteBuffer(filteredImage,filteredImage_view,clEnv,
+ context,filteredPixels,exception);
+ if (filteredImageBuffer == (cl_mem) NULL)
goto cleanup;
- }
/* get the opencl kernel */
denoiseKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "WaveletDenoise");
{
(void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", ".");
goto cleanup;
- };
+ }
// Process image
{
const int PASSES = 5;
- cl_int width = (cl_int)image->columns;
- cl_int height = (cl_int)image->rows;
+ cl_uint number_channels = (cl_uint)image->number_channels;
+ cl_uint width = (cl_uint)image->columns;
+ cl_uint height = (cl_uint)image->rows;
+ cl_uint max_channels = number_channels;
+ if ((max_channels == 4) || (max_channels == 2))
+ max_channels=max_channels-1;
cl_float thresh = threshold;
/* set the kernel arguments */
i = 0;
- clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&imageBuffer);
+ clStatus = clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&imageBuffer);
clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&filteredImageBuffer);
+ clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&number_channels);
+ clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&max_channels);
clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_float), (void *)&thresh);
clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&PASSES);
- clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&width);
- clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&height);
+ clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&width);
+ clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&height);
+ if (clStatus != CL_SUCCESS)
+ {
+ (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", ".");
+ goto cleanup;
+ }
{
const int TILESIZE = 64;
clEnv->library->clReleaseEvent(event);
}
-
- /* get result */
- if (ALIGNED(filteredPixels, CLPixelPacket))
- {
- length = image->columns * image->rows;
- clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus);
- }
- else
- {
- length = image->columns * image->rows;
- clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL);
- }
- if (clStatus != CL_SUCCESS)
+ if (copyWriteBuffer(image,clEnv,queue,filteredImageBuffer,filteredPixels,exception) == MagickFalse)
{
(void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", ".");
goto cleanup;
assert(image != NULL);
assert(exception != (ExceptionInfo *)NULL);
- if ((checkAccelerateConditionRGBA(image) == MagickFalse) ||
+ if ((checkAccelerateCondition(image) == MagickFalse) ||
(checkOpenCLEnvironment(exception) == MagickFalse))
return (Image *) NULL;