From be04cd4a903ac006a2d6f9607cad24aa4fe491bf Mon Sep 17 00:00:00 2001 From: dirk Date: Mon, 28 Mar 2016 18:16:33 +0200 Subject: [PATCH] AccelerateWaveletDenoiseImage now supports R/RGB images. --- MagickCore/accelerate-private.h | 95 ++++++++++-------------- MagickCore/accelerate.c | 126 ++++++++++---------------------- MagickCore/opencl-private.h | 2 +- MagickCore/opencl.c | 1 - 4 files changed, 77 insertions(+), 147 deletions(-) diff --git a/MagickCore/accelerate-private.h b/MagickCore/accelerate-private.h index 1a410135a..09f858c0c 100644 --- a/MagickCore/accelerate-private.h +++ b/MagickCore/accelerate-private.h @@ -490,6 +490,16 @@ OPENCL_ENDIF() return intensity; } + + inline int mirrorBottom(int value) + { + return (value < 0) ? - (value) : value; + } + + inline int mirrorTop(int value, int width) + { + return (value >= width) ? (2 * width - value - 1) : value; + } ) /* @@ -2149,14 +2159,6 @@ OPENCL_ENDIF() */ STRINGIFY( - inline int mirrorBottom(int value) - { - return (value < 0) ? - (value) : value; - } - inline int mirrorTop(int value, int width) - { - return (value >= width) ? (2 * width - value - 1) : value; - } __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, const int radius, @@ -3526,15 +3528,17 @@ STRINGIFY( STRINGIFY( __kernel __attribute__((reqd_work_group_size(64, 4, 1))) - void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage, - const float threshold,const int passes,const int imageWidth,const int imageHeight) + void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, + const unsigned int number_channels,const unsigned int max_channels, + const float threshold,const int passes,const unsigned int imageWidth, + const unsigned int imageHeight) { - const int pad = (1 << (passes - 1));; + const int pad = (1 << (passes - 1)); const int tileSize = 64; const int tileRowPixels = 64; const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; - CLPixelType stage[16]; + CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) local float buffer[64 * 64]; @@ -3542,27 +3546,17 @@ STRINGIFY( int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth]; + int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + + (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; + + for (int channel = 0; channel < max_channels; ++channel) + stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; } - - for (int channel = 0; channel < 3; ++channel) { + for (int channel = 0; channel < max_channels; ++channel) { // Load LDS - switch (channel) { - case 0: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0); - break; - case 1: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1); - break; - case 2: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2); - break; - } - + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); // Process @@ -3570,16 +3564,14 @@ STRINGIFY( float accum[16]; float pixel; + for (int i = 0; i < 16; i++) + accum[i]=0.0f; + for (int pass = 0; pass < passes; ++pass) { const int radius = 1 << pass; const int x = get_local_id(0); const float thresh = threshold * noise[pass]; - if (pass == 0) - accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f; - - // Snapshot input - // Apply horizontal hat for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { const int offset = i * tileRowPixels; @@ -3590,6 +3582,7 @@ STRINGIFY( buffer[x + offset] = pixel; } barrier(CLK_LOCAL_MEM_FENCE); + // Apply vertical hat for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); @@ -3602,43 +3595,33 @@ STRINGIFY( else delta = 0; accum[i / 4] += delta; - } barrier(CLK_LOCAL_MEM_FENCE); + if (pass < passes - 1) for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass + buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass else // last pass for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output + accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output barrier(CLK_LOCAL_MEM_FENCE); } - switch (channel) { - case 0: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - stage[i / 4].s0 = ClampToQuantum(accum[i / 4]); - break; - case 1: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - stage[i / 4].s1 = ClampToQuantum(accum[i / 4]); - break; - case 2: - for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) - stage[i / 4].s2 = ClampToQuantum(accum[i / 4]); - break; - } + for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) + stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); barrier(CLK_LOCAL_MEM_FENCE); } // Write from stage to output - if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { - //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) { + if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { - if ((i >= pad) && (i < tileSize - pad) && (srcy + i > 0) && (srcy + i < imageHeight)) { - dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4]; + if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { + int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); + for (int channel = 0; channel < max_channels; ++channel) { + dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; + } } } } diff --git a/MagickCore/accelerate.c b/MagickCore/accelerate.c index 76e4940e7..7f089c042 100644 --- a/MagickCore/accelerate.c +++ b/MagickCore/accelerate.c @@ -469,7 +469,8 @@ static Image *ComputeAddNoiseImage(const Image *image, goto cleanup; filteredImage=CloneImage(image,image->columns,image->rows,MagickTrue,exception); - assert(filteredImage != (Image *) NULL); + if (filteredImage == (Image *) NULL) + goto cleanup; if (SetImageStorageClass(filteredImage,DirectClass,exception) != MagickTrue) { (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", "."); @@ -513,6 +514,11 @@ static Image *ComputeAddNoiseImage(const Image *image, } addNoiseKernel = AcquireOpenCLKernel(clEnv,MAGICK_OPENCL_ACCELERATE,"AddNoise"); + if (addNoiseKernel == NULL) + { + (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); + goto cleanup; + } { cl_uint computeUnitCount; @@ -7398,7 +7404,7 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, queue; cl_context - context; + context; cl_int clStatus; @@ -7413,12 +7419,6 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, filteredImageBuffer, imageBuffer; - cl_mem_flags - mem_flags; - - const void - *inputPixels; - Image *filteredImage; @@ -7428,24 +7428,17 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, MagickCLEnv clEnv; - MagickSizeType - length; - void - *filteredPixels, - *hostPtr; + *filteredPixels; unsigned int i; - clEnv = NULL; filteredImage = NULL; filteredImage_view = NULL; - context = NULL; - imageBuffer = NULL; filteredImageBuffer = NULL; + filteredPixels = NULL; denoiseKernel = NULL; - queue = NULL; outputReady = MagickFalse; clEnv = GetDefaultOpenCLEnv(); @@ -7454,68 +7447,24 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, /* Create and initialize OpenCL buffers. */ image_view = AcquireVirtualCacheView(image, exception); - inputPixels = GetCacheViewVirtualPixels(image_view, 0, 0, image->columns, image->rows, exception); - if (inputPixels == (const void *)NULL) - { - (void)OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning, "UnableToReadPixelCache.", "`%s'", image->filename); - goto cleanup; - } - - /* If the host pointer is aligned to the size of CLPixelPacket, - then use the host buffer directly from the GPU; otherwise, - create a buffer on the GPU and copy the data over */ - if (ALIGNED(inputPixels, CLPixelPacket)) - { - mem_flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR; - } - else - { - mem_flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR; - } - /* create a CL buffer from image pixel buffer */ - length = image->columns * image->rows; - imageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), (void*)inputPixels, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.", "."); + imageBuffer=createReadBuffer(image,image_view,clEnv,context,exception); + if (imageBuffer == (cl_mem) NULL) goto cleanup; - } /* create output */ - filteredImage = CloneImage(image, image->columns, image->rows, MagickTrue, exception); - assert(filteredImage != NULL); + filteredImage=CloneImage(image,0,0,MagickTrue,exception); + if (filteredImage == (Image *) NULL) + goto cleanup; if (SetImageStorageClass(filteredImage, DirectClass, exception) != MagickTrue) { (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "CloneImage failed.", "'%s'", "."); goto cleanup; } filteredImage_view = AcquireAuthenticCacheView(filteredImage, exception); - filteredPixels = GetCacheViewAuthenticPixels(filteredImage_view, 0, 0, filteredImage->columns, filteredImage->rows, exception); - if (filteredPixels == (void *)NULL) - { - (void)OpenCLThrowMagickException(exception, GetMagickModule(), CacheWarning, "UnableToReadPixelCache.", "`%s'", filteredImage->filename); - goto cleanup; - } - - if (ALIGNED(filteredPixels, CLPixelPacket)) - { - mem_flags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR; - hostPtr = filteredPixels; - } - else - { - mem_flags = CL_MEM_WRITE_ONLY; - hostPtr = NULL; - } - - /* create a CL buffer from image pixel buffer */ - length = image->columns * image->rows; - filteredImageBuffer = clEnv->library->clCreateBuffer(context, mem_flags, length * sizeof(CLPixelPacket), hostPtr, &clStatus); - if (clStatus != CL_SUCCESS) - { - (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clCreateBuffer failed.", "."); + filteredImageBuffer=createWriteBuffer(filteredImage,filteredImage_view,clEnv, + context,filteredPixels,exception); + if (filteredImageBuffer == (cl_mem) NULL) goto cleanup; - } /* get the opencl kernel */ denoiseKernel = AcquireOpenCLKernel(clEnv, MAGICK_OPENCL_ACCELERATE, "WaveletDenoise"); @@ -7523,23 +7472,34 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, { (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "AcquireOpenCLKernel failed.", "'%s'", "."); goto cleanup; - }; + } // Process image { const int PASSES = 5; - cl_int width = (cl_int)image->columns; - cl_int height = (cl_int)image->rows; + cl_uint number_channels = (cl_uint)image->number_channels; + cl_uint width = (cl_uint)image->columns; + cl_uint height = (cl_uint)image->rows; + cl_uint max_channels = number_channels; + if ((max_channels == 4) || (max_channels == 2)) + max_channels=max_channels-1; cl_float thresh = threshold; /* set the kernel arguments */ i = 0; - clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&imageBuffer); + clStatus = clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&imageBuffer); clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_mem), (void *)&filteredImageBuffer); + clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&number_channels); + clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&max_channels); clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_float), (void *)&thresh); clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&PASSES); - clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&width); - clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_int), (void *)&height); + clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&width); + clStatus |= clEnv->library->clSetKernelArg(denoiseKernel, i++, sizeof(cl_uint), (void *)&height); + if (clStatus != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "clEnv->library->clSetKernelArg failed.", "'%s'", "."); + goto cleanup; + } { const int TILESIZE = 64; @@ -7565,19 +7525,7 @@ static Image *ComputeWaveletDenoiseImage(const Image *image, clEnv->library->clReleaseEvent(event); } - - /* get result */ - if (ALIGNED(filteredPixels, CLPixelPacket)) - { - length = image->columns * image->rows; - clEnv->library->clEnqueueMapBuffer(queue, filteredImageBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, length * sizeof(CLPixelPacket), 0, NULL, NULL, &clStatus); - } - else - { - length = image->columns * image->rows; - clStatus = clEnv->library->clEnqueueReadBuffer(queue, filteredImageBuffer, CL_TRUE, 0, length * sizeof(CLPixelPacket), filteredPixels, 0, NULL, NULL); - } - if (clStatus != CL_SUCCESS) + if (copyWriteBuffer(image,clEnv,queue,filteredImageBuffer,filteredPixels,exception) == MagickFalse) { (void)OpenCLThrowMagickException(exception, GetMagickModule(), ResourceLimitWarning, "Reading output image from CL buffer failed.", "'%s'", "."); goto cleanup; @@ -7616,7 +7564,7 @@ MagickExport Image *AccelerateWaveletDenoiseImage(const Image *image, assert(image != NULL); assert(exception != (ExceptionInfo *)NULL); - if ((checkAccelerateConditionRGBA(image) == MagickFalse) || + if ((checkAccelerateCondition(image) == MagickFalse) || (checkOpenCLEnvironment(exception) == MagickFalse)) return (Image *) NULL; diff --git a/MagickCore/opencl-private.h b/MagickCore/opencl-private.h index 02cd7a215..bfd6b8d72 100644 --- a/MagickCore/opencl-private.h +++ b/MagickCore/opencl-private.h @@ -317,7 +317,7 @@ struct _MagickCLEnv { #define CLOptions "-cl-single-precision-constant -cl-mad-enable -DMAGICKCORE_HDRI_SUPPORT=1 "\ "-DCLQuantum=float -DCLSignedQuantum=float -DCLPixelType=float4 -DQuantumRange=%f " \ "-DQuantumScale=%f -DCharQuantumScale=%f -DMagickEpsilon=%f -DMagickPI=%f "\ - " -DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u" + "-DMaxMap=%u -DMAGICKCORE_QUANTUM_DEPTH=%u" #define CLQuantum cl_float #define CLPixelPacket cl_float4 #define CLCharQuantumScale 1.0f diff --git a/MagickCore/opencl.c b/MagickCore/opencl.c index fcc9755ed..f4df318ed 100644 --- a/MagickCore/opencl.c +++ b/MagickCore/opencl.c @@ -1569,7 +1569,6 @@ MagickPrivate cl_kernel AcquireOpenCLKernel(MagickCLEnv clEnv, { kernel=clEnv->library->clCreateKernel(clEnv->programs[program],kernelName, &clStatus); - assert(kernel != (cl_kernel) NULL); } return(kernel); } -- 2.40.0