From ad91ea878dff57bf0bbe70c06dd915ec14923dcf Mon Sep 17 00:00:00 2001 From: dirk Date: Tue, 20 Sep 2016 22:38:49 +0200 Subject: [PATCH] Work around some issues with OpenCL runtimes. --- MagickCore/accelerate-kernels-private.h | 7 ++- MagickCore/accelerate.c | 76 +++++++++++++++++-------- MagickCore/opencl-private.h | 3 +- MagickCore/opencl.c | 4 +- 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/MagickCore/accelerate-kernels-private.h b/MagickCore/accelerate-kernels-private.h index 7c60b7e91..0949c3961 100644 --- a/MagickCore/accelerate-kernels-private.h +++ b/MagickCore/accelerate-kernels-private.h @@ -1864,6 +1864,9 @@ OPENCL_ENDIF() int x = get_local_id(0); int y = get_global_id(1); + if ((x >= imageWidth) || (y >= imageHeight)) + return; + global CLPixelType *src = srcImage + y * imageWidth; for (int i = x; i < imageWidth; i += get_local_size(0)) { @@ -3035,8 +3038,8 @@ OPENCL_ENDIF() local float buffer[64 * 64]; - int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); - int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; + int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0); + int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad; for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + diff --git a/MagickCore/accelerate.c b/MagickCore/accelerate.c index 32fce29a0..cf8b1f172 100644 --- a/MagickCore/accelerate.c +++ b/MagickCore/accelerate.c @@ -537,7 +537,7 @@ static Image *ComputeAddNoiseImage(const Image *image,MagickCLEnv clEnv, } outputReady=EnqueueOpenCLKernel(queue,addNoiseKernel,1,(const size_t *) NULL,gsize, - lsize,image,filteredImage,exception); + lsize,image,filteredImage,MagickFalse,exception); cleanup: @@ -698,7 +698,7 @@ static Image *ComputeBlurImage(const Image* image,MagickCLEnv clEnv, lsize[1]=1; outputReady=EnqueueOpenCLKernel(queue,blurRowKernel,2,(size_t *) NULL,gsize, - lsize,image,filteredImage,exception); + lsize,image,filteredImage,MagickFalse,exception); if (outputReady == MagickFalse) goto cleanup; @@ -733,7 +733,7 @@ static Image *ComputeBlurImage(const Image* image,MagickCLEnv clEnv, lsize[1]=chunkSize; outputReady=EnqueueOpenCLKernel(queue,blurColumnKernel,2,(size_t *) NULL,gsize, - lsize,image,filteredImage,exception); + lsize,image,filteredImage,MagickFalse,exception); cleanup: @@ -857,7 +857,7 @@ static MagickBooleanType ComputeContrastImage(Image *image,MagickCLEnv clEnv, gsize[1]=image->rows; outputReady=EnqueueOpenCLKernel(queue,contrastKernel,2,(const size_t *) NULL, - gsize,(const size_t *) NULL,image,(Image *) NULL,exception); + gsize,(const size_t *) NULL,image,(Image *) NULL,MagickFalse,exception); cleanup: @@ -1587,6 +1587,10 @@ static Image *ComputeConvolveImage(const Image* image,MagickCLEnv clEnv, device = RequestOpenCLDevice(clEnv); + /* Work around an issue on NVIDIA devices */ + if (strcmp("NVIDIA Corporation",device->vendor_name) == 0) + goto cleanup; + image_view=AcquireAuthenticCacheView(image,exception); inputPixels=GetCacheViewAuthenticPixels(image_view,0,0,image->columns,image->rows,exception); if (inputPixels == (const void *) NULL) @@ -2820,7 +2824,8 @@ static MagickBooleanType ComputeFunctionImage(Image *image,MagickCLEnv clEnv, gsize[0]=image->columns; gsize[1]=image->rows; outputReady=EnqueueOpenCLKernel(queue,functionKernel,2,(const size_t *) NULL, - gsize,(const size_t *) NULL,image,(const Image *) NULL,exception); + gsize,(const size_t *) NULL,image,(const Image *) NULL,MagickFalse, + exception); cleanup: @@ -2941,7 +2946,7 @@ static MagickBooleanType ComputeGrayscaleImage(Image *image,MagickCLEnv clEnv, gsize[1]=image->rows; outputReady=EnqueueOpenCLKernel(queue,grayscaleKernel,2, (const size_t *) NULL,gsize,(const size_t *) NULL,image,(Image *) NULL, - exception); + MagickFalse,exception); cleanup: @@ -3199,7 +3204,7 @@ static Image *ComputeLocalContrastImage(const Image *image,MagickCLEnv clEnv, size_t goffset[2]; gsize[0] = 256; - gsize[1] = image->rows / passes; + gsize[1] = (image->rows + passes - 1) / passes; wsize[0] = 256; wsize[1] = 1; goffset[0] = 0; @@ -4101,7 +4106,9 @@ RestoreMSCWarning lsize[0]=workgroupSize; lsize[1]=1; outputReady=EnqueueOpenCLKernel(queue,horizontalKernel,2, - (const size_t *) NULL,gsize,lsize,image,filteredImage,exception); + (const size_t *) NULL,gsize,lsize,image,filteredImage,MagickFalse, + exception); + cleanup: if (horizontalKernel != (cl_kernel) NULL) @@ -4283,7 +4290,7 @@ RestoreMSCWarning lsize[0]=1; lsize[1]=workgroupSize; outputReady=EnqueueOpenCLKernel(queue,verticalKernel,2,(const size_t *) NULL, - gsize,lsize,image,filteredImage,exception); + gsize,lsize,image,filteredImage,MagickFalse,exception); cleanup: @@ -4631,7 +4638,7 @@ static Image* ComputeRotationalBlurImage(const Image *image,MagickCLEnv clEnv, gsize[1]=image->rows; outputReady=EnqueueOpenCLKernel(queue,rotationalBlurKernel,2, (const size_t *) NULL,gsize,(const size_t *) NULL,image,filteredImage, - exception); + MagickFalse,exception); cleanup: @@ -4815,8 +4822,9 @@ static Image *ComputeUnsharpMaskImage(const Image *image,MagickCLEnv clEnv, lsize[0]=chunkSize; lsize[1]=1; outputReady=EnqueueOpenCLKernel(queue,blurRowKernel,2, - (const size_t *) NULL,gsize,lsize,image,filteredImage,exception); - + (const size_t *) NULL,gsize,lsize,image,filteredImage,MagickFalse, + exception); + chunkSize=256; fGain=(float) gain; fThreshold=(float) threshold; @@ -4847,7 +4855,8 @@ static Image *ComputeUnsharpMaskImage(const Image *image,MagickCLEnv clEnv, lsize[0]=1; lsize[1]=chunkSize; outputReady=EnqueueOpenCLKernel(queue,unsharpMaskBlurColumnKernel,2, - (const size_t *) NULL,gsize,lsize,image,filteredImage,exception); + (const size_t *) NULL,gsize,lsize,image,filteredImage,MagickFalse, + exception); cleanup: @@ -4971,7 +4980,7 @@ static Image *ComputeUnsharpMaskImageSingle(const Image *image, lsize[0]=8; lsize[1]=32; outputReady=EnqueueOpenCLKernel(queue,unsharpMaskKernel,2,(const size_t *) NULL, - gsize,lsize,image,filteredImage,exception); + gsize,lsize,image,filteredImage,MagickFalse,exception); cleanup: @@ -5061,15 +5070,22 @@ static Image *ComputeWaveletDenoiseImage(const Image *image,MagickCLEnv clEnv, device; size_t + goffset[2], gsize[2], i, - lsize[2]; + lsize[2], + passes, + x; filteredImage=NULL; denoiseKernel=NULL; + queue=NULL; outputReady=MagickFalse; device=RequestOpenCLDevice(clEnv); + /* Work around an issue on low end Intel devices */ + if (strcmp("Intel(R) HD Graphics",device->name) == 0) + goto cleanup; queue=AcquireOpenCLCommandQueue(device); filteredImage=CloneImage(image,image->columns,image->rows,MagickTrue, exception); @@ -5099,6 +5115,8 @@ static Image *ComputeWaveletDenoiseImage(const Image *image,MagickCLEnv clEnv, if ((max_channels == 4) || (max_channels == 2)) max_channels=max_channels-1; thresh=threshold; + passes=(((1.0f*image->columns)*image->rows)+1999999.0f)/2000000.0f; + passes=(passes < 1) ? 1 : passes; i=0; status =SetOpenCLKernelArg(denoiseKernel,i++,sizeof(cl_mem),(void *)&imageBuffer); @@ -5110,18 +5128,26 @@ static Image *ComputeWaveletDenoiseImage(const Image *image,MagickCLEnv clEnv, status|=SetOpenCLKernelArg(denoiseKernel,i++,sizeof(cl_uint),(void *)&width); status|=SetOpenCLKernelArg(denoiseKernel,i++,sizeof(cl_uint),(void *)&height); if (status != CL_SUCCESS) + { + (void) OpenCLThrowMagickException(device,exception,GetMagickModule(), + ResourceLimitWarning,"SetOpenCLKernelArg failed.","."); + goto cleanup; + } + + for (x = 0; x < passes; ++x) { - (void) OpenCLThrowMagickException(device,exception,GetMagickModule(), - ResourceLimitWarning,"SetOpenCLKernelArg failed.","."); - goto cleanup; - } + gsize[0]=((width+(SIZE-1))/SIZE)*TILESIZE; + gsize[1]=((((height+(SIZE-1))/SIZE)+passes-1)/passes)*4; + lsize[0]=TILESIZE; + lsize[1]=4; + goffset[0]=0; + goffset[1]=x*gsize[1]; - gsize[0]=((width+(SIZE-1))/SIZE)*TILESIZE; - gsize[1]=((height+(SIZE-1))/SIZE)*4; - lsize[0]=TILESIZE; - lsize[1]=4; - outputReady=EnqueueOpenCLKernel(queue,denoiseKernel,2,(const size_t *) NULL, - gsize,lsize,image,filteredImage,exception); + outputReady=EnqueueOpenCLKernel(queue,denoiseKernel,2,goffset,gsize,lsize, + image,filteredImage,MagickTrue,exception); + if (outputReady == MagickFalse) + break; + } cleanup: diff --git a/MagickCore/opencl-private.h b/MagickCore/opencl-private.h index 3b014537c..840f9b2a3 100644 --- a/MagickCore/opencl-private.h +++ b/MagickCore/opencl-private.h @@ -413,7 +413,8 @@ extern MagickPrivate cl_mem extern MagickPrivate MagickBooleanType EnqueueOpenCLKernel(cl_command_queue,cl_kernel,cl_uint,const size_t *, - const size_t *,const size_t *,const Image *,const Image *,ExceptionInfo *), + const size_t *,const size_t *,const Image *,const Image *, + MagickBooleanType,ExceptionInfo *), InitializeOpenCL(MagickCLEnv,ExceptionInfo *), OpenCLThrowMagickException(MagickCLDevice,ExceptionInfo *, const char *,const char *,const size_t,const ExceptionType,const char *, diff --git a/MagickCore/opencl.c b/MagickCore/opencl.c index 7c18bf8b8..21298119d 100644 --- a/MagickCore/opencl.c +++ b/MagickCore/opencl.c @@ -1580,7 +1580,7 @@ static void RegisterCacheEvent(MagickCLCacheInfo info,cl_event event) MagickPrivate MagickBooleanType EnqueueOpenCLKernel(cl_command_queue queue, cl_kernel kernel,cl_uint work_dim,const size_t *offset,const size_t *gsize, const size_t *lsize,const Image *input_image,const Image *output_image, - ExceptionInfo *exception) + MagickBooleanType flush,ExceptionInfo *exception) { CacheInfo *output_info, @@ -1639,6 +1639,8 @@ MagickPrivate MagickBooleanType EnqueueOpenCLKernel(cl_command_queue queue, "clEnqueueNDRangeKernel failed.","'%s'","."); return(MagickFalse); } + if (flush != MagickFalse) + openCL_library->clFlush(queue); if (RecordProfileData(input_info->opencl->device,kernel,event) == MagickFalse) { RegisterCacheEvent(input_info->opencl,event); -- 2.40.0