2 Copyright 1999-2014 ImageMagick Studio LLC, a non-profit organization
3 dedicated to making software imaging solutions freely available.
5 You may not use this file except in compliance with the License.
6 obtain a copy of the License at
8 http://www.imagemagick.org/script/license.php
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
16 MagickCore private methods for accelerated functions.
19 #ifndef _MAGICKCORE_ACCELERATE_PRIVATE_H
20 #define _MAGICKCORE_ACCELERATE_PRIVATE_H
22 #if defined(__cplusplus) || defined(c_plusplus)
27 #if defined(MAGICKCORE_OPENCL_SUPPORT)
29 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
30 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
31 #define OPENCL_ELSE() "\n #""else " " \n"
32 #define OPENCL_ENDIF() "\n #""endif " " \n"
33 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
34 #define STRINGIFY(...) #__VA_ARGS__ "\n"
36 typedef struct _FloatPixelPacket
38 #ifdef MAGICK_PIXEL_RGBA
45 #ifdef MAGICK_PIXEL_BGRA
54 const char* accelerateKernels =
62 GreenChannel = 0x0002,
63 MagentaChannel = 0x0002,
65 YellowChannel = 0x0004,
66 AlphaChannel = 0x0008,
67 OpacityChannel = 0x0008,
68 MatteChannel = 0x0008, /* deprecated */
69 BlackChannel = 0x0020,
70 IndexChannel = 0x0020,
71 CompositeChannels = 0x002F,
72 AllChannels = 0x7ffffff,
74 Special purpose channel types.
76 TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
77 RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
78 GrayChannels = 0x0080,
79 SyncChannels = 0x0100, /* channels should be modified equally */
80 DefaultChannels = ((AllChannels | SyncChannels) &~ OpacityChannel)
84 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
87 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
89 return((CLQuantum) value);
93 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
96 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
98 return((CLQuantum) (257.0f*value));
102 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
105 inline CLQuantum ScaleCharToQuantum(const unsigned char value)
107 return((Quantum) (16843009.0*value));
115 inline int ClampToCanvas(const int offset,const int range)
117 return clamp(offset, (int)0, range-1);
122 inline int ClampToCanvasWithHalo(const int offset,const int range, const int edge, const int section)
124 return clamp(offset, section?(int)(0-edge):(int)0, section?(range-1):(range-1+edge));
129 inline CLQuantum ClampToQuantum(const float value)
131 return (CLQuantum) (clamp(value, 0.0f, (float) QuantumRange) + 0.5f);
136 inline uint ScaleQuantumToMap(CLQuantum value)
138 if (value >= (CLQuantum) MaxMap)
139 return ((uint)MaxMap);
141 return ((uint)value);
146 inline float PerceptibleReciprocal(const float x)
148 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
149 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
153 OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
157 inline CLQuantum getBlue(CLPixelType p) { return p.x; }
158 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
159 inline float getBlueF4(float4 p) { return p.x; }
160 inline void setBlueF4(float4* p, float value) { (*p).x = value; }
162 inline CLQuantum getGreen(CLPixelType p) { return p.y; }
163 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
164 inline float getGreenF4(float4 p) { return p.y; }
165 inline void setGreenF4(float4* p, float value) { (*p).y = value; }
167 inline CLQuantum getRed(CLPixelType p) { return p.z; }
168 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
169 inline float getRedF4(float4 p) { return p.z; }
170 inline void setRedF4(float4* p, float value) { (*p).z = value; }
172 inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
173 inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
174 inline float getOpacityF4(float4 p) { return p.w; }
175 inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
177 inline float GetPixelIntensity(int colorspace, CLPixelType p)
179 // this is for default intensity and sRGB (not RGB) color space
180 float red = getRed(p);
181 float green = getGreen(p);
182 float blue = getBlue(p);
185 return 0.212656*red+0.715158*green+0.072186*blue;
196 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
197 const unsigned int imageWidth, const unsigned int imageHeight,
198 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
199 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
202 blockID.x = get_group_id(0);
203 blockID.y = get_group_id(1);
205 // image area processed by this workgroup
207 imageAreaOrg.x = blockID.x * get_local_size(0);
208 imageAreaOrg.y = blockID.y * get_local_size(1);
211 midFilterDimen.x = (filterWidth-1)/2;
212 midFilterDimen.y = (filterHeight-1)/2;
214 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
216 // dimension of the local cache
217 int2 cachedAreaDimen;
218 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
219 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
221 // cache the pixels accessed by this workgroup in local memory
222 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
223 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
224 int groupSize = get_local_size(0) * get_local_size(1);
225 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
227 int2 cachedAreaIndex;
228 cachedAreaIndex.x = i % cachedAreaDimen.x;
229 cachedAreaIndex.y = i / cachedAreaDimen.x;
231 int2 imagePixelIndex;
232 imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
234 // only support EdgeVirtualPixelMethod through ClampToCanvas
235 // TODO: implement other virtual pixel method
236 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
237 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
239 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
243 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
244 filterCache[i] = filter[i];
246 barrier(CLK_LOCAL_MEM_FENCE);
250 imageIndex.x = imageAreaOrg.x + get_local_id(0);
251 imageIndex.y = imageAreaOrg.y + get_local_id(1);
253 // if out-of-range, stops here and quit
254 if (imageIndex.x >= imageWidth
255 || imageIndex.y >= imageHeight) {
260 float4 sum = (float4)0.0f;
262 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
263 int cacheIndexY = get_local_id(1);
264 for (int j = 0; j < filterHeight; j++) {
265 int cacheIndexX = get_local_id(0);
266 for (int i = 0; i < filterWidth; i++) {
267 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
268 float f = filterCache[filterIndex];
283 int cacheIndexY = get_local_id(1);
284 for (int j = 0; j < filterHeight; j++) {
285 int cacheIndexX = get_local_id(0);
286 for (int i = 0; i < filterWidth; i++) {
288 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
289 float alpha = QuantumScale*(QuantumRange-p.w);
290 float f = filterCache[filterIndex];
304 gamma = PerceptibleReciprocal(gamma);
305 sum.xyz = gamma*sum.xyz;
307 CLPixelType outputPixel;
308 outputPixel.x = ClampToQuantum(sum.x);
309 outputPixel.y = ClampToQuantum(sum.y);
310 outputPixel.z = ClampToQuantum(sum.z);
311 outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
313 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
319 void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
320 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
321 const uint matte, const ChannelType channel) {
324 imageIndex.x = get_global_id(0);
325 imageIndex.y = get_global_id(1);
327 unsigned int imageWidth = get_global_size(0);
328 unsigned int imageHeight = get_global_size(1);
330 if (imageIndex.x >= imageWidth
331 || imageIndex.y >= imageHeight)
335 midFilterDimen.x = (filterWidth-1)/2;
336 midFilterDimen.y = (filterHeight-1)/2;
339 float4 sum = (float4)0.0f;
341 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
342 for (int j = 0; j < filterHeight; j++) {
343 int2 inputPixelIndex;
344 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
345 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
346 for (int i = 0; i < filterWidth; i++) {
347 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
348 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
350 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
351 float f = filter[filterIndex];
366 for (int j = 0; j < filterHeight; j++) {
367 int2 inputPixelIndex;
368 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
369 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
370 for (int i = 0; i < filterWidth; i++) {
371 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
372 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
374 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
375 float alpha = QuantumScale*(QuantumRange-p.w);
376 float f = filter[filterIndex];
390 gamma = PerceptibleReciprocal(gamma);
391 sum.xyz = gamma*sum.xyz;
394 CLPixelType outputPixel;
395 outputPixel.x = ClampToQuantum(sum.x);
396 outputPixel.y = ClampToQuantum(sum.y);
397 outputPixel.z = ClampToQuantum(sum.z);
398 outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
400 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
418 apply FunctionImageChannel(braightness-contrast)
420 CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
421 const unsigned int number_parameters,
422 __constant float *parameters)
424 float4 result = (float4) 0.0f;
427 case PolynomialFunction:
429 for (unsigned int i=0; i < number_parameters; i++)
430 result = result*QuantumScale*convert_float4(pixel) + parameters[i];
431 result *= QuantumRange;
434 case SinusoidFunction:
436 float freq,phase,ampl,bias;
437 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
438 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
439 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
440 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
441 result = QuantumRange*(ampl*sin(2.0f*MagickPI*
442 (freq*QuantumScale*convert_float4(pixel) + phase/360.0f)) + bias);
447 float width,range,center,bias;
448 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
449 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
450 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
451 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
452 result = 2.0f/width*(QuantumScale*convert_float4(pixel) - center);
453 result = range/MagickPI*asin(result)+bias;
454 result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
455 result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
456 result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
457 result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
458 result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
459 result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
460 result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
461 result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
463 result *= QuantumRange;
468 float slope,range,center,bias;
469 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
470 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
471 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
472 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
473 result = MagickPI*slope*(QuantumScale*convert_float4(pixel)-center);
474 result = QuantumRange*(range/MagickPI*atan(result) + bias);
477 case UndefinedFunction:
480 return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
481 ClampToQuantum(result.z), ClampToQuantum(result.w));
487 Improve brightness / contrast of the image
488 channel : define which channel is improved
489 function : the function called to enchance the brightness contrast
490 number_parameters : numbers of parameters
491 parameters : the parameter
493 __kernel void FunctionImage(__global CLPixelType *im,
494 const ChannelType channel, const MagickFunction function,
495 const unsigned int number_parameters, __constant float *parameters)
497 const int x = get_global_id(0);
498 const int y = get_global_id(1);
499 const int columns = get_global_size(0);
500 const int c = x + y * columns;
501 im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
508 __kernel void Equalize(__global CLPixelType * restrict im,
509 const ChannelType channel,
510 __global CLPixelType * restrict equalize_map,
511 const float4 white, const float4 black)
513 const int x = get_global_id(0);
514 const int y = get_global_id(1);
515 const int columns = get_global_size(0);
516 const int c = x + y * columns;
519 CLPixelType oValue, eValue;
520 CLQuantum red, green, blue, opacity;
525 if ((channel & SyncChannels) != 0)
527 if (getRedF4(white) != getRedF4(black))
529 ePos = ScaleQuantumToMap(getRed(oValue));
530 eValue = equalize_map[ePos];
531 red = getRed(eValue);
532 ePos = ScaleQuantumToMap(getGreen(oValue));
533 eValue = equalize_map[ePos];
534 green = getRed(eValue);
535 ePos = ScaleQuantumToMap(getBlue(oValue));
536 eValue = equalize_map[ePos];
537 blue = getRed(eValue);
538 ePos = ScaleQuantumToMap(getOpacity(oValue));
539 eValue = equalize_map[ePos];
540 opacity = getRed(eValue);
543 im[c]=(CLPixelType)(blue, green, red, opacity);
548 // for equalizing, we always need all channels?
549 // otherwise something more
557 __kernel void Histogram(__global CLPixelType * restrict im,
558 const ChannelType channel, const int colorspace,
559 __global uint4 * restrict histogram)
561 const int x = get_global_id(0);
562 const int y = get_global_id(1);
563 const int columns = get_global_size(0);
564 const int c = x + y * columns;
565 if ((channel & SyncChannels) != 0)
567 float intensity = GetPixelIntensity(colorspace,im[c]);
568 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
569 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
573 // for equalizing, we always need all channels?
574 // otherwise something more
581 Reduce image noise and reduce detail levels by row
582 im: input pixels filtered_in filtered_im: output pixels
583 filter : convolve kernel width: convolve kernel size
584 channel : define which channel is blured
585 is_RGBA_BGRA : define the input is RGBA or BGRA
587 __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
588 const ChannelType channel, __constant float *filter,
589 const unsigned int width,
590 const unsigned int imageColumns, const unsigned int imageRows,
591 __local CLPixelType *temp)
593 const int x = get_global_id(0);
594 const int y = get_global_id(1);
596 const int columns = imageColumns;
598 const unsigned int radius = (width-1)/2;
599 const int wsize = get_local_size(0);
600 const unsigned int loadSize = wsize+width;
602 //load chunk only for now
603 //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
604 //wait_group_events(1,&e);
606 //parallel load and clamp
609 for (int i=0; i < loadSize; i=i+wsize)
611 int currentX = x + wsize*(count++);
613 int localId = get_local_id(0);
615 if ((localId+i) > loadSize)
618 temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
620 if (y==0 && get_group_id(0) == 0)
622 printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
628 const int groupX=get_local_size(0)*get_group_id(0);
629 const int groupY=get_local_size(1)*get_group_id(1);
631 //parallel load and clamp
632 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
634 //int cx = ClampToCanvas(groupX+i, columns);
635 temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
637 if (0 && y==0 && get_group_id(1) == 0)
639 printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
644 barrier(CLK_LOCAL_MEM_FENCE);
646 // only do the work if this is not a patched item
647 if (get_global_id(0) < columns)
650 float4 result = (float4) 0;
654 \n #ifndef UFACTOR \n
655 \n #define UFACTOR 8 \n
658 for ( ; i+UFACTOR < width; )
660 \n #pragma unroll UFACTOR\n
661 for (int j=0; j < UFACTOR; j++, i++)
663 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
667 for ( ; i < width; i++)
669 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
672 result.x = ClampToQuantum(result.x);
673 result.y = ClampToQuantum(result.y);
674 result.z = ClampToQuantum(result.z);
675 result.w = ClampToQuantum(result.w);
677 // write back to global
678 filtered_im[y*columns+x] = result;
685 Reduce image noise and reduce detail levels by row
686 im: input pixels filtered_in filtered_im: output pixels
687 filter : convolve kernel width: convolve kernel size
688 channel : define which channel is blured
689 is_RGBA_BGRA : define the input is RGBA or BGRA
691 __kernel void BlurRowSection(__global CLPixelType *im, __global float4 *filtered_im,
692 const ChannelType channel, __constant float *filter,
693 const unsigned int width,
694 const unsigned int imageColumns, const unsigned int imageRows,
695 __local CLPixelType *temp,
696 const unsigned int offsetRows, const unsigned int section)
698 const int x = get_global_id(0);
699 const int y = get_global_id(1);
701 const int columns = imageColumns;
703 const unsigned int radius = (width-1)/2;
704 const int wsize = get_local_size(0);
705 const unsigned int loadSize = wsize+width;
708 const int groupX=get_local_size(0)*get_group_id(0);
709 const int groupY=get_local_size(1)*get_group_id(1);
711 //offset the input data, assuming section is 0, 1
712 im += imageColumns * (offsetRows - radius * section);
714 //parallel load and clamp
715 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
717 //int cx = ClampToCanvas(groupX+i, columns);
718 temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
720 if (0 && y==0 && get_group_id(1) == 0)
722 printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
727 barrier(CLK_LOCAL_MEM_FENCE);
729 // only do the work if this is not a patched item
730 if (get_global_id(0) < columns)
733 float4 result = (float4) 0;
737 \n #ifndef UFACTOR \n
738 \n #define UFACTOR 8 \n
741 for ( ; i+UFACTOR < width; )
743 \n #pragma unroll UFACTOR\n
744 for (int j=0; j < UFACTOR; j++, i++)
746 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
750 for ( ; i < width; i++)
752 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
755 result.x = ClampToQuantum(result.x);
756 result.y = ClampToQuantum(result.y);
757 result.z = ClampToQuantum(result.z);
758 result.w = ClampToQuantum(result.w);
760 // write back to global
761 filtered_im[y*columns+x] = result;
769 Reduce image noise and reduce detail levels by line
770 im: input pixels filtered_in filtered_im: output pixels
771 filter : convolve kernel width: convolve kernel size
772 channel : define which channel is blured\
773 is_RGBA_BGRA : define the input is RGBA or BGRA
775 __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
776 const ChannelType channel, __constant float *filter,
777 const unsigned int width,
778 const unsigned int imageColumns, const unsigned int imageRows,
779 __local float4 *temp)
781 const int x = get_global_id(0);
782 const int y = get_global_id(1);
784 //const int columns = get_global_size(0);
785 //const int rows = get_global_size(1);
786 const int columns = imageColumns;
787 const int rows = imageRows;
789 unsigned int radius = (width-1)/2;
790 const int wsize = get_local_size(1);
791 const unsigned int loadSize = wsize+width;
794 const int groupX=get_local_size(0)*get_group_id(0);
795 const int groupY=get_local_size(1)*get_group_id(1);
796 //notice that get_local_size(0) is 1, so
797 //groupX=get_group_id(0);
799 //parallel load and clamp
800 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
802 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
806 barrier(CLK_LOCAL_MEM_FENCE);
808 // only do the work if this is not a patched item
809 if (get_global_id(1) < rows)
812 float4 result = (float4) 0;
816 \n #ifndef UFACTOR \n
817 \n #define UFACTOR 8 \n
820 for ( ; i+UFACTOR < width; )
822 \n #pragma unroll UFACTOR \n
823 for (int j=0; j < UFACTOR; j++, i++)
825 result+=filter[i]*temp[i+get_local_id(1)];
829 for ( ; i < width; i++)
831 result+=filter[i]*temp[i+get_local_id(1)];
834 result.x = ClampToQuantum(result.x);
835 result.y = ClampToQuantum(result.y);
836 result.z = ClampToQuantum(result.z);
837 result.w = ClampToQuantum(result.w);
839 // write back to global
840 filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
849 Reduce image noise and reduce detail levels by line
850 im: input pixels filtered_in filtered_im: output pixels
851 filter : convolve kernel width: convolve kernel size
852 channel : define which channel is blured\
853 is_RGBA_BGRA : define the input is RGBA or BGRA
855 __kernel void BlurColumnSection(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
856 const ChannelType channel, __constant float *filter,
857 const unsigned int width,
858 const unsigned int imageColumns, const unsigned int imageRows,
859 __local float4 *temp,
860 const unsigned int offsetRows, const unsigned int section)
862 const int x = get_global_id(0);
863 const int y = get_global_id(1);
865 //const int columns = get_global_size(0);
866 //const int rows = get_global_size(1);
867 const int columns = imageColumns;
868 const int rows = imageRows;
870 unsigned int radius = (width-1)/2;
871 const int wsize = get_local_size(1);
872 const unsigned int loadSize = wsize+width;
875 const int groupX=get_local_size(0)*get_group_id(0);
876 const int groupY=get_local_size(1)*get_group_id(1);
877 //notice that get_local_size(0) is 1, so
878 //groupX=get_group_id(0);
880 // offset the input data
881 blurRowData += imageColumns * radius * section;
883 //parallel load and clamp
884 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
886 int pos = ClampToCanvasWithHalo(i+groupY-radius, rows, radius, section) * columns + groupX;
887 temp[i] = *(blurRowData+pos);
891 barrier(CLK_LOCAL_MEM_FENCE);
893 // only do the work if this is not a patched item
894 if (get_global_id(1) < rows)
897 float4 result = (float4) 0;
901 \n #ifndef UFACTOR \n
902 \n #define UFACTOR 8 \n
905 for ( ; i+UFACTOR < width; )
907 \n #pragma unroll UFACTOR \n
908 for (int j=0; j < UFACTOR; j++, i++)
910 result+=filter[i]*temp[i+get_local_id(1)];
913 for ( ; i < width; i++)
915 result+=filter[i]*temp[i+get_local_id(1)];
918 result.x = ClampToQuantum(result.x);
919 result.y = ClampToQuantum(result.y);
920 result.z = ClampToQuantum(result.z);
921 result.w = ClampToQuantum(result.w);
923 // offset the output data
924 filtered_im += imageColumns * offsetRows;
926 // write back to global
927 filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
935 __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
936 const __global float4 *blurRowData, __global CLPixelType *filtered_im,
937 const unsigned int imageColumns, const unsigned int imageRows,
938 __local float4* cachedData, __local float* cachedFilter,
939 const ChannelType channel, const __global float *filter, const unsigned int width,
940 const float gain, const float threshold)
942 const unsigned int radius = (width-1)/2;
944 // cache the pixel shared by the workgroup
945 const int groupX = get_group_id(0);
946 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
947 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
950 && groupStopY < imageRows) {
951 event_t e = async_work_group_strided_copy(cachedData
952 ,blurRowData+groupStartY*imageColumns+groupX
953 ,groupStopY-groupStartY,imageColumns,0);
954 wait_group_events(1,&e);
957 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
958 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
960 barrier(CLK_LOCAL_MEM_FENCE);
962 // cache the filter as well
963 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
964 wait_group_events(1,&e);
966 // only do the work if this is not a patched item
967 //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
968 const int cy = get_global_id(1);
970 if (cy < imageRows) {
971 float4 blurredPixel = (float4) 0.0f;
975 \n #ifndef UFACTOR \n
976 \n #define UFACTOR 8 \n
979 for ( ; i+UFACTOR < width; )
981 \n #pragma unroll UFACTOR \n
982 for (int j=0; j < UFACTOR; j++, i++)
984 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
988 for ( ; i < width; i++)
990 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
993 blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
994 ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
996 float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
997 float4 outputPixel = inputImagePixel - blurredPixel;
999 float quantumThreshold = QuantumRange*threshold;
1001 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
1002 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
1005 filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
1006 ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
1011 __kernel void UnsharpMaskBlurColumnSection(const __global CLPixelType* inputImage,
1012 const __global float4 *blurRowData, __global CLPixelType *filtered_im,
1013 const unsigned int imageColumns, const unsigned int imageRows,
1014 __local float4* cachedData, __local float* cachedFilter,
1015 const ChannelType channel, const __global float *filter, const unsigned int width,
1016 const float gain, const float threshold,
1017 const unsigned int offsetRows, const unsigned int section)
1019 const unsigned int radius = (width-1)/2;
1021 // cache the pixel shared by the workgroup
1022 const int groupX = get_group_id(0);
1023 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
1024 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
1026 // offset the input data
1027 blurRowData += imageColumns * radius * section;
1029 if (groupStartY >= 0
1030 && groupStopY < imageRows) {
1031 event_t e = async_work_group_strided_copy(cachedData
1032 ,blurRowData+groupStartY*imageColumns+groupX
1033 ,groupStopY-groupStartY,imageColumns,0);
1034 wait_group_events(1,&e);
1037 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
1038 int pos = ClampToCanvasWithHalo(groupStartY+i,imageRows, radius, section)*imageColumns+ groupX;
1039 cachedData[i] = *(blurRowData + pos);
1041 barrier(CLK_LOCAL_MEM_FENCE);
1043 // cache the filter as well
1044 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
1045 wait_group_events(1,&e);
1047 // only do the work if this is not a patched item
1048 //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
1049 const int cy = get_global_id(1);
1051 if (cy < imageRows) {
1052 float4 blurredPixel = (float4) 0.0f;
1056 \n #ifndef UFACTOR \n
1057 \n #define UFACTOR 8 \n
1060 for ( ; i+UFACTOR < width; )
1062 \n #pragma unroll UFACTOR \n
1063 for (int j=0; j < UFACTOR; j++, i++)
1065 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
1069 for ( ; i < width; i++)
1071 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
1074 blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
1075 ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
1077 // offset the output data
1078 inputImage += imageColumns * offsetRows;
1079 filtered_im += imageColumns * offsetRows;
1081 float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
1082 float4 outputPixel = inputImagePixel - blurredPixel;
1084 float quantumThreshold = QuantumRange*threshold;
1086 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
1087 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
1090 filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
1091 ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
1102 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1103 , const unsigned int imageWidth, const unsigned int imageHeight
1104 , const int2 offset, const int polarity, const int matte) {
1106 int x = get_global_id(0);
1107 int y = get_global_id(1);
1109 CLPixelType v = inputImage[y*imageWidth+x];
1112 neighbor.y = y + offset.y;
1113 neighbor.x = x + offset.x;
1115 int2 clampedNeighbor;
1116 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1117 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1119 CLPixelType r = (clampedNeighbor.x == neighbor.x
1120 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1136 \n #pragma unroll 4\n
1137 for (unsigned int i = 0; i < 4; i++) {
1138 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1142 \n #pragma unroll 4\n
1143 for (unsigned int i = 0; i < 4; i++) {
1144 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1149 v.x = (CLQuantum)sv[0];
1150 v.y = (CLQuantum)sv[1];
1151 v.z = (CLQuantum)sv[2];
1154 v.w = (CLQuantum)sv[3];
1156 outputImage[y*imageWidth+x] = v;
1167 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1168 , const unsigned int imageWidth, const unsigned int imageHeight
1169 , const int2 offset, const int polarity, const int matte) {
1171 int x = get_global_id(0);
1172 int y = get_global_id(1);
1174 CLPixelType v = inputImage[y*imageWidth+x];
1176 int2 neighbor, clampedNeighbor;
1178 neighbor.y = y + offset.y;
1179 neighbor.x = x + offset.x;
1180 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1181 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1183 CLPixelType r = (clampedNeighbor.x == neighbor.x
1184 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1188 neighbor.y = y - offset.y;
1189 neighbor.x = x - offset.x;
1190 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1191 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1193 CLPixelType s = (clampedNeighbor.x == neighbor.x
1194 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1217 \n #pragma unroll 4\n
1218 for (unsigned int i = 0; i < 4; i++) {
1219 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1221 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1222 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1223 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1227 \n #pragma unroll 4\n
1228 for (unsigned int i = 0; i < 4; i++) {
1229 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1231 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1232 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1236 v.x = (CLQuantum)sv[0];
1237 v.y = (CLQuantum)sv[1];
1238 v.z = (CLQuantum)sv[2];
1241 v.w = (CLQuantum)sv[3];
1243 outputImage[y*imageWidth+x] = v;
1252 __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
1254 const unsigned int channel, const unsigned int matte,
1255 const float2 blurCenter,
1256 __constant float *cos_theta, __constant float *sin_theta,
1257 const unsigned int cossin_theta_size)
1259 const int x = get_global_id(0);
1260 const int y = get_global_id(1);
1261 const int columns = get_global_size(0);
1262 const int rows = get_global_size(1);
1263 unsigned int step = 1;
1264 float center_x = (float) x - blurCenter.x;
1265 float center_y = (float) y - blurCenter.y;
1266 float radius = hypot(center_x, center_y);
1268 //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
1269 float blur_radius = hypot(blurCenter.x, blurCenter.y);
1271 if (radius > MagickEpsilon)
1273 step = (unsigned int) (blur_radius / radius);
1276 if (step >= cossin_theta_size)
1277 step = cossin_theta_size-1;
1281 result.x = (float)bias.x;
1282 result.y = (float)bias.y;
1283 result.z = (float)bias.z;
1284 result.w = (float)bias.w;
1285 float normalize = 0.0f;
1287 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1288 for (unsigned int i=0; i<cossin_theta_size; i+=step)
1290 result += convert_float4(im[
1291 ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
1292 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
1295 normalize = PerceptibleReciprocal(normalize);
1296 result = result * normalize;
1300 for (unsigned int i=0; i<cossin_theta_size; i+=step)
1302 float4 p = convert_float4(im[
1303 ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
1304 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
1306 float alpha = (float)(QuantumScale*(QuantumRange-p.w));
1307 result.x += alpha * p.x;
1308 result.y += alpha * p.y;
1309 result.z += alpha * p.z;
1314 gamma = PerceptibleReciprocal(gamma);
1315 normalize = PerceptibleReciprocal(normalize);
1316 result.x = gamma*result.x;
1317 result.y = gamma*result.y;
1318 result.z = gamma*result.z;
1319 result.w = normalize*result.w;
1321 filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1322 ClampToQuantum(result.z), ClampToQuantum(result.w));
1329 UndefinedColorspace,
1330 RGBColorspace, /* Linear RGB colorspace */
1331 GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */
1332 TransparentColorspace,
1341 CMYKColorspace, /* negared linear RGB with black separated */
1342 sRGBColorspace, /* Default: non-lienar sRGB colorspace */
1346 Rec601LumaColorspace,
1347 Rec601YCbCrColorspace,
1348 Rec709LumaColorspace,
1349 Rec709YCbCrColorspace,
1351 CMYColorspace, /* negated linear RGB colorspace */
1354 LCHColorspace, /* alias for LCHuv */
1356 LCHabColorspace, /* Cylindrical (Polar) Lab */
1357 LCHuvColorspace, /* Cylindrical (Polar) Luv */
1360 HSVColorspace, /* alias for HSB */
1369 inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1370 float3 HueSaturationBrightness;
1371 HueSaturationBrightness.x = 0.0f; // Hue
1372 HueSaturationBrightness.y = 0.0f; // Saturation
1373 HueSaturationBrightness.z = 0.0f; // Brightness
1375 float r=(float) getRed(pixel);
1376 float g=(float) getGreen(pixel);
1377 float b=(float) getBlue(pixel);
1379 float tmin=min(min(r,g),b);
1380 float tmax=max(max(r,g),b);
1383 float delta=tmax-tmin;
1384 HueSaturationBrightness.y=delta/tmax;
1385 HueSaturationBrightness.z=QuantumScale*tmax;
1387 if (delta != 0.0f) {
1388 HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1389 HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1390 HueSaturationBrightness.x/=6.0f;
1391 HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1394 return HueSaturationBrightness;
1397 inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1399 float hue = HueSaturationBrightness.x;
1400 float brightness = HueSaturationBrightness.z;
1401 float saturation = HueSaturationBrightness.y;
1405 if (saturation == 0.0f) {
1406 setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1407 setGreen(&rgb,getRed(rgb));
1408 setBlue(&rgb,getRed(rgb));
1412 float h=6.0f*(hue-floor(hue));
1414 float p=brightness*(1.0f-saturation);
1415 float q=brightness*(1.0f-saturation*f);
1416 float t=brightness*(1.0f-(saturation*(1.0f-f)));
1418 float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1419 float clamped_t = ClampToQuantum(QuantumRange*t);
1420 float clamped_p = ClampToQuantum(QuantumRange*p);
1421 float clamped_q = ClampToQuantum(QuantumRange*q);
1423 setRed(&rgb, (ih == 1)?clamped_q:
1424 (ih == 2 || ih == 3)?clamped_p:
1425 (ih == 4)?clamped_t:
1428 setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1429 (ih == 3)?clamped_q:
1430 (ih == 4 || ih == 5)?clamped_p:
1433 setBlue(&rgb, (ih == 2)?clamped_t:
1434 (ih == 3 || ih == 4)?clampedBrightness:
1435 (ih == 5)?clamped_q:
1441 __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1444 const int sign = sharpen!=0?1:-1;
1445 const int x = get_global_id(0);
1446 const int y = get_global_id(1);
1447 const int columns = get_global_size(0);
1448 const int c = x + y * columns;
1450 CLPixelType pixel = im[c];
1451 float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1452 float brightness = HueSaturationBrightness.z;
1453 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1454 brightness = clamp(brightness,0.0f,1.0f);
1455 HueSaturationBrightness.z = brightness;
1457 CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1458 filteredPixel.w = pixel.w;
1459 im[c] = filteredPixel;
1467 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1468 float *hue, float *saturation, float *lightness)
1476 Convert RGB to HSL colorspace.
1478 tmax=max(QuantumScale*red,max(QuantumScale*green, QuantumScale*blue));
1479 tmin=min(QuantumScale*red,min(QuantumScale*green, QuantumScale*blue));
1483 *lightness=(tmax+tmin)/2.0;
1491 if (tmax == (QuantumScale*red))
1493 *hue=(QuantumScale*green-QuantumScale*blue)/c;
1494 if ((QuantumScale*green) < (QuantumScale*blue))
1498 if (tmax == (QuantumScale*green))
1499 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1501 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1504 if (*lightness <= 0.5)
1505 *saturation=c/(2.0*(*lightness));
1507 *saturation=c/(2.0-2.0*(*lightness));
1510 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
1511 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1523 Convert HSL to RGB colorspace.
1526 if (lightness <= 0.5)
1527 c=2.0*lightness*saturation;
1529 c=(2.0-2.0*lightness)*saturation;
1530 tmin=lightness-0.5*c;
1531 h-=360.0*floor(h/360.0);
1533 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
1534 switch ((int) floor(h))
1585 *red=ClampToQuantum(QuantumRange*r);
1586 *green=ClampToQuantum(QuantumRange*g);
1587 *blue=ClampToQuantum(QuantumRange*b);
1590 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
1591 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1599 Increase or decrease color lightness, saturation, or hue.
1601 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
1602 hue+=0.5*(0.01*percent_hue-1.0);
1607 saturation*=0.01*percent_saturation;
1608 lightness*=0.01*percent_lightness;
1609 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
1612 __kernel void Modulate(__global CLPixelType *im,
1613 const float percent_brightness,
1614 const float percent_hue,
1615 const float percent_saturation,
1616 const int colorspace)
1619 const int x = get_global_id(0);
1620 const int y = get_global_id(1);
1621 const int columns = get_global_size(0);
1622 const int c = x + y * columns;
1624 CLPixelType pixel = im[c];
1632 green=getGreen(pixel);
1633 blue=getBlue(pixel);
1640 ModulateHSL(percent_hue, percent_saturation, percent_brightness,
1641 &red, &green, &blue);
1646 CLPixelType filteredPixel;
1648 setRed(&filteredPixel, red);
1649 setGreen(&filteredPixel, green);
1650 setBlue(&filteredPixel, blue);
1651 filteredPixel.w = pixel.w;
1653 im[c] = filteredPixel;
1658 // Based on Box from resize.c
1659 float BoxResizeFilter(const float x)
1666 // Based on CubicBC from resize.c
1667 float CubicBC(const float x,const __global float* resizeFilterCoefficients)
1670 Cubic Filters using B,C determined values:
1671 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
1672 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
1673 Spline B = 1 C = 0 B-Spline Gaussian approximation
1674 Hermite B = 0 C = 0 B-Spline interpolator
1676 See paper by Mitchell and Netravali, Reconstruction Filters in Computer
1677 Graphics Computer Graphics, Volume 22, Number 4, August 1988
1678 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
1681 Coefficents are determined from B,C values:
1682 P0 = ( 6 - 2*B )/6 = coeff[0]
1684 P2 = (-18 +12*B + 6*C )/6 = coeff[1]
1685 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
1686 Q0 = ( 8*B +24*C )/6 = coeff[3]
1687 Q1 = ( -12*B -48*C )/6 = coeff[4]
1688 Q2 = ( 6*B +30*C )/6 = coeff[5]
1689 Q3 = ( - 1*B - 6*C )/6 = coeff[6]
1691 which are used to define the filter:
1693 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
1694 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
1696 which ensures function is continuous in value and derivative (slope).
1699 return(resizeFilterCoefficients[0]+x*(x*
1700 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
1702 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
1703 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
1709 float Sinc(const float x)
1713 const float alpha=(float) (MagickPI*x);
1714 return sinpi(x)/alpha;
1721 float Triangle(const float x)
1724 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
1725 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
1728 return ((x<1.0f)?(1.0f-x):0.0f);
1734 float Hanning(const float x)
1737 Cosine window function:
1740 const float cosine=cos((MagickPI*x));
1741 return(0.5f+0.5f*cosine);
1746 float Hamming(const float x)
1749 Offset cosine window function:
1750 .54 + .46 cos(pi x).
1752 const float cosine=cos((MagickPI*x));
1753 return(0.54f+0.46f*cosine);
1758 float Blackman(const float x)
1761 Blackman: 2nd order cosine windowing function:
1762 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
1764 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
1767 const float cosine=cos((MagickPI*x));
1768 return(0.34f+cosine*(0.5f+cosine*0.16f));
1775 BoxWeightingFunction = 0,
1776 TriangleWeightingFunction,
1777 CubicBCWeightingFunction,
1778 HanningWeightingFunction,
1779 HammingWeightingFunction,
1780 BlackmanWeightingFunction,
1781 GaussianWeightingFunction,
1782 QuadraticWeightingFunction,
1783 JincWeightingFunction,
1784 SincWeightingFunction,
1785 SincFastWeightingFunction,
1786 KaiserWeightingFunction,
1787 WelshWeightingFunction,
1788 BohmanWeightingFunction,
1789 LagrangeWeightingFunction,
1790 CosineWeightingFunction,
1791 } ResizeWeightingFunctionType;
1795 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
1799 /* Call Sinc even for SincFast to get better precision on GPU
1800 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
1801 case SincWeightingFunction:
1802 case SincFastWeightingFunction:
1804 case CubicBCWeightingFunction:
1805 return CubicBC(x,filterCoefficients);
1806 case BoxWeightingFunction:
1807 return BoxResizeFilter(x);
1808 case TriangleWeightingFunction:
1810 case HanningWeightingFunction:
1812 case HammingWeightingFunction:
1814 case BlackmanWeightingFunction:
1825 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
1826 , const ResizeWeightingFunctionType resizeWindowType
1827 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
1830 float xBlur = fabs(x/resizeFilterBlur);
1831 if (resizeWindowSupport < MagickEpsilon
1832 || resizeWindowType == BoxWeightingFunction)
1838 scale = resizeFilterScale;
1839 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
1841 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
1848 const char* accelerateKernels2 =
1852 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
1853 return (numWorkItems/pixelPerWorkgroup);
1856 // returns the index of the pixel for the current workitem to compute.
1857 // returns -1 if this workitem doesn't need to participate in any computation
1858 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
1859 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
1860 int pixelIndex = itemID/numWorkItemsPerPixel;
1861 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
1868 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
1869 void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
1870 , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
1871 , const int resizeFilterType, const int resizeWindowType
1872 , const __global float* resizeFilterCubicCoefficients
1873 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
1874 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
1875 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
1878 // calculate the range of resized image pixels computed by this workgroup
1879 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
1880 const unsigned int stopX = min(startX + pixelPerWorkgroup,filteredColumns);
1881 const unsigned int actualNumPixelToCompute = stopX - startX;
1883 // calculate the range of input image pixels to cache
1884 float scale = max(1.0/xFactor+MagickEpsilon ,1.0f);
1885 const float support = max(scale*resizeFilterSupport,0.5f);
1886 scale = PerceptibleReciprocal(scale);
1888 const int cacheRangeStartX = max((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
1889 const int cacheRangeEndX = min((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
1891 // cache the input pixels into local memory
1892 const unsigned int y = get_global_id(1);
1893 event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
1894 wait_group_events(1,&e);
1896 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
1897 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
1900 const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
1901 const unsigned int chunkStopX = min(chunkStartX + pixelChunkSize, stopX);
1902 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
1904 // determine which resized pixel computed by this workitem
1905 const unsigned int itemID = get_local_id(0);
1906 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
1908 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
1910 float4 filteredPixel = (float4)0.0f;
1911 float density = 0.0f;
1913 // -1 means this workitem doesn't participate in the computation
1914 if (pixelIndex != -1) {
1916 // x coordinated of the resized pixel computed by this workitem
1917 const int x = chunkStartX + pixelIndex;
1919 // calculate how many steps required for this pixel
1920 const float bisect = (x+0.5)/xFactor+MagickEpsilon;
1921 const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
1922 const unsigned int stop = (unsigned int)min(bisect+support+0.5f,(float)inputColumns);
1923 const unsigned int n = stop - start;
1925 // calculate how many steps this workitem will contribute
1926 unsigned int numStepsPerWorkItem = n / numItems;
1927 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
1929 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
1930 if (startStep < n) {
1931 const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
1933 unsigned int cacheIndex = start+startStep-cacheRangeStartX;
1936 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
1937 float4 cp = convert_float4(inputImageCache[cacheIndex]);
1939 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
1940 , (ResizeWeightingFunctionType)resizeWindowType
1941 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
1943 filteredPixel += ((float4)weight)*cp;
1950 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
1951 CLPixelType p = inputImageCache[cacheIndex];
1953 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
1954 , (ResizeWeightingFunctionType)resizeWindowType
1955 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
1957 float alpha = weight * QuantumScale * GetPixelAlpha(p);
1958 float4 cp = convert_float4(p);
1960 filteredPixel.x += alpha * cp.x;
1961 filteredPixel.y += alpha * cp.y;
1962 filteredPixel.z += alpha * cp.z;
1963 filteredPixel.w += weight * cp.w;
1972 // initialize the accumulators to zero
1973 if (itemID < actualNumPixelInThisChunk) {
1974 outputPixelCache[itemID] = (float4)0.0f;
1975 densityCache[itemID] = 0.0f;
1977 gammaCache[itemID] = 0.0f;
1979 barrier(CLK_LOCAL_MEM_FENCE);
1981 // accumulatte the filtered pixel value and the density
1982 for (unsigned int i = 0; i < numItems; i++) {
1983 if (pixelIndex != -1) {
1984 if (itemID%numItems == i) {
1985 outputPixelCache[pixelIndex]+=filteredPixel;
1986 densityCache[pixelIndex]+=density;
1988 gammaCache[pixelIndex]+=gamma;
1992 barrier(CLK_LOCAL_MEM_FENCE);
1995 if (itemID < actualNumPixelInThisChunk) {
1997 float density = densityCache[itemID];
1998 float4 filteredPixel = outputPixelCache[itemID];
1999 if (density!= 0.0f && density != 1.0)
2001 density = PerceptibleReciprocal(density);
2002 filteredPixel *= (float4)density;
2004 filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2005 , ClampToQuantum(filteredPixel.y)
2006 , ClampToQuantum(filteredPixel.z)
2007 , ClampToQuantum(filteredPixel.w));
2010 float density = densityCache[itemID];
2011 float gamma = gammaCache[itemID];
2012 float4 filteredPixel = outputPixelCache[itemID];
2014 if (density!= 0.0f && density != 1.0) {
2015 density = PerceptibleReciprocal(density);
2016 filteredPixel *= (float4)density;
2019 gamma = PerceptibleReciprocal(gamma);
2022 fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2023 , ClampToQuantum(gamma*filteredPixel.y)
2024 , ClampToQuantum(gamma*filteredPixel.z)
2025 , ClampToQuantum(filteredPixel.w));
2027 filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2032 } // end of chunking loop
2039 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2040 void ResizeHorizontalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2041 , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2042 , const int resizeFilterType, const int resizeWindowType
2043 , const __global float* resizeFilterCubicCoefficients
2044 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2045 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2046 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2048 ResizeHorizontalFilter(inputImage,inputColumns,inputRows,matte
2049 ,xFactor, filteredImage, filteredColumns, filteredRows
2050 ,SincWeightingFunction, SincWeightingFunction
2051 ,resizeFilterCubicCoefficients
2052 ,resizeFilterScale, resizeFilterSupport, resizeFilterWindowSupport, resizeFilterBlur
2053 ,inputImageCache, numCachedPixels, pixelPerWorkgroup, pixelChunkSize
2054 ,outputPixelCache, densityCache, gammaCache);
2061 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2062 void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2063 , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2064 , const int resizeFilterType, const int resizeWindowType
2065 , const __global float* resizeFilterCubicCoefficients
2066 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2067 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2068 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2071 // calculate the range of resized image pixels computed by this workgroup
2072 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2073 const unsigned int stopY = min(startY + pixelPerWorkgroup,filteredRows);
2074 const unsigned int actualNumPixelToCompute = stopY - startY;
2076 // calculate the range of input image pixels to cache
2077 float scale = max(1.0/yFactor+MagickEpsilon ,1.0f);
2078 const float support = max(scale*resizeFilterSupport,0.5f);
2079 scale = PerceptibleReciprocal(scale);
2081 const int cacheRangeStartY = max((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2082 const int cacheRangeEndY = min((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2084 // cache the input pixels into local memory
2085 const unsigned int x = get_global_id(0);
2086 event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2087 wait_group_events(1,&e);
2089 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2090 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2093 const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2094 const unsigned int chunkStopY = min(chunkStartY + pixelChunkSize, stopY);
2095 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2097 // determine which resized pixel computed by this workitem
2098 const unsigned int itemID = get_local_id(1);
2099 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2101 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2103 float4 filteredPixel = (float4)0.0f;
2104 float density = 0.0f;
2106 // -1 means this workitem doesn't participate in the computation
2107 if (pixelIndex != -1) {
2109 // x coordinated of the resized pixel computed by this workitem
2110 const int y = chunkStartY + pixelIndex;
2112 // calculate how many steps required for this pixel
2113 const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2114 const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
2115 const unsigned int stop = (unsigned int)min(bisect+support+0.5f,(float)inputRows);
2116 const unsigned int n = stop - start;
2118 // calculate how many steps this workitem will contribute
2119 unsigned int numStepsPerWorkItem = n / numItems;
2120 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2122 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2123 if (startStep < n) {
2124 const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
2126 unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2129 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2130 float4 cp = convert_float4(inputImageCache[cacheIndex]);
2132 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2133 , (ResizeWeightingFunctionType)resizeWindowType
2134 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2136 filteredPixel += ((float4)weight)*cp;
2143 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2144 CLPixelType p = inputImageCache[cacheIndex];
2146 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2147 , (ResizeWeightingFunctionType)resizeWindowType
2148 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2150 float alpha = weight * QuantumScale * GetPixelAlpha(p);
2151 float4 cp = convert_float4(p);
2153 filteredPixel.x += alpha * cp.x;
2154 filteredPixel.y += alpha * cp.y;
2155 filteredPixel.z += alpha * cp.z;
2156 filteredPixel.w += weight * cp.w;
2165 // initialize the accumulators to zero
2166 if (itemID < actualNumPixelInThisChunk) {
2167 outputPixelCache[itemID] = (float4)0.0f;
2168 densityCache[itemID] = 0.0f;
2170 gammaCache[itemID] = 0.0f;
2172 barrier(CLK_LOCAL_MEM_FENCE);
2174 // accumulatte the filtered pixel value and the density
2175 for (unsigned int i = 0; i < numItems; i++) {
2176 if (pixelIndex != -1) {
2177 if (itemID%numItems == i) {
2178 outputPixelCache[pixelIndex]+=filteredPixel;
2179 densityCache[pixelIndex]+=density;
2181 gammaCache[pixelIndex]+=gamma;
2185 barrier(CLK_LOCAL_MEM_FENCE);
2188 if (itemID < actualNumPixelInThisChunk) {
2190 float density = densityCache[itemID];
2191 float4 filteredPixel = outputPixelCache[itemID];
2192 if (density!= 0.0f && density != 1.0)
2194 density = PerceptibleReciprocal(density);
2195 filteredPixel *= (float4)density;
2197 filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2198 , ClampToQuantum(filteredPixel.y)
2199 , ClampToQuantum(filteredPixel.z)
2200 , ClampToQuantum(filteredPixel.w));
2203 float density = densityCache[itemID];
2204 float gamma = gammaCache[itemID];
2205 float4 filteredPixel = outputPixelCache[itemID];
2207 if (density!= 0.0f && density != 1.0) {
2208 density = PerceptibleReciprocal(density);
2209 filteredPixel *= (float4)density;
2212 gamma = PerceptibleReciprocal(gamma);
2215 fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2216 , ClampToQuantum(gamma*filteredPixel.y)
2217 , ClampToQuantum(gamma*filteredPixel.z)
2218 , ClampToQuantum(filteredPixel.w));
2220 filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
2225 } // end of chunking loop
2232 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2233 void ResizeVerticalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2234 , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2235 , const int resizeFilterType, const int resizeWindowType
2236 , const __global float* resizeFilterCubicCoefficients
2237 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2238 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2239 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2240 ResizeVerticalFilter(inputImage,inputColumns,inputRows,matte
2241 ,yFactor,filteredImage,filteredColumns,filteredRows
2242 ,SincWeightingFunction, SincWeightingFunction
2243 ,resizeFilterCubicCoefficients
2244 ,resizeFilterScale,resizeFilterSupport,resizeFilterWindowSupport,resizeFilterBlur
2245 ,inputImageCache,numCachedPixels,pixelPerWorkgroup,pixelChunkSize
2246 ,outputPixelCache,densityCache,gammaCache);
2253 __kernel void randomNumberGeneratorKernel(__global uint* seeds, const float normalizeRand
2254 , __global float* randomNumbers, const uint init
2255 ,const uint numRandomNumbers) {
2257 unsigned int id = get_global_id(0);
2258 unsigned int seed[4];
2261 seed[0] = seeds[id*4];
2262 seed[1] = 0x50a7f451;
2263 seed[2] = 0x5365417e;
2264 seed[3] = 0xc3a4171a;
2267 seed[0] = seeds[id*4];
2268 seed[1] = seeds[id*4+1];
2269 seed[2] = seeds[id*4+2];
2270 seed[3] = seeds[id*4+3];
2273 unsigned int numRandomNumbersPerItem = (numRandomNumbers+get_global_size(0)-1)/get_global_size(0);
2274 for (unsigned int i = 0; i < numRandomNumbersPerItem; i++) {
2277 unsigned int alpha=(unsigned int) (seed[1] ^ (seed[1] << 11));
2281 seed[0]=(seed[0] ^ (seed[0] >> 19)) ^ (alpha ^ (alpha >> 8));
2282 } while (seed[0] == ~0UL);
2283 unsigned int pos = (get_group_id(0)*get_local_size(0)*numRandomNumbersPerItem)
2284 + get_local_size(0) * i + get_local_id(0);
2286 if (pos >= numRandomNumbers)
2288 randomNumbers[pos] = normalizeRand*seed[0];
2291 /* save the seeds for the time*/
2292 seeds[id*4] = seed[0];
2293 seeds[id*4+1] = seed[1];
2294 seeds[id*4+2] = seed[2];
2295 seeds[id*4+3] = seed[3];
2308 MultiplicativeGaussianNoise,
2316 const global float* rns;
2320 float GetPseudoRandomValue(RandomNumbers* r) {
2327 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
2328 OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f))
2329 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
2330 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
2331 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
2332 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
2333 OPENCL_DEFINE(SigmaRandom, (attenuate))
2334 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
2337 float GenerateDifferentialNoise(RandomNumbers* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
2346 alpha=GetPseudoRandomValue(r);
2347 switch(noise_type) {
2351 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
2362 beta=GetPseudoRandomValue(r);
2363 gamma=sqrt(-2.0f*log(alpha));
2364 sigma=gamma*cospi((2.0f*beta));
2365 tau=gamma*sinpi((2.0f*beta));
2366 noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
2367 QuantumRange*TauGaussian*tau);
2374 if (alpha < (SigmaImpulse/2.0f))
2377 if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
2378 noise=(float)QuantumRange;
2383 case LaplacianNoise:
2387 if (alpha <= MagickEpsilon)
2388 noise=(float) (pixel-QuantumRange);
2390 noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
2395 if (beta <= (0.5f*MagickEpsilon))
2396 noise=(float) (pixel+QuantumRange);
2398 noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
2401 case MultiplicativeGaussianNoise:
2404 if (alpha > MagickEpsilon)
2405 sigma=sqrt(-2.0f*log(alpha));
2406 beta=GetPseudoRandomValue(r);
2407 noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
2408 cospi((float) (2.0f*beta))/2.0f);
2416 poisson=exp(-SigmaPoisson*QuantumScale*pixel);
2417 for (i=0; alpha > poisson; i++)
2419 beta=GetPseudoRandomValue(r);
2422 noise=(float) (QuantumRange*i/SigmaPoisson);
2427 noise=(float) (QuantumRange*SigmaRandom*alpha);
2436 void AddNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
2437 ,const unsigned int inputColumns, const unsigned int inputRows
2438 ,const ChannelType channel
2439 ,const NoiseType noise_type, const float attenuate
2440 ,const __global float* randomNumbers, const unsigned int numRandomNumbersPerPixel
2441 ,const unsigned int rowOffset) {
2443 unsigned int x = get_global_id(0);
2444 unsigned int y = get_global_id(1) + rowOffset;
2446 r.rns = randomNumbers + (get_global_id(1) * inputColumns + get_global_id(0))*numRandomNumbersPerPixel;
2448 CLPixelType p = inputImage[y*inputColumns+x];
2449 CLPixelType q = filteredImage[y*inputColumns+x];
2451 if ((channel&RedChannel)!=0) {
2452 setRed(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getRed(p),noise_type,attenuate)));
2455 if ((channel&GreenChannel)!=0) {
2456 setGreen(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getGreen(p),noise_type,attenuate)));
2459 if ((channel&BlueChannel)!=0) {
2460 setBlue(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getBlue(p),noise_type,attenuate)));
2463 if ((channel & OpacityChannel) != 0) {
2464 setOpacity(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getOpacity(p),noise_type,attenuate)));
2467 filteredImage[y*inputColumns+x] = q;
2476 #endif // MAGICKCORE_OPENCL_SUPPORT
2478 #if defined(__cplusplus) || defined(c_plusplus)
2482 #endif // _MAGICKCORE_ACCELERATE_PRIVATE_H