accelerate-kernels-private.h

Go to the documentation of this file.
00001 /*
00002   Copyright 1999-2020 ImageMagick Studio LLC, a non-profit organization
00003   dedicated to making software imaging solutions freely available.
00004   
00005   You may not use this file except in compliance with the License.  You may
00006   obtain a copy of the License at
00007   
00008     https://imagemagick.org/script/license.php
00009   
00010   Unless required by applicable law or agreed to in writing, software
00011   distributed under the License is distributed on an "AS IS" BASIS,
00012   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013   See the License for the specific language governing permissions and
00014   limitations under the License.
00015 
00016   MagickCore private kernels for accelerated functions.
00017 */
00018 
00019 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00020 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00021 
00022 #if defined(__cplusplus) || defined(c_plusplus)
00023 extern "C" {
00024 #endif
00025 
00026 #if defined(MAGICKCORE_OPENCL_SUPPORT)
00027 
00028 /*
00029   Define declarations.
00030 */
00031 #define OPENCL_DEFINE(VAR,...)  "\n #""define " #VAR " " #__VA_ARGS__ " \n"
00032 #define OPENCL_ELIF(...)        "\n #""elif " #__VA_ARGS__ " \n"
00033 #define OPENCL_ELSE()           "\n #""else " " \n"
00034 #define OPENCL_ENDIF()          "\n #""endif " " \n"
00035 #define OPENCL_IF(...)          "\n #""if " #__VA_ARGS__ " \n"
00036 #define STRINGIFY(...) #__VA_ARGS__ "\n"
00037 
00038 const char *accelerateKernels =
00039 
00040 /*
00041   Define declarations.
00042 */
00043   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
00044   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
00045   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
00046   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
00047   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
00048   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
00049   OPENCL_DEFINE(SigmaRandom, (attenuate))
00050   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
00051   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
00052   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
00053 
00054 /*
00055   Typedef declarations.
00056 */
00057   STRINGIFY(
00058     typedef enum
00059     {
00060       UndefinedColorspace,
00061       CMYColorspace,           /* negated linear RGB colorspace */
00062       CMYKColorspace,          /* CMY with Black separation */
00063       GRAYColorspace,          /* Single Channel greyscale (non-linear) image */
00064       HCLColorspace,
00065       HCLpColorspace,
00066       HSBColorspace,
00067       HSIColorspace,
00068       HSLColorspace,
00069       HSVColorspace,           /* alias for HSB */
00070       HWBColorspace,
00071       LabColorspace,
00072       LCHColorspace,           /* alias for LCHuv */
00073       LCHabColorspace,         /* Cylindrical (Polar) Lab */
00074       LCHuvColorspace,         /* Cylindrical (Polar) Luv */
00075       LogColorspace,
00076       LMSColorspace,
00077       LuvColorspace,
00078       OHTAColorspace,
00079       Rec601YCbCrColorspace,
00080       Rec709YCbCrColorspace,
00081       RGBColorspace,           /* Linear RGB colorspace */
00082       scRGBColorspace,         /* ??? */
00083       sRGBColorspace,          /* Default: non-linear sRGB colorspace */
00084       TransparentColorspace,
00085       xyYColorspace,
00086       XYZColorspace,           /* IEEE Color Reference colorspace */
00087       YCbCrColorspace,
00088       YCCColorspace,
00089       YDbDrColorspace,
00090       YIQColorspace,
00091       YPbPrColorspace,
00092       YUVColorspace,
00093       LinearGRAYColorspace     /* Single Channel greyscale (linear) image */
00094     } ColorspaceType;
00095   )
00096 
00097   STRINGIFY(
00098     typedef enum
00099     {
00100       UndefinedCompositeOp,
00101       AlphaCompositeOp,
00102       AtopCompositeOp,
00103       BlendCompositeOp,
00104       BlurCompositeOp,
00105       BumpmapCompositeOp,
00106       ChangeMaskCompositeOp,
00107       ClearCompositeOp,
00108       ColorBurnCompositeOp,
00109       ColorDodgeCompositeOp,
00110       ColorizeCompositeOp,
00111       CopyBlackCompositeOp,
00112       CopyBlueCompositeOp,
00113       CopyCompositeOp,
00114       CopyCyanCompositeOp,
00115       CopyGreenCompositeOp,
00116       CopyMagentaCompositeOp,
00117       CopyAlphaCompositeOp,
00118       CopyRedCompositeOp,
00119       CopyYellowCompositeOp,
00120       DarkenCompositeOp,
00121       DarkenIntensityCompositeOp,
00122       DifferenceCompositeOp,
00123       DisplaceCompositeOp,
00124       DissolveCompositeOp,
00125       DistortCompositeOp,
00126       DivideDstCompositeOp,
00127       DivideSrcCompositeOp,
00128       DstAtopCompositeOp,
00129       DstCompositeOp,
00130       DstInCompositeOp,
00131       DstOutCompositeOp,
00132       DstOverCompositeOp,
00133       ExclusionCompositeOp,
00134       HardLightCompositeOp,
00135       HardMixCompositeOp,
00136       HueCompositeOp,
00137       InCompositeOp,
00138       IntensityCompositeOp,
00139       LightenCompositeOp,
00140       LightenIntensityCompositeOp,
00141       LinearBurnCompositeOp,
00142       LinearDodgeCompositeOp,
00143       LinearLightCompositeOp,
00144       LuminizeCompositeOp,
00145       MathematicsCompositeOp,
00146       MinusDstCompositeOp,
00147       MinusSrcCompositeOp,
00148       ModulateCompositeOp,
00149       ModulusAddCompositeOp,
00150       ModulusSubtractCompositeOp,
00151       MultiplyCompositeOp,
00152       NoCompositeOp,
00153       OutCompositeOp,
00154       OverCompositeOp,
00155       OverlayCompositeOp,
00156       PegtopLightCompositeOp,
00157       PinLightCompositeOp,
00158       PlusCompositeOp,
00159       ReplaceCompositeOp,
00160       SaturateCompositeOp,
00161       ScreenCompositeOp,
00162       SoftLightCompositeOp,
00163       SrcAtopCompositeOp,
00164       SrcCompositeOp,
00165       SrcInCompositeOp,
00166       SrcOutCompositeOp,
00167       SrcOverCompositeOp,
00168       ThresholdCompositeOp,
00169       VividLightCompositeOp,
00170       XorCompositeOp,
00171       StereoCompositeOp
00172     } CompositeOperator;
00173   )
00174 
00175   STRINGIFY(
00176     typedef enum
00177     {
00178       UndefinedFunction,
00179       ArcsinFunction,
00180       ArctanFunction,
00181       PolynomialFunction,
00182       SinusoidFunction
00183     } MagickFunction;
00184   )
00185 
00186   STRINGIFY(
00187     typedef enum
00188     {
00189       UndefinedNoise,
00190       UniformNoise,
00191       GaussianNoise,
00192       MultiplicativeGaussianNoise,
00193       ImpulseNoise,
00194       LaplacianNoise,
00195       PoissonNoise,
00196       RandomNoise
00197     } NoiseType;
00198   )
00199 
00200   STRINGIFY(
00201     typedef enum
00202     {
00203       UndefinedPixelIntensityMethod = 0,
00204       AveragePixelIntensityMethod,
00205       BrightnessPixelIntensityMethod,
00206       LightnessPixelIntensityMethod,
00207       MSPixelIntensityMethod,
00208       Rec601LumaPixelIntensityMethod,
00209       Rec601LuminancePixelIntensityMethod,
00210       Rec709LumaPixelIntensityMethod,
00211       Rec709LuminancePixelIntensityMethod,
00212       RMSPixelIntensityMethod
00213     } PixelIntensityMethod;
00214   )
00215 
00216   STRINGIFY(
00217     typedef enum
00218     {
00219       BoxWeightingFunction = 0,
00220       TriangleWeightingFunction,
00221       CubicBCWeightingFunction,
00222       HannWeightingFunction,
00223       HammingWeightingFunction,
00224       BlackmanWeightingFunction,
00225       GaussianWeightingFunction,
00226       QuadraticWeightingFunction,
00227       JincWeightingFunction,
00228       SincWeightingFunction,
00229       SincFastWeightingFunction,
00230       KaiserWeightingFunction,
00231       WelchWeightingFunction,
00232       BohmanWeightingFunction,
00233       LagrangeWeightingFunction,
00234       CosineWeightingFunction
00235     } ResizeWeightingFunctionType;
00236   )
00237 
00238   STRINGIFY(
00239     typedef enum
00240     {
00241       UndefinedChannel = 0x0000,
00242       RedChannel = 0x0001,
00243       GrayChannel = 0x0001,
00244       CyanChannel = 0x0001,
00245       GreenChannel = 0x0002,
00246       MagentaChannel = 0x0002,
00247       BlueChannel = 0x0004,
00248       YellowChannel = 0x0004,
00249       BlackChannel = 0x0008,
00250       AlphaChannel = 0x0010,
00251       OpacityChannel = 0x0010,
00252       IndexChannel = 0x0020,             /* Color Index Table? */
00253       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
00254       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
00255       MetaChannel = 0x0100,              /* ???? */
00256       CompositeChannels = 0x001F,
00257       AllChannels = 0x7ffffff,
00258       /*
00259         Special purpose channel types.
00260         FUTURE: are these needed any more - they are more like hacks
00261         SyncChannels for example is NOT a real channel but a 'flag'
00262         It really says -- "User has not defined channels"
00263         Though it does have extra meaning in the "-auto-level" operator
00264       */
00265       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
00266       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
00267       GrayChannels = 0x0400,
00268       SyncChannels = 0x20000,    /* channels modified as a single unit */
00269       DefaultChannels = AllChannels
00270     } ChannelType;  /* must correspond to PixelChannel */
00271   )
00272 
00273 /*
00274   Helper functions.
00275 */
00276 
00277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
00278 
00279   STRINGIFY(
00280     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00281     {
00282       return((CLQuantum) value);
00283     }
00284   )
00285 
00286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
00287 
00288   STRINGIFY(
00289     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00290     {
00291       return((CLQuantum) (257.0f*value));
00292     }
00293   )
00294 
00295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
00296 
00297   STRINGIFY(
00298     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00299     {
00300       return((CLQuantum) (16843009.0*value));
00301     }
00302   )
00303 
00304 OPENCL_ENDIF()
00305 
00306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
00307 
00308   STRINGIFY(
00309     static inline CLQuantum ClampToQuantum(const float value)
00310       {
00311         return (CLQuantum) clamp(value, 0.0f, QuantumRange);
00312       }
00313   )
00314 
00315 OPENCL_ELSE()
00316 
00317   STRINGIFY(
00318     static inline CLQuantum ClampToQuantum(const float value)
00319       {
00320         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
00321       }
00322   )
00323 
00324 OPENCL_ENDIF()
00325 
00326   STRINGIFY(
00327     static inline int ClampToCanvas(const int offset,const int range)
00328       {
00329         return clamp(offset, (int)0, range-1);
00330       }
00331   )
00332 
00333   STRINGIFY(
00334     static inline uint ScaleQuantumToMap(CLQuantum value)
00335       {
00336         if (value >= (CLQuantum) MaxMap)
00337           return ((uint)MaxMap);
00338         else 
00339           return ((uint)value);
00340       }
00341   )
00342 
00343   STRINGIFY(
00344     static inline float PerceptibleReciprocal(const float x)
00345     {
00346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
00347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
00348     }
00349   )
00350 
00351   STRINGIFY(
00352 
00353   static inline unsigned int getPixelIndex(const unsigned int number_channels,
00354     const unsigned int columns, const unsigned int x, const unsigned int y)
00355   {
00356     return (x * number_channels) + (y * columns * number_channels);
00357   }
00358 
00359   static inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
00360   static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
00361   static inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
00362   static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
00363 
00364   static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
00365   static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
00366   static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
00367   static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
00368 
00369   static inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
00370   static inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
00371   static inline float getBlueF4(float4 p)                      { return p.x; }
00372   static inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
00373 
00374   static inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
00375   static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
00376   static inline float getGreenF4(float4 p)                     { return p.y; }
00377   static inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
00378 
00379   static inline CLQuantum getRed(CLPixelType p)                { return p.z; }
00380   static inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
00381   static inline float getRedF4(float4 p)                       { return p.z; }
00382   static inline void setRedF4(float4* p, float value)          { (*p).z = value; }
00383 
00384   static inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
00385   static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
00386   static inline float getAlphaF4(float4 p)                     { return p.w; }
00387   static inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
00388 
00389   static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
00390     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
00391   {
00392     if ((channel & RedChannel) != 0)
00393       *red=getPixelRed(p);
00394 
00395     if (number_channels > 2)
00396       {
00397         if ((channel & GreenChannel) != 0)
00398           *green=getPixelGreen(p);
00399 
00400         if ((channel & BlueChannel) != 0)
00401           *blue=getPixelBlue(p);
00402       }
00403 
00404     if (((number_channels == 4) || (number_channels == 2)) &&
00405         ((channel & AlphaChannel) != 0))
00406       *alpha=getPixelAlpha(p,number_channels);
00407   }
00408 
00409   static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
00410     const unsigned int columns, const unsigned int x, const unsigned int y)
00411   {
00412     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00413 
00414     float4 pixel;
00415 
00416     pixel.x=getPixelRed(p);
00417 
00418     if (number_channels > 2)
00419       {
00420         pixel.y=getPixelGreen(p);
00421         pixel.z=getPixelBlue(p);
00422       }
00423 
00424     if ((number_channels == 4) || (number_channels == 2))
00425       pixel.w=getPixelAlpha(p,number_channels);
00426     return(pixel);
00427   }
00428 
00429   static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
00430     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
00431   {
00432     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00433 
00434     float red = 0.0f;
00435     float green = 0.0f;
00436     float blue = 0.0f;
00437     float alpha = 0.0f;
00438 
00439     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
00440     return (float4)(red, green, blue, alpha);
00441   }
00442 
00443   static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
00444     const ChannelType channel, float red, float green, float blue, float alpha)
00445   {
00446     if ((channel & RedChannel) != 0)
00447       setPixelRed(p,ClampToQuantum(red));
00448 
00449     if (number_channels > 2)
00450       {
00451         if ((channel & GreenChannel) != 0)
00452           setPixelGreen(p,ClampToQuantum(green));
00453 
00454         if ((channel & BlueChannel) != 0)
00455           setPixelBlue(p,ClampToQuantum(blue));
00456       }
00457 
00458     if (((number_channels == 4) || (number_channels == 2)) &&
00459         ((channel & AlphaChannel) != 0))
00460       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
00461   }
00462 
00463   static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
00464     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
00465   {
00466     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00467 
00468     setPixelRed(p,ClampToQuantum(pixel.x));
00469 
00470     if (number_channels > 2)
00471       {
00472         setPixelGreen(p,ClampToQuantum(pixel.y));
00473         setPixelBlue(p,ClampToQuantum(pixel.z));
00474       }
00475 
00476     if ((number_channels == 4) || (number_channels == 2))
00477       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
00478   }
00479 
00480   static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
00481     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
00482     float4 pixel)
00483   {
00484     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00485     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
00486   }
00487 
00488   static inline float GetPixelIntensity(const unsigned int colorspace,
00489     const unsigned int method,float red,float green,float blue)
00490   {
00491     float intensity;
00492 
00493     if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
00494       return red;
00495 
00496     switch (method)
00497     {
00498       case AveragePixelIntensityMethod:
00499         {
00500           intensity=(red+green+blue)/3.0;
00501           break;
00502         }
00503       case BrightnessPixelIntensityMethod:
00504         {
00505           intensity=MagickMax(MagickMax(red,green),blue);
00506           break;
00507         }
00508       case LightnessPixelIntensityMethod:
00509         {
00510           intensity=(MagickMin(MagickMin(red,green),blue)+
00511               MagickMax(MagickMax(red,green),blue))/2.0;
00512           break;
00513         }
00514       case MSPixelIntensityMethod:
00515         {
00516           intensity=(float) (((float) red*red+green*green+blue*blue)/
00517               (3.0*QuantumRange));
00518           break;
00519         }
00520       case Rec601LumaPixelIntensityMethod:
00521         {
00522           /*
00523           if (image->colorspace == RGBColorspace)
00524           {
00525             red=EncodePixelGamma(red);
00526             green=EncodePixelGamma(green);
00527             blue=EncodePixelGamma(blue);
00528           }
00529           */
00530           intensity=0.298839*red+0.586811*green+0.114350*blue;
00531           break;
00532         }
00533       case Rec601LuminancePixelIntensityMethod:
00534         {
00535           /*
00536           if (image->colorspace == sRGBColorspace)
00537           {
00538             red=DecodePixelGamma(red);
00539             green=DecodePixelGamma(green);
00540             blue=DecodePixelGamma(blue);
00541           }
00542           */
00543           intensity=0.298839*red+0.586811*green+0.114350*blue;
00544           break;
00545         }
00546       case Rec709LumaPixelIntensityMethod:
00547       default:
00548         {
00549           /*
00550           if (image->colorspace == RGBColorspace)
00551           {
00552             red=EncodePixelGamma(red);
00553             green=EncodePixelGamma(green);
00554             blue=EncodePixelGamma(blue);
00555           }
00556           */
00557           intensity=0.212656*red+0.715158*green+0.072186*blue;
00558           break;
00559         }
00560       case Rec709LuminancePixelIntensityMethod:
00561         {
00562           /*
00563           if (image->colorspace == sRGBColorspace)
00564           {
00565             red=DecodePixelGamma(red);
00566             green=DecodePixelGamma(green);
00567             blue=DecodePixelGamma(blue);
00568           }
00569           */
00570           intensity=0.212656*red+0.715158*green+0.072186*blue;
00571           break;
00572         }
00573       case RMSPixelIntensityMethod:
00574         {
00575           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
00576               sqrt(3.0));
00577           break;
00578         }
00579     }
00580 
00581     return intensity;
00582   }
00583 
00584   static inline int mirrorBottom(int value)
00585   {
00586       return (value < 0) ? - (value) : value;
00587   }
00588 
00589   static inline int mirrorTop(int value, int width)
00590   {
00591       return (value >= width) ? (2 * width - value - 1) : value;
00592   }
00593   )
00594 
00595 /*
00596 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00597 %                                                                             %
00598 %                                                                             %
00599 %                                                                             %
00600 %     A d d N o i s e                                                         %
00601 %                                                                             %
00602 %                                                                             %
00603 %                                                                             %
00604 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00605 */
00606 
00607   STRINGIFY(
00608   /*
00609   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
00610   This is provided under BSD, full license is with the main package.
00611   See http://www.doc.ic.ac.uk/~dt10/research
00612   */
00613 
00614   // Pre: a<M, b<M
00615   // Post: r=(a+b) mod M
00616   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
00617   {
00618     ulong v=a+b;
00619     //if( (v>=M) || (v<a) )
00620     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
00621       v=v-M;
00622     return v;
00623   }
00624 
00625   // Pre: a<M,b<M
00626   // Post: r=(a*b) mod M
00627   // This could be done more efficently, but it is portable, and should
00628   // be easy to understand. It can be replaced with any of the better
00629   // modular multiplication algorithms (for example if you know you have
00630   // double precision available or something).
00631   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
00632   {
00633     ulong r=0;
00634     while(a!=0){
00635       if(a&1)
00636         r=MWC_AddMod64(r,b,M);
00637       b=MWC_AddMod64(b,b,M);
00638       a=a>>1;
00639     }
00640     return r;
00641   }
00642 
00643   // Pre: a<M, e>=0
00644   // Post: r=(a^b) mod M
00645   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
00646   // most architectures
00647   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
00648   {
00649     ulong sqr=a, acc=1;
00650     while(e!=0){
00651       if(e&1)
00652         acc=MWC_MulMod64(acc,sqr,M);
00653         sqr=MWC_MulMod64(sqr,sqr,M);
00654       e=e>>1;
00655     }
00656     return acc;
00657   }
00658 
00659   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
00660   {
00661     ulong m=MWC_PowMod64(A, distance, M);
00662     ulong x=curr.x*(ulong)A+curr.y;
00663     x=MWC_MulMod64(x, m, M);
00664     return (uint2)((uint)(x/A), (uint)(x%A));
00665   }
00666 
00667   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
00668   {
00669     // This is an arbitrary constant for starting LCG jumping from. I didn't
00670     // want to start from 1, as then you end up with the two or three first values
00671     // being a bit poor in ones - once you've decided that, one constant is as
00672     // good as any another. There is no deep mathematical reason for it, I just
00673     // generated a random number.
00674     enum{ MWC_BASEID = 4077358422479273989UL };
00675 
00676     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
00677     ulong m=MWC_PowMod64(A, dist, M);
00678 
00679     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
00680     return (uint2)((uint)(x/A), (uint)(x%A));
00681   }
00682 
00684   typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
00685 
00686   void MWC64X_Step(mwc64x_state_t *s)
00687   {
00688     uint X=s->x, C=s->c;
00689 
00690     uint Xn=s->seed0*X+C;
00691     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
00692     uint Cn=mad_hi(s->seed0,X,carry);
00693 
00694     s->x=Xn;
00695     s->c=Cn;
00696   }
00697 
00698   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
00699   {
00700     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
00701     s->x=tmp.x;
00702     s->c=tmp.y;
00703   }
00704 
00705   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
00706   {
00707     uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
00708     s->x=tmp.x;
00709     s->c=tmp.y;
00710   }
00711 
00713   uint MWC64X_NextUint(mwc64x_state_t *s)
00714   {
00715     uint res=s->x ^ s->c;
00716     MWC64X_Step(s);
00717     return res;
00718   }
00719 
00720   //
00721   // End of MWC64X excerpt
00722   //
00723 
00724   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
00725   {
00726     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
00727   }
00728 
00729   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
00730   {
00731     float
00732       alpha,
00733       beta,
00734       noise,
00735       sigma;
00736 
00737     noise = 0.0f;
00738     alpha=mwcReadPseudoRandomValue(r);
00739     switch(noise_type)
00740     {
00741       case UniformNoise:
00742       default:
00743         {
00744           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
00745           break;
00746         }
00747       case GaussianNoise:
00748         {
00749           float
00750             gamma,
00751             tau;
00752 
00753           if (alpha == 0.0f)
00754             alpha=1.0f;
00755           beta=mwcReadPseudoRandomValue(r);
00756           gamma=sqrt(-2.0f*log(alpha));
00757           sigma=gamma*cospi((2.0f*beta));
00758           tau=gamma*sinpi((2.0f*beta));
00759           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
00760           break;
00761         }
00762       case ImpulseNoise:
00763       {
00764         if (alpha < (SigmaImpulse/2.0f))
00765           noise=0.0f;
00766         else
00767           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
00768             noise=QuantumRange;
00769           else
00770             noise=pixel;
00771         break;
00772       }
00773       case LaplacianNoise:
00774       {
00775         if (alpha <= 0.5f)
00776           {
00777             if (alpha <= MagickEpsilon)
00778               noise=(pixel-QuantumRange);
00779             else
00780               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
00781                 0.5f);
00782             break;
00783           }
00784         beta=1.0f-alpha;
00785         if (beta <= (0.5f*MagickEpsilon))
00786           noise=(pixel+QuantumRange);
00787         else
00788           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
00789         break;
00790       }
00791       case MultiplicativeGaussianNoise:
00792       {
00793         sigma=1.0f;
00794         if (alpha > MagickEpsilon)
00795           sigma=sqrt(-2.0f*log(alpha));
00796         beta=mwcReadPseudoRandomValue(r);
00797         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
00798           cospi((2.0f*beta))/2.0f);
00799         break;
00800       }
00801       case PoissonNoise:
00802       {
00803         float 
00804           poisson;
00805         unsigned int i;
00806         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
00807         for (i=0; alpha > poisson; i++)
00808         {
00809           beta=mwcReadPseudoRandomValue(r);
00810           alpha*=beta;
00811         }
00812         noise=(QuantumRange*i/SigmaPoisson);
00813         break;
00814       }
00815       case RandomNoise:
00816       {
00817         noise=(QuantumRange*SigmaRandom*alpha);
00818         break;
00819       }
00820     }
00821     return noise;
00822   }
00823 
00824   __kernel
00825   void AddNoise(const __global CLQuantum *image,
00826     const unsigned int number_channels,const ChannelType channel,
00827     const unsigned int length,const unsigned int pixelsPerWorkItem,
00828     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
00829     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
00830     __global CLQuantum *filteredImage)
00831   {
00832     mwc64x_state_t rng;
00833     rng.seed0 = seed0;
00834     rng.seed1 = seed1;
00835 
00836     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
00837     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
00838     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
00839 
00840     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
00841     uint count = pixelsPerWorkItem;
00842 
00843     while (count > 0 && pos < length)
00844     {
00845       const __global CLQuantum *p = image + pos;
00846       __global CLQuantum *q = filteredImage + pos;
00847 
00848       float red;
00849       float green;
00850       float blue;
00851       float alpha;
00852 
00853       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
00854 
00855       if ((channel & RedChannel) != 0)
00856         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
00857 
00858       if (number_channels > 2)
00859       {
00860         if ((channel & GreenChannel) != 0)
00861           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
00862 
00863         if ((channel & BlueChannel) != 0)
00864           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
00865       }
00866 
00867       if (((number_channels == 4) || (number_channels == 2)) &&
00868           ((channel & AlphaChannel) != 0))
00869         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
00870 
00871       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
00872 
00873       pos += (get_local_size(0) * number_channels);
00874       count--;
00875     }
00876   }
00877   )
00878 
00879 /*
00880 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00881 %                                                                             %
00882 %                                                                             %
00883 %                                                                             %
00884 %    B l u r                                                                  %
00885 %                                                                             %
00886 %                                                                             %
00887 %                                                                             %
00888 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00889 */
00890 
00891   STRINGIFY(
00892   /*
00893   Reduce image noise and reduce detail levels by row
00894   */
00895   __kernel void BlurRow(const __global CLQuantum *image,
00896     const unsigned int number_channels,const ChannelType channel,
00897     __constant float *filter,const unsigned int width,
00898     const unsigned int imageColumns,const unsigned int imageRows,
00899     __local float4 *temp,__global float4 *tempImage)
00900   {
00901     const int x = get_global_id(0);
00902     const int y = get_global_id(1);
00903 
00904     const int columns = imageColumns;
00905 
00906     const unsigned int radius = (width-1)/2;
00907     const int wsize = get_local_size(0);
00908     const unsigned int loadSize = wsize+width;
00909 
00910     //group coordinate
00911     const int groupX=get_local_size(0)*get_group_id(0);
00912 
00913     //parallel load and clamp
00914     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
00915     {
00916       int cx = ClampToCanvas(i + groupX - radius, columns);
00917       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
00918     }
00919 
00920     // barrier
00921     barrier(CLK_LOCAL_MEM_FENCE);
00922 
00923     // only do the work if this is not a patched item
00924     if (get_global_id(0) < columns)
00925     {
00926       // compute
00927       float4 result = (float4) 0;
00928 
00929       int i = 0;
00930 
00931       for ( ; i+7 < width; )
00932       {
00933         for (int j=0; j < 8; j++)
00934           result+=filter[i+j]*temp[i+j+get_local_id(0)];
00935         i+=8;
00936       }
00937 
00938       for ( ; i < width; i++)
00939         result+=filter[i]*temp[i+get_local_id(0)];
00940 
00941       // write back to global
00942       tempImage[y*columns+x] = result;
00943     }
00944   }
00945   )
00946 
00947   STRINGIFY(
00948   /*
00949   Reduce image noise and reduce detail levels by line
00950   */
00951   __kernel void BlurColumn(const __global float4 *blurRowData,
00952     const unsigned int number_channels,const ChannelType channel,
00953     __constant float *filter,const unsigned int width,
00954     const unsigned int imageColumns,const unsigned int imageRows,
00955     __local float4 *temp,__global CLQuantum *filteredImage)
00956   {
00957     const int x = get_global_id(0);
00958     const int y = get_global_id(1);
00959 
00960     const int columns = imageColumns;
00961     const int rows = imageRows;
00962 
00963     unsigned int radius = (width-1)/2;
00964     const int wsize = get_local_size(1);
00965     const unsigned int loadSize = wsize+width;
00966 
00967     //group coordinate
00968     const int groupX=get_local_size(0)*get_group_id(0);
00969     const int groupY=get_local_size(1)*get_group_id(1);
00970     //notice that get_local_size(0) is 1, so
00971     //groupX=get_group_id(0);
00972 
00973     //parallel load and clamp
00974     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
00975       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
00976 
00977     // barrier
00978     barrier(CLK_LOCAL_MEM_FENCE);
00979 
00980     // only do the work if this is not a patched item
00981     if (get_global_id(1) < rows)
00982     {
00983       // compute
00984       float4 result = (float4) 0;
00985 
00986       int i = 0;
00987 
00988       for ( ; i+7 < width; )
00989       {
00990         for (int j=0; j < 8; j++)
00991           result+=filter[i+j]*temp[i+j+get_local_id(1)];
00992         i+=8;
00993       }
00994 
00995       for ( ; i < width; i++)
00996         result+=filter[i]*temp[i+get_local_id(1)];
00997 
00998       // write back to global
00999       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
01000     }
01001   }
01002   )
01003 
01004 /*
01005 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01006 %                                                                             %
01007 %                                                                             %
01008 %                                                                             %
01009 %    C o n t r a s t                                                          %
01010 %                                                                             %
01011 %                                                                             %
01012 %                                                                             %
01013 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01014 */
01015 
01016   STRINGIFY(
01017 
01018   static inline float4 ConvertRGBToHSB(const float4 pixel)
01019   {
01020     float4 result=0.0f;
01021     result.w=pixel.w;
01022     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
01023     if (tmax != 0.0f)
01024     {
01025       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
01026       float delta=tmax-tmin;
01027 
01028       result.y=delta/tmax;
01029       result.z=QuantumScale*tmax;
01030       if (delta != 0.0f)
01031       {
01032         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
01033         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
01034           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
01035         result.x/=6.0f;
01036         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
01037       }
01038     }
01039     return(result);
01040   }
01041 
01042   static inline float4 ConvertHSBToRGB(const float4 pixel)
01043   {
01044     float hue=pixel.x;
01045     float saturation=pixel.y;
01046     float brightness=pixel.z;
01047 
01048     float4 result=pixel;
01049 
01050     if (saturation == 0.0f)
01051     {
01052       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
01053     }
01054     else
01055     {
01056       float h=6.0f*(hue-floor(hue));
01057       float f=h-floor(h);
01058       float p=brightness*(1.0f-saturation);
01059       float q=brightness*(1.0f-saturation*f);
01060       float t=brightness*(1.0f-(saturation*(1.0f-f)));
01061       int ih = (int)h;
01062 
01063       if (ih == 1)
01064       {
01065         result.x=ClampToQuantum(QuantumRange*q);
01066         result.y=ClampToQuantum(QuantumRange*brightness);
01067         result.z=ClampToQuantum(QuantumRange*p);
01068       }
01069       else if (ih == 2)
01070       {
01071         result.x=ClampToQuantum(QuantumRange*p);
01072         result.y=ClampToQuantum(QuantumRange*brightness);
01073         result.z=ClampToQuantum(QuantumRange*t);
01074       }
01075       else if (ih == 3)
01076       {
01077         result.x=ClampToQuantum(QuantumRange*p);
01078         result.y=ClampToQuantum(QuantumRange*q);
01079         result.z=ClampToQuantum(QuantumRange*brightness);
01080       }
01081       else if (ih == 4)
01082       {
01083         result.x=ClampToQuantum(QuantumRange*t);
01084         result.y=ClampToQuantum(QuantumRange*p);
01085         result.z=ClampToQuantum(QuantumRange*brightness);
01086       }
01087       else if (ih == 5)
01088       {
01089         result.x=ClampToQuantum(QuantumRange*brightness);
01090         result.y=ClampToQuantum(QuantumRange*p);
01091         result.z=ClampToQuantum(QuantumRange*q);
01092       }
01093       else
01094       {
01095         result.x=ClampToQuantum(QuantumRange*brightness);
01096         result.y=ClampToQuantum(QuantumRange*t);
01097         result.z=ClampToQuantum(QuantumRange*p);
01098       }
01099     }
01100     return(result);
01101   }
01102 
01103   __kernel void Contrast(__global CLQuantum *image,
01104     const unsigned int number_channels,const int sign)
01105   {
01106     const int x=get_global_id(0);
01107     const int y=get_global_id(1);
01108     const unsigned int columns=get_global_size(0);
01109 
01110     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
01111     if (number_channels < 3)
01112       pixel.y=pixel.z=pixel.x;
01113 
01114     pixel=ConvertRGBToHSB(pixel);
01115     float brightness=pixel.z;
01116     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
01117     brightness=clamp(brightness,0.0f,1.0f);
01118     pixel.z=brightness;
01119     pixel=ConvertHSBToRGB(pixel);
01120 
01121     WriteAllChannels(image,number_channels,columns,x,y,pixel);
01122   }
01123   )
01124 
01125 /*
01126 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01127 %                                                                             %
01128 %                                                                             %
01129 %                                                                             %
01130 %    C o n t r a s t S t r e t c h                                            %
01131 %                                                                             %
01132 %                                                                             %
01133 %                                                                             %
01134 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01135 */
01136 
01137     STRINGIFY(
01138     /*
01139     */
01140     __kernel void Histogram(__global CLPixelType * restrict im,
01141       const ChannelType channel, 
01142       const unsigned int colorspace,
01143       const unsigned int method,
01144       __global uint4 * restrict histogram)
01145       {
01146         const int x = get_global_id(0);  
01147         const int y = get_global_id(1);  
01148         const int columns = get_global_size(0);  
01149         const int c = x + y * columns;
01150         if ((channel & SyncChannels) != 0)
01151         {
01152           float red=(float)getRed(im[c]);
01153           float green=(float)getGreen(im[c]);
01154           float blue=(float)getBlue(im[c]);
01155 
01156           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
01157           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
01158           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
01159         }
01160         else
01161         {
01162           // for equalizing, we always need all channels?
01163           // otherwise something more
01164         }
01165       }
01166     )
01167 
01168     STRINGIFY(
01169     /*
01170     */
01171     __kernel void ContrastStretch(__global CLPixelType * restrict im,
01172       const ChannelType channel,  
01173       __global CLPixelType * restrict stretch_map,
01174       const float4 white, const float4 black)
01175       {
01176         const int x = get_global_id(0);  
01177         const int y = get_global_id(1);  
01178         const int columns = get_global_size(0);  
01179         const int c = x + y * columns;
01180 
01181         uint ePos;
01182         CLPixelType oValue, eValue;
01183         CLQuantum red, green, blue, alpha;
01184 
01185         //read from global
01186         oValue=im[c];
01187 
01188         if ((channel & RedChannel) != 0)
01189         {
01190           if (getRedF4(white) != getRedF4(black))
01191           {
01192             ePos = ScaleQuantumToMap(getRed(oValue)); 
01193             eValue = stretch_map[ePos];
01194             red = getRed(eValue);
01195           }
01196         }
01197 
01198         if ((channel & GreenChannel) != 0)
01199         {
01200           if (getGreenF4(white) != getGreenF4(black))
01201           {
01202             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01203             eValue = stretch_map[ePos];
01204             green = getGreen(eValue);
01205           }
01206         }
01207 
01208         if ((channel & BlueChannel) != 0)
01209         {
01210           if (getBlueF4(white) != getBlueF4(black))
01211           {
01212             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01213             eValue = stretch_map[ePos];
01214             blue = getBlue(eValue);
01215           }
01216         }
01217 
01218         if ((channel & AlphaChannel) != 0)
01219         {
01220           if (getAlphaF4(white) != getAlphaF4(black))
01221           {
01222             ePos = ScaleQuantumToMap(getAlpha(oValue)); 
01223             eValue = stretch_map[ePos];
01224             alpha = getAlpha(eValue);
01225           }
01226         }
01227 
01228         //write back
01229         im[c]=(CLPixelType)(blue, green, red, alpha);
01230 
01231       }
01232     )
01233 
01234 /*
01235 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01236 %                                                                             %
01237 %                                                                             %
01238 %                                                                             %
01239 %    C o n v o l v e                                                          %
01240 %                                                                             %
01241 %                                                                             %
01242 %                                                                             %
01243 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01244 */
01245 
01246   STRINGIFY(
01247     __kernel 
01248     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
01249     const unsigned int imageWidth, const unsigned int imageHeight,
01250     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01251     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
01252 
01253       int2 blockID;
01254       blockID.x = get_global_id(0) / get_local_size(0);
01255       blockID.y = get_global_id(1) / get_local_size(1);
01256 
01257       // image area processed by this workgroup
01258       int2 imageAreaOrg;
01259       imageAreaOrg.x = blockID.x * get_local_size(0);
01260       imageAreaOrg.y = blockID.y * get_local_size(1);
01261 
01262       int2 midFilterDimen;
01263       midFilterDimen.x = (filterWidth-1)/2;
01264       midFilterDimen.y = (filterHeight-1)/2;
01265 
01266       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
01267 
01268       // dimension of the local cache
01269       int2 cachedAreaDimen;
01270       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
01271       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
01272 
01273       // cache the pixels accessed by this workgroup in local memory
01274       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
01275       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
01276       int groupSize = get_local_size(0) * get_local_size(1);
01277       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
01278 
01279         int2 cachedAreaIndex;
01280         cachedAreaIndex.x = i % cachedAreaDimen.x;
01281         cachedAreaIndex.y = i / cachedAreaDimen.x;
01282 
01283         int2 imagePixelIndex;
01284         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
01285 
01286         // only support EdgeVirtualPixelMethod through ClampToCanvas
01287         // TODO: implement other virtual pixel method
01288         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
01289         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
01290 
01291         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
01292       }
01293 
01294       // cache the filter
01295       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
01296         filterCache[i] = filter[i];
01297       }
01298       barrier(CLK_LOCAL_MEM_FENCE);
01299 
01300 
01301       int2 imageIndex;
01302       imageIndex.x = imageAreaOrg.x + get_local_id(0);
01303       imageIndex.y = imageAreaOrg.y + get_local_id(1);
01304 
01305       // if out-of-range, stops here and quit
01306       if (imageIndex.x >= imageWidth
01307         || imageIndex.y >= imageHeight) {
01308           return;
01309       }
01310 
01311       int filterIndex = 0;
01312       float4 sum = (float4)0.0f;
01313       float gamma = 0.0f;
01314       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
01315         int cacheIndexY = get_local_id(1);
01316         for (int j = 0; j < filterHeight; j++) {
01317           int cacheIndexX = get_local_id(0);
01318           for (int i = 0; i < filterWidth; i++) {
01319             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01320             float f = filterCache[filterIndex];
01321 
01322             sum.x += f * p.x;
01323             sum.y += f * p.y;
01324             sum.z += f * p.z; 
01325             sum.w += f * p.w;
01326 
01327             gamma += f;
01328             filterIndex++;
01329             cacheIndexX++;
01330           }
01331           cacheIndexY++;
01332         }
01333       }
01334       else {
01335         int cacheIndexY = get_local_id(1);
01336         for (int j = 0; j < filterHeight; j++) {
01337           int cacheIndexX = get_local_id(0);
01338           for (int i = 0; i < filterWidth; i++) {
01339 
01340             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01341             float alpha = QuantumScale*p.w;
01342             float f = filterCache[filterIndex];
01343             float g = alpha * f;
01344 
01345             sum.x += g*p.x;
01346             sum.y += g*p.y;
01347             sum.z += g*p.z;
01348             sum.w += f*p.w;
01349 
01350             gamma += g;
01351             filterIndex++;
01352             cacheIndexX++;
01353           }
01354           cacheIndexY++;
01355         }
01356         gamma = PerceptibleReciprocal(gamma);
01357         sum.xyz = gamma*sum.xyz;
01358       }
01359       CLPixelType outputPixel;
01360       outputPixel.x = ClampToQuantum(sum.x);
01361       outputPixel.y = ClampToQuantum(sum.y);
01362       outputPixel.z = ClampToQuantum(sum.z);
01363       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01364 
01365       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01366     }
01367   )
01368 
01369   STRINGIFY(
01370     __kernel 
01371     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
01372                   const uint imageWidth, const uint imageHeight,
01373                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01374                   const uint matte, const ChannelType channel) {
01375 
01376       int2 imageIndex;
01377       imageIndex.x = get_global_id(0);
01378       imageIndex.y = get_global_id(1);
01379 
01380       /*
01381       unsigned int imageWidth = get_global_size(0);
01382       unsigned int imageHeight = get_global_size(1);
01383       */
01384       if (imageIndex.x >= imageWidth
01385           || imageIndex.y >= imageHeight)
01386           return;
01387 
01388       int2 midFilterDimen;
01389       midFilterDimen.x = (filterWidth-1)/2;
01390       midFilterDimen.y = (filterHeight-1)/2;
01391 
01392       int filterIndex = 0;
01393       float4 sum = (float4)0.0f;
01394       float gamma = 0.0f;
01395       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
01396         for (int j = 0; j < filterHeight; j++) {
01397           int2 inputPixelIndex;
01398           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01399           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01400           for (int i = 0; i < filterWidth; i++) {
01401             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01402             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01403         
01404             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01405             float f = filter[filterIndex];
01406 
01407             sum.x += f * p.x;
01408             sum.y += f * p.y;
01409             sum.z += f * p.z; 
01410             sum.w += f * p.w;
01411 
01412             gamma += f;
01413 
01414             filterIndex++;
01415           }
01416         }
01417       }
01418       else {
01419 
01420         for (int j = 0; j < filterHeight; j++) {
01421           int2 inputPixelIndex;
01422           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01423           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01424           for (int i = 0; i < filterWidth; i++) {
01425             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01426             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01427         
01428             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01429             float alpha = QuantumScale*p.w;
01430             float f = filter[filterIndex];
01431             float g = alpha * f;
01432 
01433             sum.x += g*p.x;
01434             sum.y += g*p.y;
01435             sum.z += g*p.z;
01436             sum.w += f*p.w;
01437 
01438             gamma += g;
01439 
01440 
01441             filterIndex++;
01442           }
01443         }
01444         gamma = PerceptibleReciprocal(gamma);
01445         sum.xyz = gamma*sum.xyz;
01446       }
01447 
01448       CLPixelType outputPixel;
01449       outputPixel.x = ClampToQuantum(sum.x);
01450       outputPixel.y = ClampToQuantum(sum.y);
01451       outputPixel.z = ClampToQuantum(sum.z);
01452       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01453 
01454       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01455     }
01456   )
01457 
01458 /*
01459 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01460 %                                                                             %
01461 %                                                                             %
01462 %                                                                             %
01463 %     D e s p e c k l e                                                       %
01464 %                                                                             %
01465 %                                                                             %
01466 %                                                                             %
01467 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01468 */
01469 
01470   STRINGIFY(
01471 
01472   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01473   , const unsigned int imageWidth, const unsigned int imageHeight
01474   , const int2 offset, const int polarity, const int matte) {
01475 
01476     int x = get_global_id(0);
01477     int y = get_global_id(1);
01478 
01479     CLPixelType v = inputImage[y*imageWidth+x];
01480 
01481     int2 neighbor;
01482     neighbor.y = y + offset.y;
01483     neighbor.x = x + offset.x;
01484 
01485     int2 clampedNeighbor;
01486     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01487     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01488 
01489     CLPixelType r = (clampedNeighbor.x == neighbor.x
01490                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01491     :(CLPixelType)0;
01492 
01493     int sv[4];
01494     sv[0] = (int)v.x;
01495     sv[1] = (int)v.y;
01496     sv[2] = (int)v.z;
01497     sv[3] = (int)v.w;
01498 
01499     int sr[4];
01500     sr[0] = (int)r.x;
01501     sr[1] = (int)r.y;
01502     sr[2] = (int)r.z;
01503     sr[3] = (int)r.w;
01504 
01505     if (polarity > 0) {
01506       \n #pragma unroll 4\n
01507       for (unsigned int i = 0; i < 4; i++) {
01508         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
01509       }
01510     }
01511     else {
01512       \n #pragma unroll 4\n
01513       for (unsigned int i = 0; i < 4; i++) {
01514         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
01515       }
01516 
01517     }
01518 
01519     v.x = (CLQuantum)sv[0];
01520     v.y = (CLQuantum)sv[1];
01521     v.z = (CLQuantum)sv[2];
01522 
01523     if (matte!=0)
01524       v.w = (CLQuantum)sv[3];
01525 
01526     outputImage[y*imageWidth+x] = v;
01527 
01528     }
01529 
01530 
01531   )
01532 
01533   STRINGIFY(
01534 
01535   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01536   , const unsigned int imageWidth, const unsigned int imageHeight
01537   , const int2 offset, const int polarity, const int matte) {
01538 
01539     int x = get_global_id(0);
01540     int y = get_global_id(1);
01541 
01542     CLPixelType v = inputImage[y*imageWidth+x];
01543 
01544     int2 neighbor, clampedNeighbor;
01545 
01546     neighbor.y = y + offset.y;
01547     neighbor.x = x + offset.x;
01548     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01549     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01550 
01551     CLPixelType r = (clampedNeighbor.x == neighbor.x
01552       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01553     :(CLPixelType)0;
01554 
01555 
01556     neighbor.y = y - offset.y;
01557     neighbor.x = x - offset.x;
01558     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01559     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01560 
01561     CLPixelType s = (clampedNeighbor.x == neighbor.x
01562       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01563     :(CLPixelType)0;
01564 
01565 
01566     int sv[4];
01567     sv[0] = (int)v.x;
01568     sv[1] = (int)v.y;
01569     sv[2] = (int)v.z;
01570     sv[3] = (int)v.w;
01571 
01572     int sr[4];
01573     sr[0] = (int)r.x;
01574     sr[1] = (int)r.y;
01575     sr[2] = (int)r.z;
01576     sr[3] = (int)r.w;
01577 
01578     int ss[4];
01579     ss[0] = (int)s.x;
01580     ss[1] = (int)s.y;
01581     ss[2] = (int)s.z;
01582     ss[3] = (int)s.w;
01583 
01584     if (polarity > 0) {
01585       \n #pragma unroll 4\n
01586       for (unsigned int i = 0; i < 4; i++) {
01587         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
01588         //
01589         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01590         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01591         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01592       }
01593     }
01594     else {
01595       \n #pragma unroll 4\n
01596       for (unsigned int i = 0; i < 4; i++) {
01597         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
01598         //
01599         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01600         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01601       }
01602     }
01603 
01604     v.x = (CLQuantum)sv[0];
01605     v.y = (CLQuantum)sv[1];
01606     v.z = (CLQuantum)sv[2];
01607 
01608     if (matte!=0)
01609       v.w = (CLQuantum)sv[3];
01610 
01611     outputImage[y*imageWidth+x] = v;
01612 
01613     }
01614   )
01615 
01616 /*
01617 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01618 %                                                                             %
01619 %                                                                             %
01620 %                                                                             %
01621 %     E q u a l i z e                                                         %
01622 %                                                                             %
01623 %                                                                             %
01624 %                                                                             %
01625 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01626 */
01627 
01628     STRINGIFY(
01629     /*
01630     */
01631     __kernel void Equalize(__global CLPixelType * restrict im,
01632       const ChannelType channel,  
01633       __global CLPixelType * restrict equalize_map,
01634       const float4 white, const float4 black)
01635       {
01636         const int x = get_global_id(0);  
01637         const int y = get_global_id(1);  
01638         const int columns = get_global_size(0);  
01639         const int c = x + y * columns;
01640 
01641         uint ePos;
01642         CLPixelType oValue, eValue;
01643         CLQuantum red, green, blue, alpha;
01644 
01645         //read from global
01646         oValue=im[c];
01647 
01648         if ((channel & SyncChannels) != 0)
01649         {
01650           if (getRedF4(white) != getRedF4(black))
01651           {
01652             ePos = ScaleQuantumToMap(getRed(oValue)); 
01653             eValue = equalize_map[ePos];
01654             red = getRed(eValue);
01655             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01656             eValue = equalize_map[ePos];
01657             green = getRed(eValue);
01658             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01659             eValue = equalize_map[ePos];
01660             blue = getRed(eValue);
01661             ePos = ScaleQuantumToMap(getAlpha(oValue)); 
01662             eValue = equalize_map[ePos];
01663             alpha = getRed(eValue);
01664  
01665             //write back
01666             im[c]=(CLPixelType)(blue, green, red, alpha);
01667           }
01668 
01669         }
01670 
01671         // for equalizing, we always need all channels?
01672         // otherwise something more
01673 
01674      }
01675     )
01676 
01677 /*
01678 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01679 %                                                                             %
01680 %                                                                             %
01681 %                                                                             %
01682 %     F u n c t i o n                                                         %
01683 %                                                                             %
01684 %                                                                             %
01685 %                                                                             %
01686 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01687 */
01688 
01689   STRINGIFY(
01690   /*
01691   apply FunctionImageChannel(braightness-contrast)
01692   */
01693   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
01694     const unsigned int number_parameters,__constant float *parameters)
01695   {
01696     float result = 0.0f;
01697 
01698     switch (function)
01699     {
01700     case PolynomialFunction:
01701       {
01702         for (unsigned int i=0; i < number_parameters; i++)
01703           result = result*QuantumScale*pixel + parameters[i];
01704         result *= QuantumRange;
01705         break;
01706       }
01707     case SinusoidFunction:
01708       {
01709         float  freq,phase,ampl,bias;
01710         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01711         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
01712         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
01713         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01714         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
01715           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
01716         break;
01717       }
01718     case ArcsinFunction:
01719       {
01720         float  width,range,center,bias;
01721         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01722         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01723         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01724         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01725 
01726         result = 2.0f/width*(QuantumScale*pixel - center);
01727         result = range/MagickPI*asin(result)+bias;
01728         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
01729         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
01730         result *= QuantumRange;
01731         break;
01732       }
01733     case ArctanFunction:
01734       {
01735         float slope,range,center,bias;
01736         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01737         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01738         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01739         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01740         result = MagickPI*slope*(QuantumScale*pixel-center);
01741         result = QuantumRange*(range/MagickPI*atan(result) + bias);
01742         break;
01743       }
01744     case UndefinedFunction:
01745       break;
01746     }
01747     return(ClampToQuantum(result));
01748   }
01749   )
01750 
01751   STRINGIFY(
01752   /*
01753   Improve brightness / contrast of the image
01754   channel : define which channel is improved
01755   function : the function called to enchance the brightness contrast
01756   number_parameters : numbers of parameters 
01757   parameters : the parameter
01758   */
01759   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
01760     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
01761     __constant float *parameters)
01762   {
01763     const unsigned int x = get_global_id(0);
01764     const unsigned int y = get_global_id(1);
01765     const unsigned int columns = get_global_size(0);
01766     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
01767 
01768     float red;
01769     float green;
01770     float blue;
01771     float alpha;
01772 
01773     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
01774 
01775     if ((channel & RedChannel) != 0)
01776       red=ApplyFunction(red, function, number_parameters, parameters);
01777 
01778     if (number_channels > 2)
01779       {
01780         if ((channel & GreenChannel) != 0)
01781           green=ApplyFunction(green, function, number_parameters, parameters);
01782 
01783         if ((channel & BlueChannel) != 0)
01784           blue=ApplyFunction(blue, function, number_parameters, parameters);
01785       }
01786 
01787     if (((number_channels == 4) || (number_channels == 2)) &&
01788         ((channel & AlphaChannel) != 0))
01789       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
01790 
01791     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
01792   }
01793   )
01794 
01795 /*
01796 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01797 %                                                                             %
01798 %                                                                             %
01799 %                                                                             %
01800 %     G r a y s c a l e                                                       %
01801 %                                                                             %
01802 %                                                                             %
01803 %                                                                             %
01804 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01805 */
01806 
01807   STRINGIFY(
01808   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
01809     const unsigned int colorspace,const unsigned int method)
01810   {
01811     const unsigned int x = get_global_id(0);
01812     const unsigned int y = get_global_id(1);
01813     const unsigned int columns = get_global_size(0);
01814     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
01815 
01816     float
01817       blue,
01818       green,
01819       red;
01820 
01821     red=getPixelRed(p);
01822     green=getPixelGreen(p);
01823     blue=getPixelBlue(p);
01824 
01825     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
01826 
01827     setPixelRed(p,intensity);
01828     setPixelGreen(p,intensity);
01829     setPixelBlue(p,intensity);
01830   }
01831   )
01832 
01833 /*
01834 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01835 %                                                                             %
01836 %                                                                             %
01837 %                                                                             %
01838 %     L o c a l C o n t r a s t                                               %
01839 %                                                                             %
01840 %                                                                             %
01841 %                                                                             %
01842 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01843 */
01844 
01845     STRINGIFY(
01846 
01847       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
01848           const int radius, 
01849           const int imageWidth,
01850           const int imageHeight)
01851       {
01852         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
01853 
01854         int x = get_local_id(0);
01855         int y = get_global_id(1);
01856 
01857         if ((x >= imageWidth) || (y >= imageHeight))
01858           return;
01859 
01860         global CLPixelType *src = srcImage + y * imageWidth;
01861 
01862         for (int i = x; i < imageWidth; i += get_local_size(0)) {
01863             float sum = 0.0f;
01864             float weight = 1.0f;
01865 
01866             int j = i - radius;
01867             while ((j + 7) < i) {
01868                 for (int k = 0; k < 8; ++k) // Unroll 8x
01869                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
01870                 weight += 8.0f;
01871                 j+=8;
01872             }
01873             while (j < i) {
01874                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
01875                 weight += 1.0f;
01876                 ++j;
01877             }
01878 
01879             while ((j + 7) < radius + i) {
01880                 for (int k = 0; k < 8; ++k) // Unroll 8x
01881                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
01882                 weight -= 8.0f;
01883                 j+=8;
01884             }
01885             while (j < radius + i) {
01886                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
01887                 weight -= 1.0f;
01888                 ++j;
01889             }
01890 
01891             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
01892         }
01893       }
01894     )
01895 
01896     STRINGIFY(
01897       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
01898           const int radius, 
01899           const float strength,
01900           const int imageWidth,
01901           const int imageHeight)
01902       {
01903         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
01904 
01905         int x = get_global_id(0);
01906         int y = get_global_id(1);
01907 
01908         if ((x >= imageWidth) || (y >= imageHeight))
01909                 return;
01910 
01911         global float *src = blurImage + x;
01912 
01913         float sum = 0.0f;
01914         float weight = 1.0f;
01915 
01916         int j = y - radius;
01917         while ((j + 7) < y) {
01918             for (int k = 0; k < 8; ++k) // Unroll 8x
01919                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
01920             weight += 8.0f;
01921             j+=8;
01922         }
01923         while (j < y) {
01924             sum += weight * src[mirrorBottom(j) * imageWidth];
01925             weight += 1.0f;
01926             ++j;
01927         }
01928 
01929         while ((j + 7) < radius + y) {
01930             for (int k = 0; k < 8; ++k) // Unroll 8x
01931                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
01932             weight -= 8.0f;
01933             j+=8;
01934         }
01935         while (j < radius + y) {
01936             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
01937             weight -= 1.0f;
01938             ++j;
01939         }
01940 
01941         CLPixelType pixel = srcImage[x + y * imageWidth];
01942         float srcVal = dot(RGB, convert_float4(pixel));
01943         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
01944         mult = (srcVal + mult) / srcVal;
01945 
01946         pixel.x = ClampToQuantum(pixel.x * mult);
01947         pixel.y = ClampToQuantum(pixel.y * mult);
01948         pixel.z = ClampToQuantum(pixel.z * mult);
01949 
01950         dstImage[x + y * imageWidth] = pixel;
01951       }
01952     )
01953 
01954 /*
01955 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01956 %                                                                             %
01957 %                                                                             %
01958 %                                                                             %
01959 %     M o d u l a t e                                                         %
01960 %                                                                             %
01961 %                                                                             %
01962 %                                                                             %
01963 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01964 */
01965 
01966   STRINGIFY(
01967 
01968   static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
01969     float *hue, float *saturation, float *lightness)
01970   {
01971   float
01972     c,
01973     tmax,
01974     tmin;
01975 
01976   /*
01977      Convert RGB to HSL colorspace.
01978      */
01979   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
01980   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
01981 
01982   c=tmax-tmin;
01983 
01984   *lightness=(tmax+tmin)/2.0;
01985   if (c <= 0.0)
01986   {
01987     *hue=0.0;
01988     *saturation=0.0;
01989     return;
01990   }
01991 
01992   if (tmax == (QuantumScale*red))
01993   {
01994     *hue=(QuantumScale*green-QuantumScale*blue)/c;
01995     if ((QuantumScale*green) < (QuantumScale*blue))
01996       *hue+=6.0;
01997   }
01998   else
01999     if (tmax == (QuantumScale*green))
02000       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
02001     else
02002       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
02003 
02004   *hue*=60.0/360.0;
02005   if (*lightness <= 0.5)
02006     *saturation=c/(2.0*(*lightness));
02007   else
02008     *saturation=c/(2.0-2.0*(*lightness));
02009   }
02010 
02011   static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
02012       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02013   {
02014     float
02015       b,
02016       c,
02017       g,
02018       h,
02019       tmin,
02020       r,
02021       x;
02022 
02023     /*
02024        Convert HSL to RGB colorspace.
02025        */
02026     h=hue*360.0;
02027     if (lightness <= 0.5)
02028       c=2.0*lightness*saturation;
02029     else
02030       c=(2.0-2.0*lightness)*saturation;
02031     tmin=lightness-0.5*c;
02032     h-=360.0*floor(h/360.0);
02033     h/=60.0;
02034     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
02035     switch ((int) floor(h) % 6)
02036     {
02037       case 0:
02038         {
02039           r=tmin+c;
02040           g=tmin+x;
02041           b=tmin;
02042           break;
02043         }
02044       case 1:
02045         {
02046           r=tmin+x;
02047           g=tmin+c;
02048           b=tmin;
02049           break;
02050         }
02051       case 2:
02052         {
02053           r=tmin;
02054           g=tmin+c;
02055           b=tmin+x;
02056           break;
02057         }
02058       case 3:
02059         {
02060           r=tmin;
02061           g=tmin+x;
02062           b=tmin+c;
02063           break;
02064         }
02065       case 4:
02066         {
02067           r=tmin+x;
02068           g=tmin;
02069           b=tmin+c;
02070           break;
02071         }
02072       case 5:
02073         {
02074           r=tmin+c;
02075           g=tmin;
02076           b=tmin+x;
02077           break;
02078         }
02079       default:
02080         {
02081           r=0.0;
02082           g=0.0;
02083           b=0.0;
02084         }
02085     }
02086     *red=ClampToQuantum(QuantumRange*r);
02087     *green=ClampToQuantum(QuantumRange*g);
02088     *blue=ClampToQuantum(QuantumRange*b);
02089   }
02090 
02091   static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
02092     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02093   {
02094     float
02095       hue,
02096       lightness,
02097       saturation;
02098 
02099     /*
02100     Increase or decrease color lightness, saturation, or hue.
02101     */
02102     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
02103     hue+=0.5*(0.01*percent_hue-1.0);
02104     while (hue < 0.0)
02105       hue+=1.0;
02106     while (hue >= 1.0)
02107       hue-=1.0;
02108     saturation*=0.01*percent_saturation;
02109     lightness*=0.01*percent_lightness;
02110     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
02111   }
02112 
02113   __kernel void Modulate(__global CLPixelType *im, 
02114     const float percent_brightness, 
02115     const float percent_hue, 
02116     const float percent_saturation, 
02117     const int colorspace)
02118   {
02119 
02120     const int x = get_global_id(0);  
02121     const int y = get_global_id(1);
02122     const int columns = get_global_size(0);
02123     const int c = x + y * columns;
02124 
02125     CLPixelType pixel = im[c];
02126 
02127     CLQuantum
02128         blue,
02129         green,
02130         red;
02131 
02132     red=getRed(pixel);
02133     green=getGreen(pixel);
02134     blue=getBlue(pixel);
02135 
02136     switch (colorspace)
02137     {
02138       case HSLColorspace:
02139       default:
02140         {
02141           ModulateHSL(percent_hue, percent_saturation, percent_brightness, 
02142               &red, &green, &blue);
02143         }
02144 
02145     }
02146 
02147     CLPixelType filteredPixel;
02148    
02149     setRed(&filteredPixel, red);
02150     setGreen(&filteredPixel, green);
02151     setBlue(&filteredPixel, blue);
02152     filteredPixel.w = pixel.w;
02153 
02154     im[c] = filteredPixel;
02155   }
02156   )
02157 
02158 /*
02159 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02160 %                                                                             %
02161 %                                                                             %
02162 %                                                                             %
02163 %     M o t i o n B l u r                                                     %
02164 %                                                                             %
02165 %                                                                             %
02166 %                                                                             %
02167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02168 */
02169 
02170   STRINGIFY(
02171     __kernel 
02172     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
02173                     const unsigned int imageWidth, const unsigned int imageHeight,
02174                     const __global float *filter, const unsigned int width, const __global int2* offset,
02175                     const float4 bias,
02176                     const ChannelType channel, const unsigned int matte) {
02177 
02178       int2 currentPixel;
02179       currentPixel.x = get_global_id(0);
02180       currentPixel.y = get_global_id(1);
02181 
02182       if (currentPixel.x >= imageWidth
02183           || currentPixel.y >= imageHeight)
02184           return;
02185 
02186       float4 pixel;
02187       pixel.x = (float)bias.x;
02188       pixel.y = (float)bias.y;
02189       pixel.z = (float)bias.z;
02190       pixel.w = (float)bias.w;
02191 
02192       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
02193         
02194         for (int i = 0; i < width; i++) {
02195           // only support EdgeVirtualPixelMethod through ClampToCanvas
02196           // TODO: implement other virtual pixel method
02197           int2 samplePixel = currentPixel + offset[i];
02198           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02199           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02200           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02201 
02202           pixel.x += (filter[i] * (float)samplePixelValue.x);
02203           pixel.y += (filter[i] * (float)samplePixelValue.y);
02204           pixel.z += (filter[i] * (float)samplePixelValue.z);
02205           pixel.w += (filter[i] * (float)samplePixelValue.w);
02206         }
02207 
02208         CLPixelType outputPixel;
02209         outputPixel.x = ClampToQuantum(pixel.x);
02210         outputPixel.y = ClampToQuantum(pixel.y);
02211         outputPixel.z = ClampToQuantum(pixel.z);
02212         outputPixel.w = ClampToQuantum(pixel.w);
02213         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02214       }
02215       else {
02216 
02217         float gamma = 0.0f;
02218         for (int i = 0; i < width; i++) {
02219           // only support EdgeVirtualPixelMethod through ClampToCanvas
02220           // TODO: implement other virtual pixel method
02221           int2 samplePixel = currentPixel + offset[i];
02222           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02223           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02224 
02225           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02226 
02227           float alpha = QuantumScale*samplePixelValue.w;
02228           float k = filter[i];
02229           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
02230           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
02231           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
02232 
02233           pixel.w += k * alpha * samplePixelValue.w;
02234 
02235           gamma+=k*alpha;
02236         }
02237         gamma = PerceptibleReciprocal(gamma);
02238         pixel.xyz = gamma*pixel.xyz;
02239 
02240         CLPixelType outputPixel;
02241         outputPixel.x = ClampToQuantum(pixel.x);
02242         outputPixel.y = ClampToQuantum(pixel.y);
02243         outputPixel.z = ClampToQuantum(pixel.z);
02244         outputPixel.w = ClampToQuantum(pixel.w);
02245         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02246       }
02247     }
02248   )
02249 
02250 /*
02251 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02252 %                                                                             %
02253 %                                                                             %
02254 %                                                                             %
02255 %     R e s i z e                                                             %
02256 %                                                                             %
02257 %                                                                             %
02258 %                                                                             %
02259 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02260 */
02261 
02262   STRINGIFY(
02263   // Based on Box from resize.c
02264   float BoxResizeFilter(const float x)
02265   {
02266     return 1.0f;
02267   }
02268   )
02269     
02270   STRINGIFY(
02271   // Based on CubicBC from resize.c
02272   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
02273   {
02274     /*
02275     Cubic Filters using B,C determined values:
02276     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
02277     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
02278     Spline              B = 1   C = 0    B-Spline Gaussian approximation
02279     Hermite             B = 0   C = 0    B-Spline interpolator
02280 
02281     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
02282     Graphics Computer Graphics, Volume 22, Number 4, August 1988
02283     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
02284     Mitchell.pdf.
02285 
02286     Coefficents are determined from B,C values:
02287     P0 = (  6 - 2*B       )/6 = coeff[0]
02288     P1 =         0
02289     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
02290     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
02291     Q0 = (      8*B +24*C )/6 = coeff[3]
02292     Q1 = (    -12*B -48*C )/6 = coeff[4]
02293     Q2 = (      6*B +30*C )/6 = coeff[5]
02294     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
02295 
02296     which are used to define the filter:
02297 
02298     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
02299     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
02300 
02301     which ensures function is continuous in value and derivative (slope).
02302     */
02303     if (x < 1.0)
02304       return(resizeFilterCoefficients[0]+x*(x*
02305       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
02306     if (x < 2.0)
02307       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
02308       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
02309     return(0.0);
02310   }
02311   )
02312 
02313   STRINGIFY(
02314   float Sinc(const float x)
02315   {
02316     if (x != 0.0f)
02317     {
02318       const float alpha=(float) (MagickPI*x);
02319       return sinpi(x)/alpha;
02320     }
02321     return(1.0f);
02322   }
02323   )
02324 
02325   STRINGIFY(
02326   float Triangle(const float x)
02327   {
02328     /*
02329     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
02330     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
02331     for Sinc().
02332     */
02333     return ((x<1.0f)?(1.0f-x):0.0f);
02334   }
02335   )
02336 
02337 
02338   STRINGIFY(
02339   float Hann(const float x)
02340   {
02341     /*
02342     Cosine window function:
02343       0.5+0.5*cos(pi*x).
02344     */
02345     const float cosine=cos((MagickPI*x));
02346     return(0.5f+0.5f*cosine);
02347   }
02348   )
02349 
02350   STRINGIFY(
02351   float Hamming(const float x)
02352   {
02353     /*
02354       Offset cosine window function:
02355        .54 + .46 cos(pi x).
02356     */
02357     const float cosine=cos((MagickPI*x));
02358     return(0.54f+0.46f*cosine);
02359   }
02360   )
02361 
02362   STRINGIFY(
02363   float Blackman(const float x)
02364   {
02365     /*
02366       Blackman: 2nd order cosine windowing function:
02367         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
02368 
02369       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
02370       five flops.
02371     */
02372     const float cosine=cos((MagickPI*x));
02373     return(0.34f+cosine*(0.5f+cosine*0.16f));
02374   }
02375   )
02376 
02377   STRINGIFY(
02378   static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
02379   {
02380     switch (filterType)
02381     {
02382     /* Call Sinc even for SincFast to get better precision on GPU 
02383        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
02384     case SincWeightingFunction:
02385     case SincFastWeightingFunction:
02386       return Sinc(x);
02387     case CubicBCWeightingFunction:
02388       return CubicBC(x,filterCoefficients);
02389     case BoxWeightingFunction:
02390       return BoxResizeFilter(x);
02391     case TriangleWeightingFunction:
02392       return Triangle(x);
02393     case HannWeightingFunction:
02394       return Hann(x);
02395     case HammingWeightingFunction:
02396       return Hamming(x);
02397     case BlackmanWeightingFunction:
02398       return Blackman(x);
02399 
02400     default:
02401       return 0.0f;
02402     }
02403   }
02404   )
02405 
02406 
02407   STRINGIFY(
02408   static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
02409            , const ResizeWeightingFunctionType resizeWindowType
02410            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
02411   {
02412     float scale;
02413     float xBlur = fabs(x/resizeFilterBlur);
02414     if (resizeWindowSupport < MagickEpsilon
02415         || resizeWindowType == BoxWeightingFunction)
02416     {
02417       scale = 1.0f;
02418     }
02419     else
02420     {
02421       scale = resizeFilterScale;
02422       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
02423     }
02424     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
02425     return weight;
02426   }
02427 
02428   )
02429 
02430   ;
02431   const char *accelerateKernels2 =
02432 
02433   STRINGIFY(
02434 
02435   static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02436     return (numWorkItems/pixelPerWorkgroup);
02437   }
02438 
02439   // returns the index of the pixel for the current workitem to compute.
02440   // returns -1 if this workitem doesn't need to participate in any computation
02441   static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02442     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
02443     int pixelIndex = itemID/numWorkItemsPerPixel;
02444     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
02445     return pixelIndex;
02446   }
02447  
02448   )
02449 
02450   STRINGIFY(
02451   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
02452     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
02453       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
02454       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
02455       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
02456       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
02457       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
02458       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
02459       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
02460   {
02461     // calculate the range of resized image pixels computed by this workgroup
02462     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
02463     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
02464     const unsigned int actualNumPixelToCompute = stopX - startX;
02465 
02466     // calculate the range of input image pixels to cache
02467     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
02468     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02469     scale = PerceptibleReciprocal(scale);
02470 
02471     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
02472     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
02473 
02474     // cache the input pixels into local memory
02475     const unsigned int y = get_global_id(1);
02476     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
02477     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
02478     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
02479     wait_group_events(1, &e);
02480 
02481     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
02482     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02483     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02484     {
02485       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
02486       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
02487       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
02488 
02489       // determine which resized pixel computed by this workitem
02490       const unsigned int itemID = get_local_id(0);
02491       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
02492 
02493       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
02494 
02495       float4 filteredPixel = (float4)0.0f;
02496       float density = 0.0f;
02497       float gamma = 0.0f;
02498       // -1 means this workitem doesn't participate in the computation
02499       if (pixelIndex != -1)
02500       {
02501         // x coordinated of the resized pixel computed by this workitem
02502         const int x = chunkStartX + pixelIndex;
02503 
02504         // calculate how many steps required for this pixel
02505         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
02506         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
02507         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
02508         const unsigned int n = stop - start;
02509 
02510         // calculate how many steps this workitem will contribute
02511         unsigned int numStepsPerWorkItem = n / numItems;
02512         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
02513 
02514         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
02515         if (startStep < n)
02516         {
02517           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
02518 
02519           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
02520           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
02521           {
02522             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
02523               (ResizeWeightingFunctionType) resizeFilterType,
02524               (ResizeWeightingFunctionType) resizeWindowType,
02525               resizeFilterScale, resizeFilterWindowSupport,
02526               resizeFilterBlur, scale*(start + i - bisect + 0.5));
02527 
02528             float4 cp = (float4)0.0f;
02529 
02530             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
02531             cp.x = (float) *(p);
02532             if (number_channels > 2)
02533             {
02534               cp.y = (float) *(p + 1);
02535               cp.z = (float) *(p + 2);
02536             }
02537 
02538             if (alpha_index != 0)
02539             {
02540               cp.w = (float) *(p + alpha_index);
02541 
02542               float alpha = weight * QuantumScale * cp.w;
02543 
02544               filteredPixel.x += alpha * cp.x;
02545               filteredPixel.y += alpha * cp.y;
02546               filteredPixel.z += alpha * cp.z;
02547               filteredPixel.w += weight * cp.w;
02548               gamma += alpha;
02549             }
02550             else
02551               filteredPixel += ((float4) weight)*cp;
02552 
02553             density += weight;
02554           }
02555         }
02556       }
02557 
02558       // initialize the accumulators to zero
02559       if (itemID < actualNumPixelInThisChunk) {
02560         outputPixelCache[itemID] = (float4)0.0f;
02561         densityCache[itemID] = 0.0f;
02562         if (alpha_index != 0)
02563           gammaCache[itemID] = 0.0f;
02564       }
02565       barrier(CLK_LOCAL_MEM_FENCE);
02566 
02567       // accumulatte the filtered pixel value and the density
02568       for (unsigned int i = 0; i < numItems; i++) {
02569         if (pixelIndex != -1) {
02570           if (itemID%numItems == i) {
02571             outputPixelCache[pixelIndex]+=filteredPixel;
02572             densityCache[pixelIndex]+=density;
02573             if (alpha_index != 0)
02574               gammaCache[pixelIndex]+=gamma;
02575           }
02576         }
02577         barrier(CLK_LOCAL_MEM_FENCE);
02578       }
02579 
02580       if (itemID < actualNumPixelInThisChunk)
02581       {
02582         float4 filteredPixel = outputPixelCache[itemID];
02583 
02584         float gamma = 0.0f;
02585         if (alpha_index != 0)
02586           gamma = gammaCache[itemID];
02587 
02588         float density = densityCache[itemID];
02589         if ((density != 0.0f) && (density != 1.0f))
02590         {
02591           density = PerceptibleReciprocal(density);
02592           filteredPixel *= (float4) density;
02593           if (alpha_index != 0)
02594             gamma *= density;
02595         }
02596 
02597         if (alpha_index != 0)
02598         {
02599           gamma = PerceptibleReciprocal(gamma);
02600           filteredPixel.x *= gamma;
02601           filteredPixel.y *= gamma;
02602           filteredPixel.z *= gamma;
02603         }
02604 
02605         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
02606       }
02607     }
02608   }
02609   )
02610 
02611 
02612   STRINGIFY(
02613  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
02614     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
02615       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
02616       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
02617       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
02618       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
02619       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
02620       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
02621       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
02622   {
02623     // calculate the range of resized image pixels computed by this workgroup
02624     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
02625     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
02626     const unsigned int actualNumPixelToCompute = stopY - startY;
02627 
02628     // calculate the range of input image pixels to cache
02629     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
02630     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02631     scale = PerceptibleReciprocal(scale);
02632 
02633     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
02634     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
02635 
02636     // cache the input pixels into local memory
02637     const unsigned int x = get_global_id(0);
02638     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
02639     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
02640     unsigned int stride = inputColumns * number_channels;
02641     for (unsigned int i = 0; i < number_channels; i++)
02642     {
02643       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
02644       wait_group_events(1,&e);
02645     }
02646 
02647     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
02648     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02649     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02650     {
02651       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
02652       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
02653       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
02654 
02655       // determine which resized pixel computed by this workitem
02656       const unsigned int itemID = get_local_id(1);
02657       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
02658 
02659       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
02660 
02661       float4 filteredPixel = (float4)0.0f;
02662       float density = 0.0f;
02663       float gamma = 0.0f;
02664       // -1 means this workitem doesn't participate in the computation
02665       if (pixelIndex != -1)
02666       {
02667         // x coordinated of the resized pixel computed by this workitem
02668         const int y = chunkStartY + pixelIndex;
02669 
02670         // calculate how many steps required for this pixel
02671         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
02672         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
02673         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
02674         const unsigned int n = stop - start;
02675 
02676         // calculate how many steps this workitem will contribute
02677         unsigned int numStepsPerWorkItem = n / numItems;
02678         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
02679 
02680         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
02681         if (startStep < n)
02682         {
02683           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
02684 
02685           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
02686           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
02687           {
02688             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
02689               (ResizeWeightingFunctionType) resizeFilterType,
02690               (ResizeWeightingFunctionType) resizeWindowType,
02691               resizeFilterScale, resizeFilterWindowSupport,
02692               resizeFilterBlur, scale*(start + i - bisect + 0.5));
02693 
02694             float4 cp = (float4)0.0f;
02695 
02696             __local CLQuantum *p = inputImageCache + cacheIndex;
02697             cp.x = (float) *(p);
02698             if (number_channels > 2)
02699             {
02700               cp.y = (float) *(p + rangeLength);
02701               cp.z = (float) *(p + (rangeLength * 2));
02702             }
02703 
02704             if (alpha_index != 0)
02705             {
02706               cp.w = (float) *(p + (rangeLength * alpha_index));
02707 
02708               float alpha = weight * QuantumScale * cp.w;
02709 
02710               filteredPixel.x += alpha * cp.x;
02711               filteredPixel.y += alpha * cp.y;
02712               filteredPixel.z += alpha * cp.z;
02713               filteredPixel.w += weight * cp.w;
02714               gamma += alpha;
02715             }
02716             else
02717               filteredPixel += ((float4) weight)*cp;
02718 
02719             density += weight;
02720           }
02721         }
02722       }
02723 
02724       // initialize the accumulators to zero
02725       if (itemID < actualNumPixelInThisChunk) {
02726         outputPixelCache[itemID] = (float4)0.0f;
02727         densityCache[itemID] = 0.0f;
02728         if (alpha_index != 0)
02729           gammaCache[itemID] = 0.0f;
02730       }
02731       barrier(CLK_LOCAL_MEM_FENCE);
02732 
02733       // accumulatte the filtered pixel value and the density
02734       for (unsigned int i = 0; i < numItems; i++) {
02735         if (pixelIndex != -1) {
02736           if (itemID%numItems == i) {
02737             outputPixelCache[pixelIndex]+=filteredPixel;
02738             densityCache[pixelIndex]+=density;
02739             if (alpha_index != 0)
02740               gammaCache[pixelIndex]+=gamma;
02741           }
02742         }
02743         barrier(CLK_LOCAL_MEM_FENCE);
02744       }
02745 
02746       if (itemID < actualNumPixelInThisChunk)
02747       {
02748         float4 filteredPixel = outputPixelCache[itemID];
02749 
02750         float gamma = 0.0f;
02751         if (alpha_index != 0)
02752           gamma = gammaCache[itemID];
02753 
02754         float density = densityCache[itemID];
02755         if ((density != 0.0f) && (density != 1.0f))
02756         {
02757           density = PerceptibleReciprocal(density);
02758           filteredPixel *= (float4) density;
02759           if (alpha_index != 0)
02760             gamma *= density;
02761         }
02762 
02763         if (alpha_index != 0)
02764         {
02765           gamma = PerceptibleReciprocal(gamma);
02766           filteredPixel.x *= gamma;
02767           filteredPixel.y *= gamma;
02768           filteredPixel.z *= gamma;
02769         }
02770 
02771         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
02772       }
02773     }
02774   }
02775   )
02776 
02777 /*
02778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02779 %                                                                             %
02780 %                                                                             %
02781 %                                                                             %
02782 %     R o t a t i o n a l B l u r                                             %
02783 %                                                                             %
02784 %                                                                             %
02785 %                                                                             %
02786 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02787 */
02788 
02789   STRINGIFY(
02790   __kernel void RotationalBlur(const __global CLQuantum *image,
02791     const unsigned int number_channels,const unsigned int channel,
02792     const float2 blurCenter,__constant float *cos_theta,
02793     __constant float *sin_theta,const unsigned int cossin_theta_size,
02794     __global CLQuantum *filteredImage)
02795   {
02796     const int x = get_global_id(0);
02797     const int y = get_global_id(1);
02798     const int columns = get_global_size(0);
02799     const int rows = get_global_size(1);
02800     unsigned int step = 1;
02801     float center_x = (float) x - blurCenter.x;
02802     float center_y = (float) y - blurCenter.y;
02803     float radius = hypot(center_x, center_y);
02804 
02805     float blur_radius = hypot(blurCenter.x, blurCenter.y);
02806 
02807     if (radius > MagickEpsilon)
02808     {
02809       step = (unsigned int) (blur_radius / radius);
02810       if (step == 0)
02811         step = 1;
02812       if (step >= cossin_theta_size)
02813         step = cossin_theta_size-1;
02814     }
02815 
02816     float4 result = 0.0f;
02817     float normalize = 0.0f;
02818     float gamma = 0.0f;
02819 
02820     for (unsigned int i=0; i<cossin_theta_size; i+=step)
02821     {
02822       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
02823       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
02824 
02825       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
02826 
02827       if ((number_channels == 4) || (number_channels == 2))
02828       {
02829         float alpha = (float)(QuantumScale*pixel.w);
02830 
02831         gamma += alpha;
02832 
02833         result.x += alpha * pixel.x;
02834         result.y += alpha * pixel.y;
02835         result.z += alpha * pixel.z;
02836         result.w += pixel.w;
02837       }
02838       else
02839         result += pixel;
02840 
02841       normalize += 1.0f;
02842     }
02843 
02844     normalize = PerceptibleReciprocal(normalize);
02845 
02846     if ((number_channels == 4) || (number_channels == 2))
02847     {
02848       gamma = PerceptibleReciprocal(gamma);
02849       result.x *= gamma;
02850       result.y *= gamma;
02851       result.z *= gamma;
02852       result.w *= normalize;
02853     }
02854     else
02855       result *= normalize;
02856 
02857     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
02858   }
02859   )
02860 
02861 /*
02862 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02863 %                                                                             %
02864 %                                                                             %
02865 %                                                                             %
02866 %     U n s h a r p M a s k                                                   %
02867 %                                                                             %
02868 %                                                                             %
02869 %                                                                             %
02870 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02871 */
02872 
02873   STRINGIFY(
02874   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
02875     const __global float4 *blurRowData,const unsigned int number_channels,
02876     const ChannelType channel,const unsigned int columns,
02877     const unsigned int rows,__local float4* cachedData,
02878     __local float* cachedFilter,const __global float *filter,
02879     const unsigned int width,const float gain, const float threshold,
02880     __global CLQuantum *filteredImage)
02881   {
02882     const unsigned int radius = (width-1)/2;
02883 
02884     // cache the pixel shared by the workgroup
02885     const int groupX = get_group_id(0);
02886     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
02887     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
02888 
02889     if ((groupStartY >= 0) && (groupStopY < rows))
02890     {
02891       event_t e = async_work_group_strided_copy(cachedData,
02892         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
02893       wait_group_events(1,&e);
02894     }
02895     else
02896     {
02897       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
02898         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
02899 
02900       barrier(CLK_LOCAL_MEM_FENCE);
02901     }
02902     // cache the filter as well
02903     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
02904     wait_group_events(1,&e);
02905 
02906     // only do the work if this is not a patched item
02907     const int cy = get_global_id(1);
02908 
02909     if (cy < rows)
02910     {
02911       float4 blurredPixel = (float4) 0.0f;
02912 
02913       int i = 0;
02914 
02915       for ( ; i+7 < width; )
02916       {
02917         for (int j=0; j < 8; j++, i++)
02918           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
02919       }
02920 
02921       for ( ; i < width; i++)
02922         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
02923 
02924       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
02925       float4 outputPixel = inputImagePixel - blurredPixel;
02926 
02927       float quantumThreshold = QuantumRange*threshold;
02928 
02929       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
02930       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
02931 
02932       //write back
02933       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
02934     }
02935   }
02936   )
02937 
02938   STRINGIFY(
02939   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
02940     const ChannelType channel,__constant float *filter,const unsigned int width,
02941     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
02942     const float gain,const float threshold,__global CLQuantum *filteredImage)
02943   {
02944     const unsigned int x = get_global_id(0);
02945     const unsigned int y = get_global_id(1);
02946 
02947     const unsigned int radius = (width - 1) / 2;
02948 
02949     int row = y - radius;
02950     int baseRow = get_group_id(1) * get_local_size(1) - radius;
02951     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
02952 
02953     while (row < endRow) {
02954       int srcy = (row < 0) ? -row : row; // mirror pad
02955       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
02956 
02957       float4 value = 0.0f;
02958 
02959       int ix = x - radius;
02960       int i = 0;
02961 
02962       while (i + 7 < width) {
02963         for (int j = 0; j < 8; ++j) { // unrolled
02964           int srcx = ix + j;
02965           srcx = (srcx < 0) ? -srcx : srcx;
02966           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
02967           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
02968         }
02969         ix += 8;
02970         i += 8;
02971       }
02972 
02973       while (i < width) {
02974         int srcx = (ix < 0) ? -ix : ix; // mirror pad
02975         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
02976         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
02977         ++i;
02978         ++ix;
02979       }
02980       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
02981       row += get_local_size(1);
02982     }
02983 
02984     barrier(CLK_LOCAL_MEM_FENCE);
02985 
02986     const int px = get_local_id(0);
02987     const int py = get_local_id(1);
02988     const int prp = get_local_size(0);
02989     float4 value = (float4)(0.0f);
02990 
02991     int i = 0;
02992     while (i + 7 < width) {
02993       for (int j = 0; j < 8; ++j) // unrolled
02994         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
02995       i += 8;
02996     }
02997     while (i < width) {
02998       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
02999       ++i;
03000     }
03001 
03002     if ((x < columns) && (y < rows)) {
03003       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
03004       float4 diff = srcPixel - value;
03005 
03006       float quantumThreshold = QuantumRange*threshold;
03007 
03008       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
03009       value = select(srcPixel + diff * gain, srcPixel, mask);
03010 
03011       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
03012     }
03013   }
03014   )
03015 
03016   STRINGIFY(
03017     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
03018     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
03019       const unsigned int number_channels,const unsigned int max_channels,
03020       const float threshold,const int passes,const unsigned int imageWidth,
03021       const unsigned int imageHeight)
03022   {
03023     const int pad = (1 << (passes - 1));
03024     const int tileSize = 64;
03025     const int tileRowPixels = 64;
03026     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
03027 
03028     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
03029 
03030     local float buffer[64 * 64];
03031 
03032     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
03033     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
03034 
03035     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03036       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
03037                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
03038 
03039       for (int channel = 0; channel < max_channels; ++channel)
03040         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
03041     }
03042 
03043     for (int channel = 0; channel < max_channels; ++channel) {
03044       // Load LDS
03045       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03046         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
03047 
03048       // Process
03049 
03050       float tmp[16];
03051       float accum[16];
03052       float pixel;
03053 
03054       for (int i = 0; i < 16; i++)
03055         accum[i]=0.0f;
03056 
03057       for (int pass = 0; pass < passes; ++pass) {
03058         const int radius = 1 << pass;
03059         const int x = get_local_id(0);
03060         const float thresh = threshold * noise[pass];
03061 
03062         // Apply horizontal hat
03063         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03064           const int offset = i * tileRowPixels;
03065           if (pass == 0)
03066             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
03067           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
03068           barrier(CLK_LOCAL_MEM_FENCE);
03069           buffer[x + offset] = pixel;
03070         }
03071         barrier(CLK_LOCAL_MEM_FENCE);
03072 
03073         // Apply vertical hat
03074         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03075           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
03076           float delta = tmp[i / 4] - pixel;
03077           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
03078           if (delta < -thresh)
03079             delta += thresh;
03080           else if (delta > thresh)
03081             delta -= thresh;
03082           else
03083             delta = 0;
03084           accum[i / 4] += delta;
03085         }
03086         barrier(CLK_LOCAL_MEM_FENCE);
03087 
03088         if (pass < passes - 1)
03089           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03090             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
03091         else  // last pass
03092           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03093             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
03094         barrier(CLK_LOCAL_MEM_FENCE);
03095       }
03096 
03097       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03098         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
03099 
03100       barrier(CLK_LOCAL_MEM_FENCE);
03101     }
03102 
03103     // Write from stage to output
03104 
03105     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
03106       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03107         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
03108           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
03109           for (int channel = 0; channel < max_channels; ++channel) {
03110             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
03111           }
03112         }
03113       }
03114     }
03115   }
03116   )
03117 
03118   ;
03119 
03120 #endif // MAGICKCORE_OPENCL_SUPPORT
03121 
03122 #if defined(__cplusplus) || defined(c_plusplus)
03123 }
03124 #endif
03125 
03126 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H

Generated on 21 Sep 2020 for MagickCore by  doxygen 1.6.1