accelerate-kernels-private.h

Go to the documentation of this file.
00001 /*
00002   Copyright 1999-2020 ImageMagick Studio LLC, a non-profit organization
00003   dedicated to making software imaging solutions freely available.
00004   
00005   You may not use this file except in compliance with the License.  You may
00006   obtain a copy of the License at
00007   
00008     https://imagemagick.org/script/license.php
00009   
00010   Unless required by applicable law or agreed to in writing, software
00011   distributed under the License is distributed on an "AS IS" BASIS,
00012   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013   See the License for the specific language governing permissions and
00014   limitations under the License.
00015 
00016   MagickCore private kernels for accelerated functions.
00017 */
00018 
00019 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00020 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00021 
00022 #if defined(__cplusplus) || defined(c_plusplus)
00023 extern "C" {
00024 #endif
00025 
00026 #if defined(MAGICKCORE_OPENCL_SUPPORT)
00027 
00028 /*
00029   Define declarations.
00030 */
00031 #define OPENCL_DEFINE(VAR,...)  "\n #""define " #VAR " " #__VA_ARGS__ " \n"
00032 #define OPENCL_ELIF(...)        "\n #""elif " #__VA_ARGS__ " \n"
00033 #define OPENCL_ELSE()           "\n #""else " " \n"
00034 #define OPENCL_ENDIF()          "\n #""endif " " \n"
00035 #define OPENCL_IF(...)          "\n #""if " #__VA_ARGS__ " \n"
00036 #define STRINGIFY(...) #__VA_ARGS__ "\n"
00037 
00038 const char *accelerateKernels =
00039 
00040 /*
00041   Define declarations.
00042 */
00043   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
00044   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
00045   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
00046   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
00047   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
00048   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
00049   OPENCL_DEFINE(SigmaRandom, (attenuate))
00050   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
00051   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
00052   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
00053 
00054 /*
00055   Typedef declarations.
00056 */
00057   STRINGIFY(
00058     typedef enum
00059     {
00060       UndefinedColorspace,
00061       CMYColorspace,           /* negated linear RGB colorspace */
00062       CMYKColorspace,          /* CMY with Black separation */
00063       GRAYColorspace,          /* Single Channel greyscale (non-linear) image */
00064       HCLColorspace,
00065       HCLpColorspace,
00066       HSBColorspace,
00067       HSIColorspace,
00068       HSLColorspace,
00069       HSVColorspace,           /* alias for HSB */
00070       HWBColorspace,
00071       LabColorspace,
00072       LCHColorspace,           /* alias for LCHuv */
00073       LCHabColorspace,         /* Cylindrical (Polar) Lab */
00074       LCHuvColorspace,         /* Cylindrical (Polar) Luv */
00075       LogColorspace,
00076       LMSColorspace,
00077       LuvColorspace,
00078       OHTAColorspace,
00079       Rec601YCbCrColorspace,
00080       Rec709YCbCrColorspace,
00081       RGBColorspace,           /* Linear RGB colorspace */
00082       scRGBColorspace,         /* ??? */
00083       sRGBColorspace,          /* Default: non-linear sRGB colorspace */
00084       TransparentColorspace,
00085       xyYColorspace,
00086       XYZColorspace,           /* IEEE Color Reference colorspace */
00087       YCbCrColorspace,
00088       YCCColorspace,
00089       YDbDrColorspace,
00090       YIQColorspace,
00091       YPbPrColorspace,
00092       YUVColorspace,
00093       LinearGRAYColorspace     /* Single Channel greyscale (linear) image */
00094     } ColorspaceType;
00095   )
00096 
00097   STRINGIFY(
00098     typedef enum
00099     {
00100       UndefinedCompositeOp,
00101       AlphaCompositeOp,
00102       AtopCompositeOp,
00103       BlendCompositeOp,
00104       BlurCompositeOp,
00105       BumpmapCompositeOp,
00106       ChangeMaskCompositeOp,
00107       ClearCompositeOp,
00108       ColorBurnCompositeOp,
00109       ColorDodgeCompositeOp,
00110       ColorizeCompositeOp,
00111       CopyBlackCompositeOp,
00112       CopyBlueCompositeOp,
00113       CopyCompositeOp,
00114       CopyCyanCompositeOp,
00115       CopyGreenCompositeOp,
00116       CopyMagentaCompositeOp,
00117       CopyAlphaCompositeOp,
00118       CopyRedCompositeOp,
00119       CopyYellowCompositeOp,
00120       DarkenCompositeOp,
00121       DarkenIntensityCompositeOp,
00122       DifferenceCompositeOp,
00123       DisplaceCompositeOp,
00124       DissolveCompositeOp,
00125       DistortCompositeOp,
00126       DivideDstCompositeOp,
00127       DivideSrcCompositeOp,
00128       DstAtopCompositeOp,
00129       DstCompositeOp,
00130       DstInCompositeOp,
00131       DstOutCompositeOp,
00132       DstOverCompositeOp,
00133       ExclusionCompositeOp,
00134       HardLightCompositeOp,
00135       HardMixCompositeOp,
00136       HueCompositeOp,
00137       InCompositeOp,
00138       IntensityCompositeOp,
00139       LightenCompositeOp,
00140       LightenIntensityCompositeOp,
00141       LinearBurnCompositeOp,
00142       LinearDodgeCompositeOp,
00143       LinearLightCompositeOp,
00144       LuminizeCompositeOp,
00145       MathematicsCompositeOp,
00146       MinusDstCompositeOp,
00147       MinusSrcCompositeOp,
00148       ModulateCompositeOp,
00149       ModulusAddCompositeOp,
00150       ModulusSubtractCompositeOp,
00151       MultiplyCompositeOp,
00152       NoCompositeOp,
00153       OutCompositeOp,
00154       OverCompositeOp,
00155       OverlayCompositeOp,
00156       PegtopLightCompositeOp,
00157       PinLightCompositeOp,
00158       PlusCompositeOp,
00159       ReplaceCompositeOp,
00160       SaturateCompositeOp,
00161       ScreenCompositeOp,
00162       SoftLightCompositeOp,
00163       SrcAtopCompositeOp,
00164       SrcCompositeOp,
00165       SrcInCompositeOp,
00166       SrcOutCompositeOp,
00167       SrcOverCompositeOp,
00168       ThresholdCompositeOp,
00169       VividLightCompositeOp,
00170       XorCompositeOp,
00171       StereoCompositeOp
00172     } CompositeOperator;
00173   )
00174 
00175   STRINGIFY(
00176     typedef enum
00177     {
00178       UndefinedFunction,
00179       ArcsinFunction,
00180       ArctanFunction,
00181       PolynomialFunction,
00182       SinusoidFunction
00183     } MagickFunction;
00184   )
00185 
00186   STRINGIFY(
00187     typedef enum
00188     {
00189       UndefinedNoise,
00190       UniformNoise,
00191       GaussianNoise,
00192       MultiplicativeGaussianNoise,
00193       ImpulseNoise,
00194       LaplacianNoise,
00195       PoissonNoise,
00196       RandomNoise
00197     } NoiseType;
00198   )
00199 
00200   STRINGIFY(
00201     typedef enum
00202     {
00203       UndefinedPixelIntensityMethod = 0,
00204       AveragePixelIntensityMethod,
00205       BrightnessPixelIntensityMethod,
00206       LightnessPixelIntensityMethod,
00207       MSPixelIntensityMethod,
00208       Rec601LumaPixelIntensityMethod,
00209       Rec601LuminancePixelIntensityMethod,
00210       Rec709LumaPixelIntensityMethod,
00211       Rec709LuminancePixelIntensityMethod,
00212       RMSPixelIntensityMethod
00213     } PixelIntensityMethod;
00214   )
00215 
00216   STRINGIFY(
00217     typedef enum
00218     {
00219       BoxWeightingFunction = 0,
00220       TriangleWeightingFunction,
00221       CubicBCWeightingFunction,
00222       HannWeightingFunction,
00223       HammingWeightingFunction,
00224       BlackmanWeightingFunction,
00225       GaussianWeightingFunction,
00226       QuadraticWeightingFunction,
00227       JincWeightingFunction,
00228       SincWeightingFunction,
00229       SincFastWeightingFunction,
00230       KaiserWeightingFunction,
00231       WelchWeightingFunction,
00232       BohmanWeightingFunction,
00233       LagrangeWeightingFunction,
00234       CosineWeightingFunction
00235     } ResizeWeightingFunctionType;
00236   )
00237 
00238   STRINGIFY(
00239     typedef enum
00240     {
00241       UndefinedChannel = 0x0000,
00242       RedChannel = 0x0001,
00243       GrayChannel = 0x0001,
00244       CyanChannel = 0x0001,
00245       GreenChannel = 0x0002,
00246       MagentaChannel = 0x0002,
00247       BlueChannel = 0x0004,
00248       YellowChannel = 0x0004,
00249       BlackChannel = 0x0008,
00250       AlphaChannel = 0x0010,
00251       OpacityChannel = 0x0010,
00252       IndexChannel = 0x0020,             /* Color Index Table? */
00253       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
00254       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
00255       MetaChannel = 0x0100,              /* ???? */
00256       CompositeChannels = 0x001F,
00257       AllChannels = 0x7ffffff,
00258       /*
00259         Special purpose channel types.
00260         FUTURE: are these needed any more - they are more like hacks
00261         SyncChannels for example is NOT a real channel but a 'flag'
00262         It really says -- "User has not defined channels"
00263         Though it does have extra meaning in the "-auto-level" operator
00264       */
00265       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
00266       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
00267       GrayChannels = 0x0400,
00268       SyncChannels = 0x20000,    /* channels modified as a single unit */
00269       DefaultChannels = AllChannels
00270     } ChannelType;  /* must correspond to PixelChannel */
00271   )
00272 
00273 /*
00274   Helper functions.
00275 */
00276 
00277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
00278 
00279   STRINGIFY(
00280     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00281     {
00282       return((CLQuantum) value);
00283     }
00284   )
00285 
00286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
00287 
00288   STRINGIFY(
00289     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00290     {
00291       return((CLQuantum) (257.0f*value));
00292     }
00293   )
00294 
00295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
00296 
00297   STRINGIFY(
00298     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00299     {
00300       return((CLQuantum) (16843009.0*value));
00301     }
00302   )
00303 
00304 OPENCL_ENDIF()
00305 
00306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
00307 
00308   STRINGIFY(
00309     static inline CLQuantum ClampToQuantum(const float value)
00310       {
00311         return (CLQuantum) clamp(value, 0.0f, QuantumRange);
00312       }
00313   )
00314 
00315 OPENCL_ELSE()
00316 
00317   STRINGIFY(
00318     static inline CLQuantum ClampToQuantum(const float value)
00319       {
00320         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
00321       }
00322   )
00323 
00324 OPENCL_ENDIF()
00325 
00326   STRINGIFY(
00327     static inline int ClampToCanvas(const int offset,const int range)
00328       {
00329         return clamp(offset, (int)0, range-1);
00330       }
00331   )
00332 
00333   STRINGIFY(
00334     static inline uint ScaleQuantumToMap(CLQuantum value)
00335       {
00336         if (value >= (CLQuantum) MaxMap)
00337           return ((uint)MaxMap);
00338         else 
00339           return ((uint)value);
00340       }
00341   )
00342 
00343   STRINGIFY(
00344     static inline float PerceptibleReciprocal(const float x)
00345     {
00346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
00347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
00348     }
00349   )
00350 
00351   STRINGIFY(
00352   static inline float RoundToUnity(const float value)
00353    {
00354      return clamp(value,0.0f,1.0f);
00355    }
00356   )
00357 
00358   STRINGIFY(
00359 
00360   static inline unsigned int getPixelIndex(const unsigned int number_channels,
00361     const unsigned int columns, const unsigned int x, const unsigned int y)
00362   {
00363     return (x * number_channels) + (y * columns * number_channels);
00364   }
00365 
00366   static inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
00367   static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
00368   static inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
00369   static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
00370 
00371   static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
00372   static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
00373   static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
00374   static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
00375 
00376   static inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
00377   static inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
00378   static inline float getBlueF4(float4 p)                      { return p.x; }
00379   static inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
00380 
00381   static inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
00382   static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
00383   static inline float getGreenF4(float4 p)                     { return p.y; }
00384   static inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
00385 
00386   static inline CLQuantum getRed(CLPixelType p)                { return p.z; }
00387   static inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
00388   static inline float getRedF4(float4 p)                       { return p.z; }
00389   static inline void setRedF4(float4* p, float value)          { (*p).z = value; }
00390 
00391   static inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
00392   static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
00393   static inline float getAlphaF4(float4 p)                     { return p.w; }
00394   static inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
00395 
00396   static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
00397     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
00398   {
00399     if ((channel & RedChannel) != 0)
00400       *red=getPixelRed(p);
00401 
00402     if (number_channels > 2)
00403       {
00404         if ((channel & GreenChannel) != 0)
00405           *green=getPixelGreen(p);
00406 
00407         if ((channel & BlueChannel) != 0)
00408           *blue=getPixelBlue(p);
00409       }
00410 
00411     if (((number_channels == 4) || (number_channels == 2)) &&
00412         ((channel & AlphaChannel) != 0))
00413       *alpha=getPixelAlpha(p,number_channels);
00414   }
00415 
00416   static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
00417     const unsigned int columns, const unsigned int x, const unsigned int y)
00418   {
00419     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00420 
00421     float4 pixel;
00422 
00423     pixel.x=getPixelRed(p);
00424 
00425     if (number_channels > 2)
00426       {
00427         pixel.y=getPixelGreen(p);
00428         pixel.z=getPixelBlue(p);
00429       }
00430 
00431     if ((number_channels == 4) || (number_channels == 2))
00432       pixel.w=getPixelAlpha(p,number_channels);
00433     return(pixel);
00434   }
00435 
00436   static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
00437     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
00438   {
00439     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00440 
00441     float red = 0.0f;
00442     float green = 0.0f;
00443     float blue = 0.0f;
00444     float alpha = 0.0f;
00445 
00446     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
00447     return (float4)(red, green, blue, alpha);
00448   }
00449 
00450   static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
00451     const ChannelType channel, float red, float green, float blue, float alpha)
00452   {
00453     if ((channel & RedChannel) != 0)
00454       setPixelRed(p,ClampToQuantum(red));
00455 
00456     if (number_channels > 2)
00457       {
00458         if ((channel & GreenChannel) != 0)
00459           setPixelGreen(p,ClampToQuantum(green));
00460 
00461         if ((channel & BlueChannel) != 0)
00462           setPixelBlue(p,ClampToQuantum(blue));
00463       }
00464 
00465     if (((number_channels == 4) || (number_channels == 2)) &&
00466         ((channel & AlphaChannel) != 0))
00467       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
00468   }
00469 
00470   static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
00471     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
00472   {
00473     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00474 
00475     setPixelRed(p,ClampToQuantum(pixel.x));
00476 
00477     if (number_channels > 2)
00478       {
00479         setPixelGreen(p,ClampToQuantum(pixel.y));
00480         setPixelBlue(p,ClampToQuantum(pixel.z));
00481       }
00482 
00483     if ((number_channels == 4) || (number_channels == 2))
00484       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
00485   }
00486 
00487   static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
00488     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
00489     float4 pixel)
00490   {
00491     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
00492     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
00493   }
00494 
00495   static inline float GetPixelIntensity(const unsigned int colorspace,
00496     const unsigned int method,float red,float green,float blue)
00497   {
00498     float intensity;
00499 
00500     if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
00501       return red;
00502 
00503     switch (method)
00504     {
00505       case AveragePixelIntensityMethod:
00506         {
00507           intensity=(red+green+blue)/3.0;
00508           break;
00509         }
00510       case BrightnessPixelIntensityMethod:
00511         {
00512           intensity=MagickMax(MagickMax(red,green),blue);
00513           break;
00514         }
00515       case LightnessPixelIntensityMethod:
00516         {
00517           intensity=(MagickMin(MagickMin(red,green),blue)+
00518               MagickMax(MagickMax(red,green),blue))/2.0;
00519           break;
00520         }
00521       case MSPixelIntensityMethod:
00522         {
00523           intensity=(float) (((float) red*red+green*green+blue*blue)/
00524               (3.0*QuantumRange));
00525           break;
00526         }
00527       case Rec601LumaPixelIntensityMethod:
00528         {
00529           /*
00530           if (image->colorspace == RGBColorspace)
00531           {
00532             red=EncodePixelGamma(red);
00533             green=EncodePixelGamma(green);
00534             blue=EncodePixelGamma(blue);
00535           }
00536           */
00537           intensity=0.298839*red+0.586811*green+0.114350*blue;
00538           break;
00539         }
00540       case Rec601LuminancePixelIntensityMethod:
00541         {
00542           /*
00543           if (image->colorspace == sRGBColorspace)
00544           {
00545             red=DecodePixelGamma(red);
00546             green=DecodePixelGamma(green);
00547             blue=DecodePixelGamma(blue);
00548           }
00549           */
00550           intensity=0.298839*red+0.586811*green+0.114350*blue;
00551           break;
00552         }
00553       case Rec709LumaPixelIntensityMethod:
00554       default:
00555         {
00556           /*
00557           if (image->colorspace == RGBColorspace)
00558           {
00559             red=EncodePixelGamma(red);
00560             green=EncodePixelGamma(green);
00561             blue=EncodePixelGamma(blue);
00562           }
00563           */
00564           intensity=0.212656*red+0.715158*green+0.072186*blue;
00565           break;
00566         }
00567       case Rec709LuminancePixelIntensityMethod:
00568         {
00569           /*
00570           if (image->colorspace == sRGBColorspace)
00571           {
00572             red=DecodePixelGamma(red);
00573             green=DecodePixelGamma(green);
00574             blue=DecodePixelGamma(blue);
00575           }
00576           */
00577           intensity=0.212656*red+0.715158*green+0.072186*blue;
00578           break;
00579         }
00580       case RMSPixelIntensityMethod:
00581         {
00582           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
00583               sqrt(3.0));
00584           break;
00585         }
00586     }
00587 
00588     return intensity;
00589   }
00590 
00591   static inline int mirrorBottom(int value)
00592   {
00593       return (value < 0) ? - (value) : value;
00594   }
00595 
00596   static inline int mirrorTop(int value, int width)
00597   {
00598       return (value >= width) ? (2 * width - value - 1) : value;
00599   }
00600   )
00601 
00602 /*
00603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00604 %                                                                             %
00605 %                                                                             %
00606 %                                                                             %
00607 %     A d d N o i s e                                                         %
00608 %                                                                             %
00609 %                                                                             %
00610 %                                                                             %
00611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00612 */
00613 
00614   STRINGIFY(
00615   /*
00616   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
00617   This is provided under BSD, full license is with the main package.
00618   See http://www.doc.ic.ac.uk/~dt10/research
00619   */
00620 
00621   // Pre: a<M, b<M
00622   // Post: r=(a+b) mod M
00623   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
00624   {
00625     ulong v=a+b;
00626     //if( (v>=M) || (v<a) )
00627     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
00628       v=v-M;
00629     return v;
00630   }
00631 
00632   // Pre: a<M,b<M
00633   // Post: r=(a*b) mod M
00634   // This could be done more efficently, but it is portable, and should
00635   // be easy to understand. It can be replaced with any of the better
00636   // modular multiplication algorithms (for example if you know you have
00637   // double precision available or something).
00638   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
00639   {
00640     ulong r=0;
00641     while(a!=0){
00642       if(a&1)
00643         r=MWC_AddMod64(r,b,M);
00644       b=MWC_AddMod64(b,b,M);
00645       a=a>>1;
00646     }
00647     return r;
00648   }
00649 
00650   // Pre: a<M, e>=0
00651   // Post: r=(a^b) mod M
00652   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
00653   // most architectures
00654   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
00655   {
00656     ulong sqr=a, acc=1;
00657     while(e!=0){
00658       if(e&1)
00659         acc=MWC_MulMod64(acc,sqr,M);
00660         sqr=MWC_MulMod64(sqr,sqr,M);
00661       e=e>>1;
00662     }
00663     return acc;
00664   }
00665 
00666   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
00667   {
00668     ulong m=MWC_PowMod64(A, distance, M);
00669     ulong x=curr.x*(ulong)A+curr.y;
00670     x=MWC_MulMod64(x, m, M);
00671     return (uint2)((uint)(x/A), (uint)(x%A));
00672   }
00673 
00674   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
00675   {
00676     // This is an arbitrary constant for starting LCG jumping from. I didn't
00677     // want to start from 1, as then you end up with the two or three first values
00678     // being a bit poor in ones - once you've decided that, one constant is as
00679     // good as any another. There is no deep mathematical reason for it, I just
00680     // generated a random number.
00681     enum{ MWC_BASEID = 4077358422479273989UL };
00682 
00683     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
00684     ulong m=MWC_PowMod64(A, dist, M);
00685 
00686     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
00687     return (uint2)((uint)(x/A), (uint)(x%A));
00688   }
00689 
00691   typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
00692 
00693   void MWC64X_Step(mwc64x_state_t *s)
00694   {
00695     uint X=s->x, C=s->c;
00696 
00697     uint Xn=s->seed0*X+C;
00698     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
00699     uint Cn=mad_hi(s->seed0,X,carry);
00700 
00701     s->x=Xn;
00702     s->c=Cn;
00703   }
00704 
00705   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
00706   {
00707     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
00708     s->x=tmp.x;
00709     s->c=tmp.y;
00710   }
00711 
00712   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
00713   {
00714     uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
00715     s->x=tmp.x;
00716     s->c=tmp.y;
00717   }
00718 
00720   uint MWC64X_NextUint(mwc64x_state_t *s)
00721   {
00722     uint res=s->x ^ s->c;
00723     MWC64X_Step(s);
00724     return res;
00725   }
00726 
00727   //
00728   // End of MWC64X excerpt
00729   //
00730 
00731   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
00732   {
00733     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
00734   }
00735 
00736   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
00737   {
00738     float
00739       alpha,
00740       beta,
00741       noise,
00742       sigma;
00743 
00744     noise = 0.0f;
00745     alpha=mwcReadPseudoRandomValue(r);
00746     switch(noise_type)
00747     {
00748       case UniformNoise:
00749       default:
00750         {
00751           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
00752           break;
00753         }
00754       case GaussianNoise:
00755         {
00756           float
00757             gamma,
00758             tau;
00759 
00760           if (alpha == 0.0f)
00761             alpha=1.0f;
00762           beta=mwcReadPseudoRandomValue(r);
00763           gamma=sqrt(-2.0f*log(alpha));
00764           sigma=gamma*cospi((2.0f*beta));
00765           tau=gamma*sinpi((2.0f*beta));
00766           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
00767           break;
00768         }
00769       case ImpulseNoise:
00770       {
00771         if (alpha < (SigmaImpulse/2.0f))
00772           noise=0.0f;
00773         else
00774           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
00775             noise=QuantumRange;
00776           else
00777             noise=pixel;
00778         break;
00779       }
00780       case LaplacianNoise:
00781       {
00782         if (alpha <= 0.5f)
00783           {
00784             if (alpha <= MagickEpsilon)
00785               noise=(pixel-QuantumRange);
00786             else
00787               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
00788                 0.5f);
00789             break;
00790           }
00791         beta=1.0f-alpha;
00792         if (beta <= (0.5f*MagickEpsilon))
00793           noise=(pixel+QuantumRange);
00794         else
00795           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
00796         break;
00797       }
00798       case MultiplicativeGaussianNoise:
00799       {
00800         sigma=1.0f;
00801         if (alpha > MagickEpsilon)
00802           sigma=sqrt(-2.0f*log(alpha));
00803         beta=mwcReadPseudoRandomValue(r);
00804         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
00805           cospi((2.0f*beta))/2.0f);
00806         break;
00807       }
00808       case PoissonNoise:
00809       {
00810         float 
00811           poisson;
00812         unsigned int i;
00813         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
00814         for (i=0; alpha > poisson; i++)
00815         {
00816           beta=mwcReadPseudoRandomValue(r);
00817           alpha*=beta;
00818         }
00819         noise=(QuantumRange*i/SigmaPoisson);
00820         break;
00821       }
00822       case RandomNoise:
00823       {
00824         noise=(QuantumRange*SigmaRandom*alpha);
00825         break;
00826       }
00827     }
00828     return noise;
00829   }
00830 
00831   __kernel
00832   void AddNoise(const __global CLQuantum *image,
00833     const unsigned int number_channels,const ChannelType channel,
00834     const unsigned int length,const unsigned int pixelsPerWorkItem,
00835     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
00836     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
00837     __global CLQuantum *filteredImage)
00838   {
00839     mwc64x_state_t rng;
00840     rng.seed0 = seed0;
00841     rng.seed1 = seed1;
00842 
00843     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
00844     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
00845     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
00846 
00847     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
00848     uint count = pixelsPerWorkItem;
00849 
00850     while (count > 0 && pos < length)
00851     {
00852       const __global CLQuantum *p = image + pos;
00853       __global CLQuantum *q = filteredImage + pos;
00854 
00855       float red;
00856       float green;
00857       float blue;
00858       float alpha;
00859 
00860       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
00861 
00862       if ((channel & RedChannel) != 0)
00863         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
00864 
00865       if (number_channels > 2)
00866       {
00867         if ((channel & GreenChannel) != 0)
00868           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
00869 
00870         if ((channel & BlueChannel) != 0)
00871           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
00872       }
00873 
00874       if (((number_channels == 4) || (number_channels == 2)) &&
00875           ((channel & AlphaChannel) != 0))
00876         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
00877 
00878       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
00879 
00880       pos += (get_local_size(0) * number_channels);
00881       count--;
00882     }
00883   }
00884   )
00885 
00886 /*
00887 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00888 %                                                                             %
00889 %                                                                             %
00890 %                                                                             %
00891 %    B l u r                                                                  %
00892 %                                                                             %
00893 %                                                                             %
00894 %                                                                             %
00895 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00896 */
00897 
00898   STRINGIFY(
00899   /*
00900   Reduce image noise and reduce detail levels by row
00901   */
00902   __kernel void BlurRow(const __global CLQuantum *image,
00903     const unsigned int number_channels,const ChannelType channel,
00904     __constant float *filter,const unsigned int width,
00905     const unsigned int imageColumns,const unsigned int imageRows,
00906     __local float4 *temp,__global float4 *tempImage)
00907   {
00908     const int x = get_global_id(0);
00909     const int y = get_global_id(1);
00910 
00911     const int columns = imageColumns;
00912 
00913     const unsigned int radius = (width-1)/2;
00914     const int wsize = get_local_size(0);
00915     const unsigned int loadSize = wsize+width;
00916 
00917     //group coordinate
00918     const int groupX=get_local_size(0)*get_group_id(0);
00919 
00920     //parallel load and clamp
00921     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
00922     {
00923       int cx = ClampToCanvas(i + groupX - radius, columns);
00924       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
00925     }
00926 
00927     // barrier
00928     barrier(CLK_LOCAL_MEM_FENCE);
00929 
00930     // only do the work if this is not a patched item
00931     if (get_global_id(0) < columns)
00932     {
00933       // compute
00934       float4 result = (float4) 0;
00935 
00936       int i = 0;
00937 
00938       for ( ; i+7 < width; )
00939       {
00940         for (int j=0; j < 8; j++)
00941           result+=filter[i+j]*temp[i+j+get_local_id(0)];
00942         i+=8;
00943       }
00944 
00945       for ( ; i < width; i++)
00946         result+=filter[i]*temp[i+get_local_id(0)];
00947 
00948       // write back to global
00949       tempImage[y*columns+x] = result;
00950     }
00951   }
00952   )
00953 
00954   STRINGIFY(
00955   /*
00956   Reduce image noise and reduce detail levels by line
00957   */
00958   __kernel void BlurColumn(const __global float4 *blurRowData,
00959     const unsigned int number_channels,const ChannelType channel,
00960     __constant float *filter,const unsigned int width,
00961     const unsigned int imageColumns,const unsigned int imageRows,
00962     __local float4 *temp,__global CLQuantum *filteredImage)
00963   {
00964     const int x = get_global_id(0);
00965     const int y = get_global_id(1);
00966 
00967     const int columns = imageColumns;
00968     const int rows = imageRows;
00969 
00970     unsigned int radius = (width-1)/2;
00971     const int wsize = get_local_size(1);
00972     const unsigned int loadSize = wsize+width;
00973 
00974     //group coordinate
00975     const int groupX=get_local_size(0)*get_group_id(0);
00976     const int groupY=get_local_size(1)*get_group_id(1);
00977     //notice that get_local_size(0) is 1, so
00978     //groupX=get_group_id(0);
00979 
00980     //parallel load and clamp
00981     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
00982       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
00983 
00984     // barrier
00985     barrier(CLK_LOCAL_MEM_FENCE);
00986 
00987     // only do the work if this is not a patched item
00988     if (get_global_id(1) < rows)
00989     {
00990       // compute
00991       float4 result = (float4) 0;
00992 
00993       int i = 0;
00994 
00995       for ( ; i+7 < width; )
00996       {
00997         for (int j=0; j < 8; j++)
00998           result+=filter[i+j]*temp[i+j+get_local_id(1)];
00999         i+=8;
01000       }
01001 
01002       for ( ; i < width; i++)
01003         result+=filter[i]*temp[i+get_local_id(1)];
01004 
01005       // write back to global
01006       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
01007     }
01008   }
01009   )
01010 
01011 /*
01012 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01013 %                                                                             %
01014 %                                                                             %
01015 %                                                                             %
01016 %    C o n t r a s t                                                          %
01017 %                                                                             %
01018 %                                                                             %
01019 %                                                                             %
01020 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01021 */
01022 
01023   STRINGIFY(
01024 
01025   static inline float4 ConvertRGBToHSB(const float4 pixel)
01026   {
01027     float4 result=0.0f;
01028     result.w=pixel.w;
01029     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
01030     if (tmax != 0.0f)
01031     {
01032       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
01033       float delta=tmax-tmin;
01034 
01035       result.y=delta/tmax;
01036       result.z=QuantumScale*tmax;
01037       if (delta != 0.0f)
01038       {
01039         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
01040         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
01041           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
01042         result.x/=6.0f;
01043         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
01044       }
01045     }
01046     return(result);
01047   }
01048 
01049   static inline float4 ConvertHSBToRGB(const float4 pixel)
01050   {
01051     float hue=pixel.x;
01052     float saturation=pixel.y;
01053     float brightness=pixel.z;
01054 
01055     float4 result=pixel;
01056 
01057     if (saturation == 0.0f)
01058     {
01059       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
01060     }
01061     else
01062     {
01063       float h=6.0f*(hue-floor(hue));
01064       float f=h-floor(h);
01065       float p=brightness*(1.0f-saturation);
01066       float q=brightness*(1.0f-saturation*f);
01067       float t=brightness*(1.0f-(saturation*(1.0f-f)));
01068       int ih = (int)h;
01069 
01070       if (ih == 1)
01071       {
01072         result.x=ClampToQuantum(QuantumRange*q);
01073         result.y=ClampToQuantum(QuantumRange*brightness);
01074         result.z=ClampToQuantum(QuantumRange*p);
01075       }
01076       else if (ih == 2)
01077       {
01078         result.x=ClampToQuantum(QuantumRange*p);
01079         result.y=ClampToQuantum(QuantumRange*brightness);
01080         result.z=ClampToQuantum(QuantumRange*t);
01081       }
01082       else if (ih == 3)
01083       {
01084         result.x=ClampToQuantum(QuantumRange*p);
01085         result.y=ClampToQuantum(QuantumRange*q);
01086         result.z=ClampToQuantum(QuantumRange*brightness);
01087       }
01088       else if (ih == 4)
01089       {
01090         result.x=ClampToQuantum(QuantumRange*t);
01091         result.y=ClampToQuantum(QuantumRange*p);
01092         result.z=ClampToQuantum(QuantumRange*brightness);
01093       }
01094       else if (ih == 5)
01095       {
01096         result.x=ClampToQuantum(QuantumRange*brightness);
01097         result.y=ClampToQuantum(QuantumRange*p);
01098         result.z=ClampToQuantum(QuantumRange*q);
01099       }
01100       else
01101       {
01102         result.x=ClampToQuantum(QuantumRange*brightness);
01103         result.y=ClampToQuantum(QuantumRange*t);
01104         result.z=ClampToQuantum(QuantumRange*p);
01105       }
01106     }
01107     return(result);
01108   }
01109 
01110   __kernel void Contrast(__global CLQuantum *image,
01111     const unsigned int number_channels,const int sign)
01112   {
01113     const int x=get_global_id(0);
01114     const int y=get_global_id(1);
01115     const unsigned int columns=get_global_size(0);
01116 
01117     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
01118     if (number_channels < 3)
01119       pixel.y=pixel.z=pixel.x;
01120 
01121     pixel=ConvertRGBToHSB(pixel);
01122     float brightness=pixel.z;
01123     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
01124     brightness=clamp(brightness,0.0f,1.0f);
01125     pixel.z=brightness;
01126     pixel=ConvertHSBToRGB(pixel);
01127 
01128     WriteAllChannels(image,number_channels,columns,x,y,pixel);
01129   }
01130   )
01131 
01132 /*
01133 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01134 %                                                                             %
01135 %                                                                             %
01136 %                                                                             %
01137 %    C o n t r a s t S t r e t c h                                            %
01138 %                                                                             %
01139 %                                                                             %
01140 %                                                                             %
01141 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01142 */
01143 
01144     STRINGIFY(
01145     /*
01146     */
01147     __kernel void Histogram(__global CLPixelType * restrict im,
01148       const ChannelType channel, 
01149       const unsigned int colorspace,
01150       const unsigned int method,
01151       __global uint4 * restrict histogram)
01152       {
01153         const int x = get_global_id(0);  
01154         const int y = get_global_id(1);  
01155         const int columns = get_global_size(0);  
01156         const int c = x + y * columns;
01157         if ((channel & SyncChannels) != 0)
01158         {
01159           float red=(float)getRed(im[c]);
01160           float green=(float)getGreen(im[c]);
01161           float blue=(float)getBlue(im[c]);
01162 
01163           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
01164           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
01165           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
01166         }
01167         else
01168         {
01169           // for equalizing, we always need all channels?
01170           // otherwise something more
01171         }
01172       }
01173     )
01174 
01175     STRINGIFY(
01176     /*
01177     */
01178     __kernel void ContrastStretch(__global CLPixelType * restrict im,
01179       const ChannelType channel,  
01180       __global CLPixelType * restrict stretch_map,
01181       const float4 white, const float4 black)
01182       {
01183         const int x = get_global_id(0);  
01184         const int y = get_global_id(1);  
01185         const int columns = get_global_size(0);  
01186         const int c = x + y * columns;
01187 
01188         uint ePos;
01189         CLPixelType oValue, eValue;
01190         CLQuantum red, green, blue, alpha;
01191 
01192         //read from global
01193         oValue=im[c];
01194 
01195         if ((channel & RedChannel) != 0)
01196         {
01197           if (getRedF4(white) != getRedF4(black))
01198           {
01199             ePos = ScaleQuantumToMap(getRed(oValue)); 
01200             eValue = stretch_map[ePos];
01201             red = getRed(eValue);
01202           }
01203         }
01204 
01205         if ((channel & GreenChannel) != 0)
01206         {
01207           if (getGreenF4(white) != getGreenF4(black))
01208           {
01209             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01210             eValue = stretch_map[ePos];
01211             green = getGreen(eValue);
01212           }
01213         }
01214 
01215         if ((channel & BlueChannel) != 0)
01216         {
01217           if (getBlueF4(white) != getBlueF4(black))
01218           {
01219             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01220             eValue = stretch_map[ePos];
01221             blue = getBlue(eValue);
01222           }
01223         }
01224 
01225         if ((channel & AlphaChannel) != 0)
01226         {
01227           if (getAlphaF4(white) != getAlphaF4(black))
01228           {
01229             ePos = ScaleQuantumToMap(getAlpha(oValue)); 
01230             eValue = stretch_map[ePos];
01231             alpha = getAlpha(eValue);
01232           }
01233         }
01234 
01235         //write back
01236         im[c]=(CLPixelType)(blue, green, red, alpha);
01237 
01238       }
01239     )
01240 
01241 /*
01242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01243 %                                                                             %
01244 %                                                                             %
01245 %                                                                             %
01246 %    C o n v o l v e                                                          %
01247 %                                                                             %
01248 %                                                                             %
01249 %                                                                             %
01250 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01251 */
01252 
01253   STRINGIFY(
01254     __kernel 
01255     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
01256     const unsigned int imageWidth, const unsigned int imageHeight,
01257     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01258     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
01259 
01260       int2 blockID;
01261       blockID.x = get_global_id(0) / get_local_size(0);
01262       blockID.y = get_global_id(1) / get_local_size(1);
01263 
01264       // image area processed by this workgroup
01265       int2 imageAreaOrg;
01266       imageAreaOrg.x = blockID.x * get_local_size(0);
01267       imageAreaOrg.y = blockID.y * get_local_size(1);
01268 
01269       int2 midFilterDimen;
01270       midFilterDimen.x = (filterWidth-1)/2;
01271       midFilterDimen.y = (filterHeight-1)/2;
01272 
01273       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
01274 
01275       // dimension of the local cache
01276       int2 cachedAreaDimen;
01277       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
01278       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
01279 
01280       // cache the pixels accessed by this workgroup in local memory
01281       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
01282       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
01283       int groupSize = get_local_size(0) * get_local_size(1);
01284       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
01285 
01286         int2 cachedAreaIndex;
01287         cachedAreaIndex.x = i % cachedAreaDimen.x;
01288         cachedAreaIndex.y = i / cachedAreaDimen.x;
01289 
01290         int2 imagePixelIndex;
01291         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
01292 
01293         // only support EdgeVirtualPixelMethod through ClampToCanvas
01294         // TODO: implement other virtual pixel method
01295         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
01296         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
01297 
01298         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
01299       }
01300 
01301       // cache the filter
01302       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
01303         filterCache[i] = filter[i];
01304       }
01305       barrier(CLK_LOCAL_MEM_FENCE);
01306 
01307 
01308       int2 imageIndex;
01309       imageIndex.x = imageAreaOrg.x + get_local_id(0);
01310       imageIndex.y = imageAreaOrg.y + get_local_id(1);
01311 
01312       // if out-of-range, stops here and quit
01313       if (imageIndex.x >= imageWidth
01314         || imageIndex.y >= imageHeight) {
01315           return;
01316       }
01317 
01318       int filterIndex = 0;
01319       float4 sum = (float4)0.0f;
01320       float gamma = 0.0f;
01321       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
01322         int cacheIndexY = get_local_id(1);
01323         for (int j = 0; j < filterHeight; j++) {
01324           int cacheIndexX = get_local_id(0);
01325           for (int i = 0; i < filterWidth; i++) {
01326             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01327             float f = filterCache[filterIndex];
01328 
01329             sum.x += f * p.x;
01330             sum.y += f * p.y;
01331             sum.z += f * p.z; 
01332             sum.w += f * p.w;
01333 
01334             gamma += f;
01335             filterIndex++;
01336             cacheIndexX++;
01337           }
01338           cacheIndexY++;
01339         }
01340       }
01341       else {
01342         int cacheIndexY = get_local_id(1);
01343         for (int j = 0; j < filterHeight; j++) {
01344           int cacheIndexX = get_local_id(0);
01345           for (int i = 0; i < filterWidth; i++) {
01346 
01347             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01348             float alpha = QuantumScale*p.w;
01349             float f = filterCache[filterIndex];
01350             float g = alpha * f;
01351 
01352             sum.x += g*p.x;
01353             sum.y += g*p.y;
01354             sum.z += g*p.z;
01355             sum.w += f*p.w;
01356 
01357             gamma += g;
01358             filterIndex++;
01359             cacheIndexX++;
01360           }
01361           cacheIndexY++;
01362         }
01363         gamma = PerceptibleReciprocal(gamma);
01364         sum.xyz = gamma*sum.xyz;
01365       }
01366       CLPixelType outputPixel;
01367       outputPixel.x = ClampToQuantum(sum.x);
01368       outputPixel.y = ClampToQuantum(sum.y);
01369       outputPixel.z = ClampToQuantum(sum.z);
01370       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01371 
01372       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01373     }
01374   )
01375 
01376   STRINGIFY(
01377     __kernel 
01378     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
01379                   const uint imageWidth, const uint imageHeight,
01380                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01381                   const uint matte, const ChannelType channel) {
01382 
01383       int2 imageIndex;
01384       imageIndex.x = get_global_id(0);
01385       imageIndex.y = get_global_id(1);
01386 
01387       /*
01388       unsigned int imageWidth = get_global_size(0);
01389       unsigned int imageHeight = get_global_size(1);
01390       */
01391       if (imageIndex.x >= imageWidth
01392           || imageIndex.y >= imageHeight)
01393           return;
01394 
01395       int2 midFilterDimen;
01396       midFilterDimen.x = (filterWidth-1)/2;
01397       midFilterDimen.y = (filterHeight-1)/2;
01398 
01399       int filterIndex = 0;
01400       float4 sum = (float4)0.0f;
01401       float gamma = 0.0f;
01402       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
01403         for (int j = 0; j < filterHeight; j++) {
01404           int2 inputPixelIndex;
01405           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01406           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01407           for (int i = 0; i < filterWidth; i++) {
01408             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01409             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01410         
01411             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01412             float f = filter[filterIndex];
01413 
01414             sum.x += f * p.x;
01415             sum.y += f * p.y;
01416             sum.z += f * p.z; 
01417             sum.w += f * p.w;
01418 
01419             gamma += f;
01420 
01421             filterIndex++;
01422           }
01423         }
01424       }
01425       else {
01426 
01427         for (int j = 0; j < filterHeight; j++) {
01428           int2 inputPixelIndex;
01429           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01430           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01431           for (int i = 0; i < filterWidth; i++) {
01432             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01433             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01434         
01435             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01436             float alpha = QuantumScale*p.w;
01437             float f = filter[filterIndex];
01438             float g = alpha * f;
01439 
01440             sum.x += g*p.x;
01441             sum.y += g*p.y;
01442             sum.z += g*p.z;
01443             sum.w += f*p.w;
01444 
01445             gamma += g;
01446 
01447 
01448             filterIndex++;
01449           }
01450         }
01451         gamma = PerceptibleReciprocal(gamma);
01452         sum.xyz = gamma*sum.xyz;
01453       }
01454 
01455       CLPixelType outputPixel;
01456       outputPixel.x = ClampToQuantum(sum.x);
01457       outputPixel.y = ClampToQuantum(sum.y);
01458       outputPixel.z = ClampToQuantum(sum.z);
01459       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01460 
01461       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01462     }
01463   )
01464 
01465 /*
01466 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01467 %                                                                             %
01468 %                                                                             %
01469 %                                                                             %
01470 %     D e s p e c k l e                                                       %
01471 %                                                                             %
01472 %                                                                             %
01473 %                                                                             %
01474 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01475 */
01476 
01477   STRINGIFY(
01478 
01479   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01480   , const unsigned int imageWidth, const unsigned int imageHeight
01481   , const int2 offset, const int polarity, const int matte) {
01482 
01483     int x = get_global_id(0);
01484     int y = get_global_id(1);
01485 
01486     CLPixelType v = inputImage[y*imageWidth+x];
01487 
01488     int2 neighbor;
01489     neighbor.y = y + offset.y;
01490     neighbor.x = x + offset.x;
01491 
01492     int2 clampedNeighbor;
01493     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01494     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01495 
01496     CLPixelType r = (clampedNeighbor.x == neighbor.x
01497                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01498     :(CLPixelType)0;
01499 
01500     int sv[4];
01501     sv[0] = (int)v.x;
01502     sv[1] = (int)v.y;
01503     sv[2] = (int)v.z;
01504     sv[3] = (int)v.w;
01505 
01506     int sr[4];
01507     sr[0] = (int)r.x;
01508     sr[1] = (int)r.y;
01509     sr[2] = (int)r.z;
01510     sr[3] = (int)r.w;
01511 
01512     if (polarity > 0) {
01513       \n #pragma unroll 4\n
01514       for (unsigned int i = 0; i < 4; i++) {
01515         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
01516       }
01517     }
01518     else {
01519       \n #pragma unroll 4\n
01520       for (unsigned int i = 0; i < 4; i++) {
01521         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
01522       }
01523 
01524     }
01525 
01526     v.x = (CLQuantum)sv[0];
01527     v.y = (CLQuantum)sv[1];
01528     v.z = (CLQuantum)sv[2];
01529 
01530     if (matte!=0)
01531       v.w = (CLQuantum)sv[3];
01532 
01533     outputImage[y*imageWidth+x] = v;
01534 
01535     }
01536 
01537 
01538   )
01539 
01540   STRINGIFY(
01541 
01542   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01543   , const unsigned int imageWidth, const unsigned int imageHeight
01544   , const int2 offset, const int polarity, const int matte) {
01545 
01546     int x = get_global_id(0);
01547     int y = get_global_id(1);
01548 
01549     CLPixelType v = inputImage[y*imageWidth+x];
01550 
01551     int2 neighbor, clampedNeighbor;
01552 
01553     neighbor.y = y + offset.y;
01554     neighbor.x = x + offset.x;
01555     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01556     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01557 
01558     CLPixelType r = (clampedNeighbor.x == neighbor.x
01559       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01560     :(CLPixelType)0;
01561 
01562 
01563     neighbor.y = y - offset.y;
01564     neighbor.x = x - offset.x;
01565     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01566     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01567 
01568     CLPixelType s = (clampedNeighbor.x == neighbor.x
01569       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01570     :(CLPixelType)0;
01571 
01572 
01573     int sv[4];
01574     sv[0] = (int)v.x;
01575     sv[1] = (int)v.y;
01576     sv[2] = (int)v.z;
01577     sv[3] = (int)v.w;
01578 
01579     int sr[4];
01580     sr[0] = (int)r.x;
01581     sr[1] = (int)r.y;
01582     sr[2] = (int)r.z;
01583     sr[3] = (int)r.w;
01584 
01585     int ss[4];
01586     ss[0] = (int)s.x;
01587     ss[1] = (int)s.y;
01588     ss[2] = (int)s.z;
01589     ss[3] = (int)s.w;
01590 
01591     if (polarity > 0) {
01592       \n #pragma unroll 4\n
01593       for (unsigned int i = 0; i < 4; i++) {
01594         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
01595         //
01596         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01597         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01598         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01599       }
01600     }
01601     else {
01602       \n #pragma unroll 4\n
01603       for (unsigned int i = 0; i < 4; i++) {
01604         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
01605         //
01606         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01607         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01608       }
01609     }
01610 
01611     v.x = (CLQuantum)sv[0];
01612     v.y = (CLQuantum)sv[1];
01613     v.z = (CLQuantum)sv[2];
01614 
01615     if (matte!=0)
01616       v.w = (CLQuantum)sv[3];
01617 
01618     outputImage[y*imageWidth+x] = v;
01619 
01620     }
01621   )
01622 
01623 /*
01624 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01625 %                                                                             %
01626 %                                                                             %
01627 %                                                                             %
01628 %     E q u a l i z e                                                         %
01629 %                                                                             %
01630 %                                                                             %
01631 %                                                                             %
01632 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01633 */
01634 
01635     STRINGIFY(
01636     /*
01637     */
01638     __kernel void Equalize(__global CLPixelType * restrict im,
01639       const ChannelType channel,  
01640       __global CLPixelType * restrict equalize_map,
01641       const float4 white, const float4 black)
01642       {
01643         const int x = get_global_id(0);  
01644         const int y = get_global_id(1);  
01645         const int columns = get_global_size(0);  
01646         const int c = x + y * columns;
01647 
01648         uint ePos;
01649         CLPixelType oValue, eValue;
01650         CLQuantum red, green, blue, alpha;
01651 
01652         //read from global
01653         oValue=im[c];
01654 
01655         if ((channel & SyncChannels) != 0)
01656         {
01657           if (getRedF4(white) != getRedF4(black))
01658           {
01659             ePos = ScaleQuantumToMap(getRed(oValue)); 
01660             eValue = equalize_map[ePos];
01661             red = getRed(eValue);
01662             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01663             eValue = equalize_map[ePos];
01664             green = getRed(eValue);
01665             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01666             eValue = equalize_map[ePos];
01667             blue = getRed(eValue);
01668             ePos = ScaleQuantumToMap(getAlpha(oValue)); 
01669             eValue = equalize_map[ePos];
01670             alpha = getRed(eValue);
01671  
01672             //write back
01673             im[c]=(CLPixelType)(blue, green, red, alpha);
01674           }
01675 
01676         }
01677 
01678         // for equalizing, we always need all channels?
01679         // otherwise something more
01680 
01681      }
01682     )
01683 
01684 /*
01685 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01686 %                                                                             %
01687 %                                                                             %
01688 %                                                                             %
01689 %     F u n c t i o n                                                         %
01690 %                                                                             %
01691 %                                                                             %
01692 %                                                                             %
01693 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01694 */
01695 
01696   STRINGIFY(
01697   /*
01698   apply FunctionImageChannel(braightness-contrast)
01699   */
01700   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
01701     const unsigned int number_parameters,__constant float *parameters)
01702   {
01703     float result = 0.0f;
01704 
01705     switch (function)
01706     {
01707     case PolynomialFunction:
01708       {
01709         for (unsigned int i=0; i < number_parameters; i++)
01710           result = result*QuantumScale*pixel + parameters[i];
01711         result *= QuantumRange;
01712         break;
01713       }
01714     case SinusoidFunction:
01715       {
01716         float  freq,phase,ampl,bias;
01717         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01718         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
01719         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
01720         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01721         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
01722           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
01723         break;
01724       }
01725     case ArcsinFunction:
01726       {
01727         float  width,range,center,bias;
01728         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01729         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01730         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01731         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01732 
01733         result = 2.0f/width*(QuantumScale*pixel - center);
01734         result = range/MagickPI*asin(result)+bias;
01735         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
01736         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
01737         result *= QuantumRange;
01738         break;
01739       }
01740     case ArctanFunction:
01741       {
01742         float slope,range,center,bias;
01743         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01744         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01745         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01746         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01747         result = MagickPI*slope*(QuantumScale*pixel-center);
01748         result = QuantumRange*(range/MagickPI*atan(result) + bias);
01749         break;
01750       }
01751     case UndefinedFunction:
01752       break;
01753     }
01754     return(ClampToQuantum(result));
01755   }
01756   )
01757 
01758   STRINGIFY(
01759   /*
01760   Improve brightness / contrast of the image
01761   channel : define which channel is improved
01762   function : the function called to enchance the brightness contrast
01763   number_parameters : numbers of parameters 
01764   parameters : the parameter
01765   */
01766   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
01767     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
01768     __constant float *parameters)
01769   {
01770     const unsigned int x = get_global_id(0);
01771     const unsigned int y = get_global_id(1);
01772     const unsigned int columns = get_global_size(0);
01773     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
01774 
01775     float red;
01776     float green;
01777     float blue;
01778     float alpha;
01779 
01780     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
01781 
01782     if ((channel & RedChannel) != 0)
01783       red=ApplyFunction(red, function, number_parameters, parameters);
01784 
01785     if (number_channels > 2)
01786       {
01787         if ((channel & GreenChannel) != 0)
01788           green=ApplyFunction(green, function, number_parameters, parameters);
01789 
01790         if ((channel & BlueChannel) != 0)
01791           blue=ApplyFunction(blue, function, number_parameters, parameters);
01792       }
01793 
01794     if (((number_channels == 4) || (number_channels == 2)) &&
01795         ((channel & AlphaChannel) != 0))
01796       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
01797 
01798     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
01799   }
01800   )
01801 
01802 /*
01803 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01804 %                                                                             %
01805 %                                                                             %
01806 %                                                                             %
01807 %     G r a y s c a l e                                                       %
01808 %                                                                             %
01809 %                                                                             %
01810 %                                                                             %
01811 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01812 */
01813 
01814   STRINGIFY(
01815   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
01816     const unsigned int colorspace,const unsigned int method)
01817   {
01818     const unsigned int x = get_global_id(0);
01819     const unsigned int y = get_global_id(1);
01820     const unsigned int columns = get_global_size(0);
01821     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
01822 
01823     float
01824       blue,
01825       green,
01826       red;
01827 
01828     red=getPixelRed(p);
01829     green=getPixelGreen(p);
01830     blue=getPixelBlue(p);
01831 
01832     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
01833 
01834     setPixelRed(p,intensity);
01835     setPixelGreen(p,intensity);
01836     setPixelBlue(p,intensity);
01837   }
01838   )
01839 
01840 /*
01841 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01842 %                                                                             %
01843 %                                                                             %
01844 %                                                                             %
01845 %     L o c a l C o n t r a s t                                               %
01846 %                                                                             %
01847 %                                                                             %
01848 %                                                                             %
01849 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01850 */
01851 
01852     STRINGIFY(
01853 
01854       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
01855           const int radius, 
01856           const int imageWidth,
01857           const int imageHeight)
01858       {
01859         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
01860 
01861         int x = get_local_id(0);
01862         int y = get_global_id(1);
01863 
01864         if ((x >= imageWidth) || (y >= imageHeight))
01865           return;
01866 
01867         global CLPixelType *src = srcImage + y * imageWidth;
01868 
01869         for (int i = x; i < imageWidth; i += get_local_size(0)) {
01870             float sum = 0.0f;
01871             float weight = 1.0f;
01872 
01873             int j = i - radius;
01874             while ((j + 7) < i) {
01875                 for (int k = 0; k < 8; ++k) // Unroll 8x
01876                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
01877                 weight += 8.0f;
01878                 j+=8;
01879             }
01880             while (j < i) {
01881                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
01882                 weight += 1.0f;
01883                 ++j;
01884             }
01885 
01886             while ((j + 7) < radius + i) {
01887                 for (int k = 0; k < 8; ++k) // Unroll 8x
01888                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
01889                 weight -= 8.0f;
01890                 j+=8;
01891             }
01892             while (j < radius + i) {
01893                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
01894                 weight -= 1.0f;
01895                 ++j;
01896             }
01897 
01898             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
01899         }
01900       }
01901     )
01902 
01903     STRINGIFY(
01904       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
01905           const int radius, 
01906           const float strength,
01907           const int imageWidth,
01908           const int imageHeight)
01909       {
01910         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
01911 
01912         int x = get_global_id(0);
01913         int y = get_global_id(1);
01914 
01915         if ((x >= imageWidth) || (y >= imageHeight))
01916                 return;
01917 
01918         global float *src = blurImage + x;
01919 
01920         float sum = 0.0f;
01921         float weight = 1.0f;
01922 
01923         int j = y - radius;
01924         while ((j + 7) < y) {
01925             for (int k = 0; k < 8; ++k) // Unroll 8x
01926                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
01927             weight += 8.0f;
01928             j+=8;
01929         }
01930         while (j < y) {
01931             sum += weight * src[mirrorBottom(j) * imageWidth];
01932             weight += 1.0f;
01933             ++j;
01934         }
01935 
01936         while ((j + 7) < radius + y) {
01937             for (int k = 0; k < 8; ++k) // Unroll 8x
01938                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
01939             weight -= 8.0f;
01940             j+=8;
01941         }
01942         while (j < radius + y) {
01943             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
01944             weight -= 1.0f;
01945             ++j;
01946         }
01947 
01948         CLPixelType pixel = srcImage[x + y * imageWidth];
01949         float srcVal = dot(RGB, convert_float4(pixel));
01950         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
01951         mult = (srcVal + mult) / srcVal;
01952 
01953         pixel.x = ClampToQuantum(pixel.x * mult);
01954         pixel.y = ClampToQuantum(pixel.y * mult);
01955         pixel.z = ClampToQuantum(pixel.z * mult);
01956 
01957         dstImage[x + y * imageWidth] = pixel;
01958       }
01959     )
01960 
01961 /*
01962 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01963 %                                                                             %
01964 %                                                                             %
01965 %                                                                             %
01966 %     M o d u l a t e                                                         %
01967 %                                                                             %
01968 %                                                                             %
01969 %                                                                             %
01970 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01971 */
01972 
01973   STRINGIFY(
01974 
01975   static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
01976     float *hue, float *saturation, float *lightness)
01977   {
01978   float
01979     c,
01980     tmax,
01981     tmin;
01982 
01983   /*
01984      Convert RGB to HSL colorspace.
01985      */
01986   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
01987   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
01988 
01989   c=tmax-tmin;
01990 
01991   *lightness=(tmax+tmin)/2.0;
01992   if (c <= 0.0)
01993   {
01994     *hue=0.0;
01995     *saturation=0.0;
01996     return;
01997   }
01998 
01999   if (tmax == (QuantumScale*red))
02000   {
02001     *hue=(QuantumScale*green-QuantumScale*blue)/c;
02002     if ((QuantumScale*green) < (QuantumScale*blue))
02003       *hue+=6.0;
02004   }
02005   else
02006     if (tmax == (QuantumScale*green))
02007       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
02008     else
02009       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
02010 
02011   *hue*=60.0/360.0;
02012   if (*lightness <= 0.5)
02013     *saturation=c/(2.0*(*lightness));
02014   else
02015     *saturation=c/(2.0-2.0*(*lightness));
02016   }
02017 
02018   static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
02019       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02020   {
02021     float
02022       b,
02023       c,
02024       g,
02025       h,
02026       tmin,
02027       r,
02028       x;
02029 
02030     /*
02031        Convert HSL to RGB colorspace.
02032        */
02033     h=hue*360.0;
02034     if (lightness <= 0.5)
02035       c=2.0*lightness*saturation;
02036     else
02037       c=(2.0-2.0*lightness)*saturation;
02038     tmin=lightness-0.5*c;
02039     h-=360.0*floor(h/360.0);
02040     h/=60.0;
02041     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
02042     switch ((int) floor(h) % 6)
02043     {
02044       case 0:
02045         {
02046           r=tmin+c;
02047           g=tmin+x;
02048           b=tmin;
02049           break;
02050         }
02051       case 1:
02052         {
02053           r=tmin+x;
02054           g=tmin+c;
02055           b=tmin;
02056           break;
02057         }
02058       case 2:
02059         {
02060           r=tmin;
02061           g=tmin+c;
02062           b=tmin+x;
02063           break;
02064         }
02065       case 3:
02066         {
02067           r=tmin;
02068           g=tmin+x;
02069           b=tmin+c;
02070           break;
02071         }
02072       case 4:
02073         {
02074           r=tmin+x;
02075           g=tmin;
02076           b=tmin+c;
02077           break;
02078         }
02079       case 5:
02080         {
02081           r=tmin+c;
02082           g=tmin;
02083           b=tmin+x;
02084           break;
02085         }
02086       default:
02087         {
02088           r=0.0;
02089           g=0.0;
02090           b=0.0;
02091         }
02092     }
02093     *red=ClampToQuantum(QuantumRange*r);
02094     *green=ClampToQuantum(QuantumRange*g);
02095     *blue=ClampToQuantum(QuantumRange*b);
02096   }
02097 
02098   static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
02099     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02100   {
02101     float
02102       hue,
02103       lightness,
02104       saturation;
02105 
02106     /*
02107     Increase or decrease color lightness, saturation, or hue.
02108     */
02109     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
02110     hue+=0.5*(0.01*percent_hue-1.0);
02111     while (hue < 0.0)
02112       hue+=1.0;
02113     while (hue >= 1.0)
02114       hue-=1.0;
02115     saturation*=0.01*percent_saturation;
02116     lightness*=0.01*percent_lightness;
02117     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
02118   }
02119 
02120   __kernel void Modulate(__global CLPixelType *im, 
02121     const float percent_brightness, 
02122     const float percent_hue, 
02123     const float percent_saturation, 
02124     const int colorspace)
02125   {
02126 
02127     const int x = get_global_id(0);  
02128     const int y = get_global_id(1);
02129     const int columns = get_global_size(0);
02130     const int c = x + y * columns;
02131 
02132     CLPixelType pixel = im[c];
02133 
02134     CLQuantum
02135         blue,
02136         green,
02137         red;
02138 
02139     red=getRed(pixel);
02140     green=getGreen(pixel);
02141     blue=getBlue(pixel);
02142 
02143     switch (colorspace)
02144     {
02145       case HSLColorspace:
02146       default:
02147         {
02148           ModulateHSL(percent_hue, percent_saturation, percent_brightness, 
02149               &red, &green, &blue);
02150         }
02151 
02152     }
02153 
02154     CLPixelType filteredPixel;
02155    
02156     setRed(&filteredPixel, red);
02157     setGreen(&filteredPixel, green);
02158     setBlue(&filteredPixel, blue);
02159     filteredPixel.w = pixel.w;
02160 
02161     im[c] = filteredPixel;
02162   }
02163   )
02164 
02165 /*
02166 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02167 %                                                                             %
02168 %                                                                             %
02169 %                                                                             %
02170 %     M o t i o n B l u r                                                     %
02171 %                                                                             %
02172 %                                                                             %
02173 %                                                                             %
02174 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02175 */
02176 
02177   STRINGIFY(
02178     __kernel 
02179     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
02180                     const unsigned int imageWidth, const unsigned int imageHeight,
02181                     const __global float *filter, const unsigned int width, const __global int2* offset,
02182                     const float4 bias,
02183                     const ChannelType channel, const unsigned int matte) {
02184 
02185       int2 currentPixel;
02186       currentPixel.x = get_global_id(0);
02187       currentPixel.y = get_global_id(1);
02188 
02189       if (currentPixel.x >= imageWidth
02190           || currentPixel.y >= imageHeight)
02191           return;
02192 
02193       float4 pixel;
02194       pixel.x = (float)bias.x;
02195       pixel.y = (float)bias.y;
02196       pixel.z = (float)bias.z;
02197       pixel.w = (float)bias.w;
02198 
02199       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
02200         
02201         for (int i = 0; i < width; i++) {
02202           // only support EdgeVirtualPixelMethod through ClampToCanvas
02203           // TODO: implement other virtual pixel method
02204           int2 samplePixel = currentPixel + offset[i];
02205           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02206           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02207           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02208 
02209           pixel.x += (filter[i] * (float)samplePixelValue.x);
02210           pixel.y += (filter[i] * (float)samplePixelValue.y);
02211           pixel.z += (filter[i] * (float)samplePixelValue.z);
02212           pixel.w += (filter[i] * (float)samplePixelValue.w);
02213         }
02214 
02215         CLPixelType outputPixel;
02216         outputPixel.x = ClampToQuantum(pixel.x);
02217         outputPixel.y = ClampToQuantum(pixel.y);
02218         outputPixel.z = ClampToQuantum(pixel.z);
02219         outputPixel.w = ClampToQuantum(pixel.w);
02220         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02221       }
02222       else {
02223 
02224         float gamma = 0.0f;
02225         for (int i = 0; i < width; i++) {
02226           // only support EdgeVirtualPixelMethod through ClampToCanvas
02227           // TODO: implement other virtual pixel method
02228           int2 samplePixel = currentPixel + offset[i];
02229           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02230           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02231 
02232           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02233 
02234           float alpha = QuantumScale*samplePixelValue.w;
02235           float k = filter[i];
02236           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
02237           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
02238           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
02239 
02240           pixel.w += k * alpha * samplePixelValue.w;
02241 
02242           gamma+=k*alpha;
02243         }
02244         gamma = PerceptibleReciprocal(gamma);
02245         pixel.xyz = gamma*pixel.xyz;
02246 
02247         CLPixelType outputPixel;
02248         outputPixel.x = ClampToQuantum(pixel.x);
02249         outputPixel.y = ClampToQuantum(pixel.y);
02250         outputPixel.z = ClampToQuantum(pixel.z);
02251         outputPixel.w = ClampToQuantum(pixel.w);
02252         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02253       }
02254     }
02255   )
02256 
02257 /*
02258 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02259 %                                                                             %
02260 %                                                                             %
02261 %                                                                             %
02262 %     R e s i z e                                                             %
02263 %                                                                             %
02264 %                                                                             %
02265 %                                                                             %
02266 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02267 */
02268 
02269   STRINGIFY(
02270   // Based on Box from resize.c
02271   float BoxResizeFilter(const float x)
02272   {
02273     return 1.0f;
02274   }
02275   )
02276     
02277   STRINGIFY(
02278   // Based on CubicBC from resize.c
02279   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
02280   {
02281     /*
02282     Cubic Filters using B,C determined values:
02283     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
02284     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
02285     Spline              B = 1   C = 0    B-Spline Gaussian approximation
02286     Hermite             B = 0   C = 0    B-Spline interpolator
02287 
02288     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
02289     Graphics Computer Graphics, Volume 22, Number 4, August 1988
02290     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
02291     Mitchell.pdf.
02292 
02293     Coefficents are determined from B,C values:
02294     P0 = (  6 - 2*B       )/6 = coeff[0]
02295     P1 =         0
02296     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
02297     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
02298     Q0 = (      8*B +24*C )/6 = coeff[3]
02299     Q1 = (    -12*B -48*C )/6 = coeff[4]
02300     Q2 = (      6*B +30*C )/6 = coeff[5]
02301     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
02302 
02303     which are used to define the filter:
02304 
02305     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
02306     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
02307 
02308     which ensures function is continuous in value and derivative (slope).
02309     */
02310     if (x < 1.0)
02311       return(resizeFilterCoefficients[0]+x*(x*
02312       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
02313     if (x < 2.0)
02314       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
02315       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
02316     return(0.0);
02317   }
02318   )
02319 
02320   STRINGIFY(
02321   float Sinc(const float x)
02322   {
02323     if (x != 0.0f)
02324     {
02325       const float alpha=(float) (MagickPI*x);
02326       return sinpi(x)/alpha;
02327     }
02328     return(1.0f);
02329   }
02330   )
02331 
02332   STRINGIFY(
02333   float Triangle(const float x)
02334   {
02335     /*
02336     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
02337     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
02338     for Sinc().
02339     */
02340     return ((x<1.0f)?(1.0f-x):0.0f);
02341   }
02342   )
02343 
02344 
02345   STRINGIFY(
02346   float Hann(const float x)
02347   {
02348     /*
02349     Cosine window function:
02350       0.5+0.5*cos(pi*x).
02351     */
02352     const float cosine=cos((MagickPI*x));
02353     return(0.5f+0.5f*cosine);
02354   }
02355   )
02356 
02357   STRINGIFY(
02358   float Hamming(const float x)
02359   {
02360     /*
02361       Offset cosine window function:
02362        .54 + .46 cos(pi x).
02363     */
02364     const float cosine=cos((MagickPI*x));
02365     return(0.54f+0.46f*cosine);
02366   }
02367   )
02368 
02369   STRINGIFY(
02370   float Blackman(const float x)
02371   {
02372     /*
02373       Blackman: 2nd order cosine windowing function:
02374         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
02375 
02376       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
02377       five flops.
02378     */
02379     const float cosine=cos((MagickPI*x));
02380     return(0.34f+cosine*(0.5f+cosine*0.16f));
02381   }
02382   )
02383 
02384   STRINGIFY(
02385   static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
02386   {
02387     switch (filterType)
02388     {
02389     /* Call Sinc even for SincFast to get better precision on GPU 
02390        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
02391     case SincWeightingFunction:
02392     case SincFastWeightingFunction:
02393       return Sinc(x);
02394     case CubicBCWeightingFunction:
02395       return CubicBC(x,filterCoefficients);
02396     case BoxWeightingFunction:
02397       return BoxResizeFilter(x);
02398     case TriangleWeightingFunction:
02399       return Triangle(x);
02400     case HannWeightingFunction:
02401       return Hann(x);
02402     case HammingWeightingFunction:
02403       return Hamming(x);
02404     case BlackmanWeightingFunction:
02405       return Blackman(x);
02406 
02407     default:
02408       return 0.0f;
02409     }
02410   }
02411   )
02412 
02413 
02414   STRINGIFY(
02415   static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
02416            , const ResizeWeightingFunctionType resizeWindowType
02417            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
02418   {
02419     float scale;
02420     float xBlur = fabs(x/resizeFilterBlur);
02421     if (resizeWindowSupport < MagickEpsilon
02422         || resizeWindowType == BoxWeightingFunction)
02423     {
02424       scale = 1.0f;
02425     }
02426     else
02427     {
02428       scale = resizeFilterScale;
02429       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
02430     }
02431     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
02432     return weight;
02433   }
02434 
02435   )
02436 
02437   ;
02438   const char *accelerateKernels2 =
02439 
02440   STRINGIFY(
02441 
02442   static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02443     return (numWorkItems/pixelPerWorkgroup);
02444   }
02445 
02446   // returns the index of the pixel for the current workitem to compute.
02447   // returns -1 if this workitem doesn't need to participate in any computation
02448   static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02449     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
02450     int pixelIndex = itemID/numWorkItemsPerPixel;
02451     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
02452     return pixelIndex;
02453   }
02454  
02455   )
02456 
02457   STRINGIFY(
02458   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
02459     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
02460       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
02461       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
02462       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
02463       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
02464       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
02465       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
02466       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
02467   {
02468     // calculate the range of resized image pixels computed by this workgroup
02469     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
02470     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
02471     const unsigned int actualNumPixelToCompute = stopX - startX;
02472 
02473     // calculate the range of input image pixels to cache
02474     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
02475     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02476     scale = PerceptibleReciprocal(scale);
02477 
02478     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
02479     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
02480 
02481     // cache the input pixels into local memory
02482     const unsigned int y = get_global_id(1);
02483     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
02484     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
02485     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
02486     wait_group_events(1, &e);
02487 
02488     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
02489     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02490     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02491     {
02492       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
02493       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
02494       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
02495 
02496       // determine which resized pixel computed by this workitem
02497       const unsigned int itemID = get_local_id(0);
02498       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
02499 
02500       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
02501 
02502       float4 filteredPixel = (float4)0.0f;
02503       float density = 0.0f;
02504       float gamma = 0.0f;
02505       // -1 means this workitem doesn't participate in the computation
02506       if (pixelIndex != -1)
02507       {
02508         // x coordinated of the resized pixel computed by this workitem
02509         const int x = chunkStartX + pixelIndex;
02510 
02511         // calculate how many steps required for this pixel
02512         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
02513         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
02514         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
02515         const unsigned int n = stop - start;
02516 
02517         // calculate how many steps this workitem will contribute
02518         unsigned int numStepsPerWorkItem = n / numItems;
02519         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
02520 
02521         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
02522         if (startStep < n)
02523         {
02524           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
02525 
02526           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
02527           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
02528           {
02529             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
02530               (ResizeWeightingFunctionType) resizeFilterType,
02531               (ResizeWeightingFunctionType) resizeWindowType,
02532               resizeFilterScale, resizeFilterWindowSupport,
02533               resizeFilterBlur, scale*(start + i - bisect + 0.5));
02534 
02535             float4 cp = (float4)0.0f;
02536 
02537             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
02538             cp.x = (float) *(p);
02539             if (number_channels > 2)
02540             {
02541               cp.y = (float) *(p + 1);
02542               cp.z = (float) *(p + 2);
02543             }
02544 
02545             if (alpha_index != 0)
02546             {
02547               cp.w = (float) *(p + alpha_index);
02548 
02549               float alpha = weight * QuantumScale * cp.w;
02550 
02551               filteredPixel.x += alpha * cp.x;
02552               filteredPixel.y += alpha * cp.y;
02553               filteredPixel.z += alpha * cp.z;
02554               filteredPixel.w += weight * cp.w;
02555               gamma += alpha;
02556             }
02557             else
02558               filteredPixel += ((float4) weight)*cp;
02559 
02560             density += weight;
02561           }
02562         }
02563       }
02564 
02565       // initialize the accumulators to zero
02566       if (itemID < actualNumPixelInThisChunk) {
02567         outputPixelCache[itemID] = (float4)0.0f;
02568         densityCache[itemID] = 0.0f;
02569         if (alpha_index != 0)
02570           gammaCache[itemID] = 0.0f;
02571       }
02572       barrier(CLK_LOCAL_MEM_FENCE);
02573 
02574       // accumulatte the filtered pixel value and the density
02575       for (unsigned int i = 0; i < numItems; i++) {
02576         if (pixelIndex != -1) {
02577           if (itemID%numItems == i) {
02578             outputPixelCache[pixelIndex]+=filteredPixel;
02579             densityCache[pixelIndex]+=density;
02580             if (alpha_index != 0)
02581               gammaCache[pixelIndex]+=gamma;
02582           }
02583         }
02584         barrier(CLK_LOCAL_MEM_FENCE);
02585       }
02586 
02587       if (itemID < actualNumPixelInThisChunk)
02588       {
02589         float4 filteredPixel = outputPixelCache[itemID];
02590 
02591         float gamma = 0.0f;
02592         if (alpha_index != 0)
02593           gamma = gammaCache[itemID];
02594 
02595         float density = densityCache[itemID];
02596         if ((density != 0.0f) && (density != 1.0f))
02597         {
02598           density = PerceptibleReciprocal(density);
02599           filteredPixel *= (float4) density;
02600           if (alpha_index != 0)
02601             gamma *= density;
02602         }
02603 
02604         if (alpha_index != 0)
02605         {
02606           gamma = PerceptibleReciprocal(gamma);
02607           filteredPixel.x *= gamma;
02608           filteredPixel.y *= gamma;
02609           filteredPixel.z *= gamma;
02610         }
02611 
02612         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
02613       }
02614     }
02615   }
02616   )
02617 
02618 
02619   STRINGIFY(
02620  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
02621     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
02622       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
02623       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
02624       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
02625       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
02626       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
02627       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
02628       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
02629   {
02630     // calculate the range of resized image pixels computed by this workgroup
02631     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
02632     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
02633     const unsigned int actualNumPixelToCompute = stopY - startY;
02634 
02635     // calculate the range of input image pixels to cache
02636     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
02637     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02638     scale = PerceptibleReciprocal(scale);
02639 
02640     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
02641     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
02642 
02643     // cache the input pixels into local memory
02644     const unsigned int x = get_global_id(0);
02645     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
02646     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
02647     unsigned int stride = inputColumns * number_channels;
02648     for (unsigned int i = 0; i < number_channels; i++)
02649     {
02650       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
02651       wait_group_events(1,&e);
02652     }
02653 
02654     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
02655     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02656     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02657     {
02658       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
02659       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
02660       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
02661 
02662       // determine which resized pixel computed by this workitem
02663       const unsigned int itemID = get_local_id(1);
02664       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
02665 
02666       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
02667 
02668       float4 filteredPixel = (float4)0.0f;
02669       float density = 0.0f;
02670       float gamma = 0.0f;
02671       // -1 means this workitem doesn't participate in the computation
02672       if (pixelIndex != -1)
02673       {
02674         // x coordinated of the resized pixel computed by this workitem
02675         const int y = chunkStartY + pixelIndex;
02676 
02677         // calculate how many steps required for this pixel
02678         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
02679         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
02680         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
02681         const unsigned int n = stop - start;
02682 
02683         // calculate how many steps this workitem will contribute
02684         unsigned int numStepsPerWorkItem = n / numItems;
02685         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
02686 
02687         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
02688         if (startStep < n)
02689         {
02690           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
02691 
02692           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
02693           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
02694           {
02695             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
02696               (ResizeWeightingFunctionType) resizeFilterType,
02697               (ResizeWeightingFunctionType) resizeWindowType,
02698               resizeFilterScale, resizeFilterWindowSupport,
02699               resizeFilterBlur, scale*(start + i - bisect + 0.5));
02700 
02701             float4 cp = (float4)0.0f;
02702 
02703             __local CLQuantum *p = inputImageCache + cacheIndex;
02704             cp.x = (float) *(p);
02705             if (number_channels > 2)
02706             {
02707               cp.y = (float) *(p + rangeLength);
02708               cp.z = (float) *(p + (rangeLength * 2));
02709             }
02710 
02711             if (alpha_index != 0)
02712             {
02713               cp.w = (float) *(p + (rangeLength * alpha_index));
02714 
02715               float alpha = weight * QuantumScale * cp.w;
02716 
02717               filteredPixel.x += alpha * cp.x;
02718               filteredPixel.y += alpha * cp.y;
02719               filteredPixel.z += alpha * cp.z;
02720               filteredPixel.w += weight * cp.w;
02721               gamma += alpha;
02722             }
02723             else
02724               filteredPixel += ((float4) weight)*cp;
02725 
02726             density += weight;
02727           }
02728         }
02729       }
02730 
02731       // initialize the accumulators to zero
02732       if (itemID < actualNumPixelInThisChunk) {
02733         outputPixelCache[itemID] = (float4)0.0f;
02734         densityCache[itemID] = 0.0f;
02735         if (alpha_index != 0)
02736           gammaCache[itemID] = 0.0f;
02737       }
02738       barrier(CLK_LOCAL_MEM_FENCE);
02739 
02740       // accumulatte the filtered pixel value and the density
02741       for (unsigned int i = 0; i < numItems; i++) {
02742         if (pixelIndex != -1) {
02743           if (itemID%numItems == i) {
02744             outputPixelCache[pixelIndex]+=filteredPixel;
02745             densityCache[pixelIndex]+=density;
02746             if (alpha_index != 0)
02747               gammaCache[pixelIndex]+=gamma;
02748           }
02749         }
02750         barrier(CLK_LOCAL_MEM_FENCE);
02751       }
02752 
02753       if (itemID < actualNumPixelInThisChunk)
02754       {
02755         float4 filteredPixel = outputPixelCache[itemID];
02756 
02757         float gamma = 0.0f;
02758         if (alpha_index != 0)
02759           gamma = gammaCache[itemID];
02760 
02761         float density = densityCache[itemID];
02762         if ((density != 0.0f) && (density != 1.0f))
02763         {
02764           density = PerceptibleReciprocal(density);
02765           filteredPixel *= (float4) density;
02766           if (alpha_index != 0)
02767             gamma *= density;
02768         }
02769 
02770         if (alpha_index != 0)
02771         {
02772           gamma = PerceptibleReciprocal(gamma);
02773           filteredPixel.x *= gamma;
02774           filteredPixel.y *= gamma;
02775           filteredPixel.z *= gamma;
02776         }
02777 
02778         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
02779       }
02780     }
02781   }
02782   )
02783 
02784 /*
02785 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02786 %                                                                             %
02787 %                                                                             %
02788 %                                                                             %
02789 %     R o t a t i o n a l B l u r                                             %
02790 %                                                                             %
02791 %                                                                             %
02792 %                                                                             %
02793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02794 */
02795 
02796   STRINGIFY(
02797   __kernel void RotationalBlur(const __global CLQuantum *image,
02798     const unsigned int number_channels,const unsigned int channel,
02799     const float2 blurCenter,__constant float *cos_theta,
02800     __constant float *sin_theta,const unsigned int cossin_theta_size,
02801     __global CLQuantum *filteredImage)
02802   {
02803     const int x = get_global_id(0);
02804     const int y = get_global_id(1);
02805     const int columns = get_global_size(0);
02806     const int rows = get_global_size(1);
02807     unsigned int step = 1;
02808     float center_x = (float) x - blurCenter.x;
02809     float center_y = (float) y - blurCenter.y;
02810     float radius = hypot(center_x, center_y);
02811 
02812     float blur_radius = hypot(blurCenter.x, blurCenter.y);
02813 
02814     if (radius > MagickEpsilon)
02815     {
02816       step = (unsigned int) (blur_radius / radius);
02817       if (step == 0)
02818         step = 1;
02819       if (step >= cossin_theta_size)
02820         step = cossin_theta_size-1;
02821     }
02822 
02823     float4 result = 0.0f;
02824     float normalize = 0.0f;
02825     float gamma = 0.0f;
02826 
02827     for (unsigned int i=0; i<cossin_theta_size; i+=step)
02828     {
02829       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
02830       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
02831 
02832       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
02833 
02834       if ((number_channels == 4) || (number_channels == 2))
02835       {
02836         float alpha = (float)(QuantumScale*pixel.w);
02837 
02838         gamma += alpha;
02839 
02840         result.x += alpha * pixel.x;
02841         result.y += alpha * pixel.y;
02842         result.z += alpha * pixel.z;
02843         result.w += pixel.w;
02844       }
02845       else
02846         result += pixel;
02847 
02848       normalize += 1.0f;
02849     }
02850 
02851     normalize = PerceptibleReciprocal(normalize);
02852 
02853     if ((number_channels == 4) || (number_channels == 2))
02854     {
02855       gamma = PerceptibleReciprocal(gamma);
02856       result.x *= gamma;
02857       result.y *= gamma;
02858       result.z *= gamma;
02859       result.w *= normalize;
02860     }
02861     else
02862       result *= normalize;
02863 
02864     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
02865   }
02866   )
02867 
02868 /*
02869 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02870 %                                                                             %
02871 %                                                                             %
02872 %                                                                             %
02873 %     U n s h a r p M a s k                                                   %
02874 %                                                                             %
02875 %                                                                             %
02876 %                                                                             %
02877 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02878 */
02879 
02880   STRINGIFY(
02881   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
02882     const __global float4 *blurRowData,const unsigned int number_channels,
02883     const ChannelType channel,const unsigned int columns,
02884     const unsigned int rows,__local float4* cachedData,
02885     __local float* cachedFilter,const __global float *filter,
02886     const unsigned int width,const float gain, const float threshold,
02887     __global CLQuantum *filteredImage)
02888   {
02889     const unsigned int radius = (width-1)/2;
02890 
02891     // cache the pixel shared by the workgroup
02892     const int groupX = get_group_id(0);
02893     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
02894     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
02895 
02896     if ((groupStartY >= 0) && (groupStopY < rows))
02897     {
02898       event_t e = async_work_group_strided_copy(cachedData,
02899         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
02900       wait_group_events(1,&e);
02901     }
02902     else
02903     {
02904       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
02905         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
02906 
02907       barrier(CLK_LOCAL_MEM_FENCE);
02908     }
02909     // cache the filter as well
02910     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
02911     wait_group_events(1,&e);
02912 
02913     // only do the work if this is not a patched item
02914     const int cy = get_global_id(1);
02915 
02916     if (cy < rows)
02917     {
02918       float4 blurredPixel = (float4) 0.0f;
02919 
02920       int i = 0;
02921 
02922       for ( ; i+7 < width; )
02923       {
02924         for (int j=0; j < 8; j++, i++)
02925           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
02926       }
02927 
02928       for ( ; i < width; i++)
02929         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
02930 
02931       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
02932       float4 outputPixel = inputImagePixel - blurredPixel;
02933 
02934       float quantumThreshold = QuantumRange*threshold;
02935 
02936       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
02937       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
02938 
02939       //write back
02940       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
02941     }
02942   }
02943   )
02944 
02945   STRINGIFY(
02946   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
02947     const ChannelType channel,__constant float *filter,const unsigned int width,
02948     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
02949     const float gain,const float threshold,__global CLQuantum *filteredImage)
02950   {
02951     const unsigned int x = get_global_id(0);
02952     const unsigned int y = get_global_id(1);
02953 
02954     const unsigned int radius = (width - 1) / 2;
02955 
02956     int row = y - radius;
02957     int baseRow = get_group_id(1) * get_local_size(1) - radius;
02958     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
02959 
02960     while (row < endRow) {
02961       int srcy = (row < 0) ? -row : row; // mirror pad
02962       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
02963 
02964       float4 value = 0.0f;
02965 
02966       int ix = x - radius;
02967       int i = 0;
02968 
02969       while (i + 7 < width) {
02970         for (int j = 0; j < 8; ++j) { // unrolled
02971           int srcx = ix + j;
02972           srcx = (srcx < 0) ? -srcx : srcx;
02973           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
02974           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
02975         }
02976         ix += 8;
02977         i += 8;
02978       }
02979 
02980       while (i < width) {
02981         int srcx = (ix < 0) ? -ix : ix; // mirror pad
02982         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
02983         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
02984         ++i;
02985         ++ix;
02986       }
02987       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
02988       row += get_local_size(1);
02989     }
02990 
02991     barrier(CLK_LOCAL_MEM_FENCE);
02992 
02993     const int px = get_local_id(0);
02994     const int py = get_local_id(1);
02995     const int prp = get_local_size(0);
02996     float4 value = (float4)(0.0f);
02997 
02998     int i = 0;
02999     while (i + 7 < width) {
03000       for (int j = 0; j < 8; ++j) // unrolled
03001         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
03002       i += 8;
03003     }
03004     while (i < width) {
03005       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
03006       ++i;
03007     }
03008 
03009     if ((x < columns) && (y < rows)) {
03010       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
03011       float4 diff = srcPixel - value;
03012 
03013       float quantumThreshold = QuantumRange*threshold;
03014 
03015       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
03016       value = select(srcPixel + diff * gain, srcPixel, mask);
03017 
03018       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
03019     }
03020   }
03021   )
03022 
03023   STRINGIFY(
03024     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
03025     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
03026       const unsigned int number_channels,const unsigned int max_channels,
03027       const float threshold,const int passes,const unsigned int imageWidth,
03028       const unsigned int imageHeight)
03029   {
03030     const int pad = (1 << (passes - 1));
03031     const int tileSize = 64;
03032     const int tileRowPixels = 64;
03033     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
03034 
03035     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
03036 
03037     local float buffer[64 * 64];
03038 
03039     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
03040     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
03041 
03042     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03043       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
03044                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
03045 
03046       for (int channel = 0; channel < max_channels; ++channel)
03047         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
03048     }
03049 
03050     for (int channel = 0; channel < max_channels; ++channel) {
03051       // Load LDS
03052       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03053         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
03054 
03055       // Process
03056 
03057       float tmp[16];
03058       float accum[16];
03059       float pixel;
03060 
03061       for (int i = 0; i < 16; i++)
03062         accum[i]=0.0f;
03063 
03064       for (int pass = 0; pass < passes; ++pass) {
03065         const int radius = 1 << pass;
03066         const int x = get_local_id(0);
03067         const float thresh = threshold * noise[pass];
03068 
03069         // Apply horizontal hat
03070         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03071           const int offset = i * tileRowPixels;
03072           if (pass == 0)
03073             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
03074           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
03075           barrier(CLK_LOCAL_MEM_FENCE);
03076           buffer[x + offset] = pixel;
03077         }
03078         barrier(CLK_LOCAL_MEM_FENCE);
03079 
03080         // Apply vertical hat
03081         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03082           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
03083           float delta = tmp[i / 4] - pixel;
03084           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
03085           if (delta < -thresh)
03086             delta += thresh;
03087           else if (delta > thresh)
03088             delta -= thresh;
03089           else
03090             delta = 0;
03091           accum[i / 4] += delta;
03092         }
03093         barrier(CLK_LOCAL_MEM_FENCE);
03094 
03095         if (pass < passes - 1)
03096           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03097             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
03098         else  // last pass
03099           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03100             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
03101         barrier(CLK_LOCAL_MEM_FENCE);
03102       }
03103 
03104       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03105         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
03106 
03107       barrier(CLK_LOCAL_MEM_FENCE);
03108     }
03109 
03110     // Write from stage to output
03111 
03112     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
03113       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03114         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
03115           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
03116           for (int channel = 0; channel < max_channels; ++channel) {
03117             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
03118           }
03119         }
03120       }
03121     }
03122   }
03123   )
03124 
03125   ;
03126 
03127 #endif // MAGICKCORE_OPENCL_SUPPORT
03128 
03129 #if defined(__cplusplus) || defined(c_plusplus)
03130 }
03131 #endif
03132 
03133 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H

Generated on 2 Dec 2019 for MagickCore by  doxygen 1.6.1