accelerate-kernels-private.h

Go to the documentation of this file.
00001 /*
00002   Copyright 1999-2019 ImageMagick Studio LLC, a non-profit organization
00003   dedicated to making software imaging solutions freely available.
00004   
00005   You may not use this file except in compliance with the License.  You may
00006   obtain a copy of the License at
00007   
00008     https://imagemagick.org/script/license.php
00009   
00010   Unless required by applicable law or agreed to in writing, software
00011   distributed under the License is distributed on an "AS IS" BASIS,
00012   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013   See the License for the specific language governing permissions and
00014   limitations under the License.
00015 
00016   MagickCore private methods for accelerated functions.
00017 */
00018 
00019 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00020 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
00021 
00022 #if defined(__cplusplus) || defined(c_plusplus)
00023 extern "C" {
00024 #endif
00025 
00026 #if defined(MAGICKCORE_OPENCL_SUPPORT)
00027 
00028 /*
00029   Define declarations.
00030 */
00031 #define OPENCL_DEFINE(VAR,...)  "\n #""define " #VAR " " #__VA_ARGS__ " \n"
00032 #define OPENCL_ELIF(...)        "\n #""elif " #__VA_ARGS__ " \n"
00033 #define OPENCL_ELSE()           "\n #""else " " \n"
00034 #define OPENCL_ENDIF()          "\n #""endif " " \n"
00035 #define OPENCL_IF(...)          "\n #""if " #__VA_ARGS__ " \n"
00036 #define STRINGIFY(...) #__VA_ARGS__ "\n"
00037 
00038 const char* accelerateKernels =
00039 
00040 /*
00041   Define declarations.
00042 */
00043   OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
00044   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
00045   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
00046   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
00047   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
00048   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
00049   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
00050   OPENCL_DEFINE(SigmaRandom, (attenuate))
00051   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
00052   OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
00053   OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
00054 
00055 /*
00056   Typedef declarations.
00057 */
00058   STRINGIFY(
00059   typedef enum
00060   {
00061     UndefinedColorspace,
00062     RGBColorspace,            /* Linear RGB colorspace */
00063     GRAYColorspace,           /* greyscale (non-linear) image (faked 1 channel) */
00064     TransparentColorspace,
00065     OHTAColorspace,
00066     LabColorspace,
00067     XYZColorspace,
00068     YCbCrColorspace,
00069     YCCColorspace,
00070     YIQColorspace,
00071     YPbPrColorspace,
00072     YUVColorspace,
00073     CMYKColorspace,           /* negared linear RGB with black separated */
00074     sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
00075     HSBColorspace,
00076     HSLColorspace,
00077     HWBColorspace,
00078     Rec601LumaColorspace,
00079     Rec601YCbCrColorspace,
00080     Rec709LumaColorspace,
00081     Rec709YCbCrColorspace,
00082     LogColorspace,
00083     CMYColorspace,            /* negated linear RGB colorspace */
00084     LuvColorspace,
00085     HCLColorspace,
00086     LCHColorspace,            /* alias for LCHuv */
00087     LMSColorspace,
00088     LCHabColorspace,          /* Cylindrical (Polar) Lab */
00089     LCHuvColorspace,          /* Cylindrical (Polar) Luv */
00090     scRGBColorspace,
00091     HSIColorspace,
00092     HSVColorspace,            /* alias for HSB */
00093     HCLpColorspace,
00094     YDbDrColorspace,
00095     LinearGRAYColorspace      /* greyscale (linear) image (faked 1 channel) */
00096   } ColorspaceType;
00097   )
00098 
00099   STRINGIFY(
00100     typedef enum
00101     {
00102       UndefinedCompositeOp,
00103       NoCompositeOp,
00104       ModulusAddCompositeOp,
00105       AtopCompositeOp,
00106       BlendCompositeOp,
00107       BumpmapCompositeOp,
00108       ChangeMaskCompositeOp,
00109       ClearCompositeOp,
00110       ColorBurnCompositeOp,
00111       ColorDodgeCompositeOp,
00112       ColorizeCompositeOp,
00113       CopyBlackCompositeOp,
00114       CopyBlueCompositeOp,
00115       CopyCompositeOp,
00116       CopyCyanCompositeOp,
00117       CopyGreenCompositeOp,
00118       CopyMagentaCompositeOp,
00119       CopyOpacityCompositeOp,
00120       CopyRedCompositeOp,
00121       CopyYellowCompositeOp,
00122       DarkenCompositeOp,
00123       DstAtopCompositeOp,
00124       DstCompositeOp,
00125       DstInCompositeOp,
00126       DstOutCompositeOp,
00127       DstOverCompositeOp,
00128       DifferenceCompositeOp,
00129       DisplaceCompositeOp,
00130       DissolveCompositeOp,
00131       ExclusionCompositeOp,
00132       HardLightCompositeOp,
00133       HueCompositeOp,
00134       InCompositeOp,
00135       LightenCompositeOp,
00136       LinearLightCompositeOp,
00137       LuminizeCompositeOp,
00138       MinusDstCompositeOp,
00139       ModulateCompositeOp,
00140       MultiplyCompositeOp,
00141       OutCompositeOp,
00142       OverCompositeOp,
00143       OverlayCompositeOp,
00144       PlusCompositeOp,
00145       ReplaceCompositeOp,
00146       SaturateCompositeOp,
00147       ScreenCompositeOp,
00148       SoftLightCompositeOp,
00149       SrcAtopCompositeOp,
00150       SrcCompositeOp,
00151       SrcInCompositeOp,
00152       SrcOutCompositeOp,
00153       SrcOverCompositeOp,
00154       ModulusSubtractCompositeOp,
00155       ThresholdCompositeOp,
00156       XorCompositeOp,
00157       /* These are new operators, added after the above was last sorted.
00158        * The list should be re-sorted only when a new library version is
00159        * created.
00160        */
00161       DivideDstCompositeOp,
00162       DistortCompositeOp,
00163       BlurCompositeOp,
00164       PegtopLightCompositeOp,
00165       VividLightCompositeOp,
00166       PinLightCompositeOp,
00167       LinearDodgeCompositeOp,
00168       LinearBurnCompositeOp,
00169       MathematicsCompositeOp,
00170       DivideSrcCompositeOp,
00171       MinusSrcCompositeOp,
00172       DarkenIntensityCompositeOp,
00173       LightenIntensityCompositeOp
00174     } CompositeOperator;
00175   )
00176 
00177   STRINGIFY(
00178      typedef enum
00179      {
00180        UndefinedFunction,
00181        PolynomialFunction,
00182        SinusoidFunction,
00183        ArcsinFunction,
00184        ArctanFunction
00185      } MagickFunction;
00186   )
00187 
00188   STRINGIFY(
00189     typedef enum
00190     {
00191       UndefinedNoise,
00192       UniformNoise,
00193       GaussianNoise,
00194       MultiplicativeGaussianNoise,
00195       ImpulseNoise,
00196       LaplacianNoise,
00197       PoissonNoise,
00198       RandomNoise
00199     } NoiseType;
00200   )
00201 
00202   STRINGIFY(
00203   typedef enum
00204   {
00205     UndefinedPixelIntensityMethod = 0,
00206     AveragePixelIntensityMethod,
00207     BrightnessPixelIntensityMethod,
00208     LightnessPixelIntensityMethod,
00209     Rec601LumaPixelIntensityMethod,
00210     Rec601LuminancePixelIntensityMethod,
00211     Rec709LumaPixelIntensityMethod,
00212     Rec709LuminancePixelIntensityMethod,
00213     RMSPixelIntensityMethod,
00214     MSPixelIntensityMethod
00215   } PixelIntensityMethod;
00216   )
00217 
00218   STRINGIFY(
00219   typedef enum {
00220     BoxWeightingFunction = 0,
00221     TriangleWeightingFunction,
00222     CubicBCWeightingFunction,
00223     HanningWeightingFunction,
00224     HammingWeightingFunction,
00225     BlackmanWeightingFunction,
00226     GaussianWeightingFunction,
00227     QuadraticWeightingFunction,
00228     JincWeightingFunction,
00229     SincWeightingFunction,
00230     SincFastWeightingFunction,
00231     KaiserWeightingFunction,
00232     WelshWeightingFunction,
00233     BohmanWeightingFunction,
00234     LagrangeWeightingFunction,
00235     CosineWeightingFunction,
00236   } ResizeWeightingFunctionType;
00237   )
00238 
00239   STRINGIFY(
00240      typedef enum
00241      {
00242        UndefinedChannel,
00243        RedChannel = 0x0001,
00244        GrayChannel = 0x0001,
00245        CyanChannel = 0x0001,
00246        GreenChannel = 0x0002,
00247        MagentaChannel = 0x0002,
00248        BlueChannel = 0x0004,
00249        YellowChannel = 0x0004,
00250        AlphaChannel = 0x0008,
00251        OpacityChannel = 0x0008,
00252        MatteChannel = 0x0008,     /* deprecated */
00253        BlackChannel = 0x0020,
00254        IndexChannel = 0x0020,
00255        CompositeChannels = 0x002F,
00256        AllChannels = 0x7ffffff,
00257        /*
00258        Special purpose channel types.
00259        */
00260        TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
00261        RGBChannels = 0x0080,      /* set alpha from  grayscale mask in RGB */
00262        GrayChannels = 0x0080,
00263        SyncChannels = 0x0100,     /* channels should be modified equally */
00264        DefaultChannels = ((AllChannels | SyncChannels) &~ OpacityChannel)
00265      } ChannelType;
00266   )
00267 
00268 /*
00269   Helper functions.
00270 */
00271 
00272 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
00273 
00274   STRINGIFY(
00275     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00276     {
00277       return((CLQuantum) value);
00278     }
00279   )
00280 
00281 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
00282 
00283   STRINGIFY(
00284     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00285     {
00286       return((CLQuantum) (257.0f*value));
00287     }
00288   )
00289 
00290 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
00291 
00292   STRINGIFY(
00293     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
00294     {
00295       return((CLQuantum) (16843009.0*value));
00296     }
00297   )
00298 
00299 OPENCL_ENDIF()
00300 
00301 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
00302 
00303   STRINGIFY(
00304     static inline CLQuantum ClampToQuantum(const float value)
00305     {
00306       return (CLQuantum) value;
00307     }
00308   )
00309 
00310 OPENCL_ELSE()
00311 
00312   STRINGIFY(
00313     static inline CLQuantum ClampToQuantum(const float value)
00314     {
00315       return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
00316     }
00317   )
00318 
00319 OPENCL_ENDIF()
00320 
00321   STRINGIFY(
00322     static inline int ClampToCanvas(const int offset, const int range)
00323     {
00324       return clamp(offset, (int)0, range - 1);
00325     }
00326   )
00327 
00328   STRINGIFY(
00329     static inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
00330     {
00331       return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
00332     }
00333   )
00334 
00335   STRINGIFY(
00336     static inline uint ScaleQuantumToMap(CLQuantum value)
00337     {
00338       if (value >= (CLQuantum)MaxMap)
00339         return ((uint)MaxMap);
00340       else
00341         return ((uint)value);
00342     }
00343   )
00344 
00345   STRINGIFY(
00346     static inline float PerceptibleReciprocal(const float x)
00347     {
00348       float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
00349       return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
00350     }
00351   )
00352 
00353   STRINGIFY(
00354     static inline float RoundToUnity(const float value)
00355     {
00356       return clamp(value, 0.0f, 1.0f);
00357     }
00358   )
00359 
00360     STRINGIFY(
00361 
00362       static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
00363   static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
00364   static inline float getBlueF4(float4 p) { return p.x; }
00365   static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
00366 
00367   static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
00368   static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
00369   static inline float getGreenF4(float4 p) { return p.y; }
00370   static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
00371 
00372   static inline CLQuantum getRed(CLPixelType p) { return p.z; }
00373   static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
00374   static inline float getRedF4(float4 p) { return p.z; }
00375   static inline void setRedF4(float4* p, float value) { (*p).z = value; }
00376 
00377   static inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
00378   static inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
00379   static inline float getOpacityF4(float4 p) { return p.w; }
00380   static inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
00381 
00382   static inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
00383 
00384   static inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
00385   {
00386     float red = getRed(p);
00387     float green = getGreen(p);
00388     float blue = getBlue(p);
00389 
00390     float intensity;
00391 
00392     if (colorspace == GRAYColorspace)
00393       return red;
00394 
00395     switch (method)
00396     {
00397     case AveragePixelIntensityMethod:
00398     {
00399       intensity = (red + green + blue) / 3.0;
00400       break;
00401     }
00402     case BrightnessPixelIntensityMethod:
00403     {
00404       intensity = MagickMax(MagickMax(red, green), blue);
00405       break;
00406     }
00407     case LightnessPixelIntensityMethod:
00408     {
00409       intensity = (MagickMin(MagickMin(red, green), blue) +
00410         MagickMax(MagickMax(red, green), blue)) / 2.0;
00411       break;
00412     }
00413     case MSPixelIntensityMethod:
00414     {
00415       intensity = (float)(((float)red*red + green*green + blue*blue) /
00416         (3.0*QuantumRange));
00417       break;
00418     }
00419     case Rec601LumaPixelIntensityMethod:
00420     {
00421       /*
00422       if (image->colorspace == RGBColorspace)
00423       {
00424       red=EncodePixelGamma(red);
00425       green=EncodePixelGamma(green);
00426       blue=EncodePixelGamma(blue);
00427       }
00428       */
00429       intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
00430       break;
00431     }
00432     case Rec601LuminancePixelIntensityMethod:
00433     {
00434       /*
00435       if (image->colorspace == sRGBColorspace)
00436       {
00437       red=DecodePixelGamma(red);
00438       green=DecodePixelGamma(green);
00439       blue=DecodePixelGamma(blue);
00440       }
00441       */
00442       intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
00443       break;
00444     }
00445     case Rec709LumaPixelIntensityMethod:
00446     default:
00447     {
00448       /*
00449       if (image->colorspace == RGBColorspace)
00450       {
00451       red=EncodePixelGamma(red);
00452       green=EncodePixelGamma(green);
00453       blue=EncodePixelGamma(blue);
00454       }
00455       */
00456       intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
00457       break;
00458     }
00459     case Rec709LuminancePixelIntensityMethod:
00460     {
00461       /*
00462       if (image->colorspace == sRGBColorspace)
00463       {
00464       red=DecodePixelGamma(red);
00465       green=DecodePixelGamma(green);
00466       blue=DecodePixelGamma(blue);
00467       }
00468       */
00469       intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
00470       break;
00471     }
00472     case RMSPixelIntensityMethod:
00473     {
00474       intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
00475         sqrt(3.0));
00476       break;
00477     }
00478     }
00479 
00480     return intensity;
00481 
00482   }
00483   )
00484 
00485 /*
00486 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00487 %                                                                             %
00488 %                                                                             %
00489 %                                                                             %
00490 %     A d d N o i s e                                                         %
00491 %                                                                             %
00492 %                                                                             %
00493 %                                                                             %
00494 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00495 */
00496 
00497 STRINGIFY(
00498 
00499 /*
00500 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
00501 This is provided under BSD, full license is with the main package.
00502 See http://www.doc.ic.ac.uk/~dt10/research
00503 */
00504 
00505 // Pre: a<M, b<M
00506 // Post: r=(a+b) mod M
00507 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
00508 {
00509         ulong v=a+b;
00510         //if( (v>=M) || (v<a) )
00511         if( (v>=M) || (convert_float(v) < convert_float(a)) )   // workaround for what appears to be an optimizer bug.
00512                 v=v-M;
00513         return v;
00514 }
00515 
00516 // Pre: a<M,b<M
00517 // Post: r=(a*b) mod M
00518 // This could be done more efficently, but it is portable, and should
00519 // be easy to understand. It can be replaced with any of the better
00520 // modular multiplication algorithms (for example if you know you have
00521 // double precision available or something).
00522 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
00523 {       
00524         ulong r=0;
00525         while(a!=0){
00526                 if(a&1)
00527                         r=MWC_AddMod64(r,b,M);
00528                 b=MWC_AddMod64(b,b,M);
00529                 a=a>>1;
00530         }
00531         return r;
00532 }
00533 
00534 
00535 // Pre: a<M, e>=0
00536 // Post: r=(a^b) mod M
00537 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
00538 // most architectures
00539 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
00540 {
00541         ulong sqr=a, acc=1;
00542         while(e!=0){
00543                 if(e&1)
00544                         acc=MWC_MulMod64(acc,sqr,M);
00545                 sqr=MWC_MulMod64(sqr,sqr,M);
00546                 e=e>>1;
00547         }
00548         return acc;
00549 }
00550 
00551 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
00552 {
00553         ulong m=MWC_PowMod64(A, distance, M);
00554         ulong x=curr.x*(ulong)A+curr.y;
00555         x=MWC_MulMod64(x, m, M);
00556         return (uint2)((uint)(x/A), (uint)(x%A));
00557 }
00558 
00559 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
00560 {
00561         // This is an arbitrary constant for starting LCG jumping from. I didn't
00562         // want to start from 1, as then you end up with the two or three first values
00563         // being a bit poor in ones - once you've decided that, one constant is as
00564         // good as any another. There is no deep mathematical reason for it, I just
00565         // generated a random number.
00566         enum{ MWC_BASEID = 4077358422479273989UL };
00567         
00568         ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
00569         ulong m=MWC_PowMod64(A, dist, M);
00570         
00571         ulong x=MWC_MulMod64(MWC_BASEID, m, M);
00572         return (uint2)((uint)(x/A), (uint)(x%A));
00573 }
00574 
00576 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
00577 
00578 void MWC64X_Step(mwc64x_state_t *s)
00579 {
00580         uint X=s->x, C=s->c;
00581         
00582         uint Xn=s->seed0*X+C;
00583         uint carry=(uint)(Xn<C);                                // The (Xn<C) will be zero or one for scalar
00584         uint Cn=mad_hi(s->seed0,X,carry);  
00585         
00586         s->x=Xn;
00587         s->c=Cn;
00588 }
00589 
00590 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
00591 {
00592         uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
00593         s->x=tmp.x;
00594         s->c=tmp.y;
00595 }
00596 
00597 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
00598 {
00599         uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
00600         s->x=tmp.x;
00601         s->c=tmp.y;
00602 }
00603 
00605 uint MWC64X_NextUint(mwc64x_state_t *s)
00606 {
00607         uint res=s->x ^ s->c;
00608         MWC64X_Step(s);
00609         return res;
00610 }
00611 
00612 //
00613 // End of MWC64X excerpt
00614 //
00615 
00616   float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
00617         return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff);     // normalized to 1.0
00618   }
00619 
00620   
00621   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
00622  
00623     float 
00624       alpha,
00625       beta,
00626       noise,
00627       sigma;
00628 
00629     noise = 0.0f;
00630     alpha=mwcReadPseudoRandomValue(r);
00631     switch(noise_type) {
00632     case UniformNoise:
00633     default:
00634       {
00635         noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
00636         break;
00637       }
00638     case GaussianNoise:
00639       {
00640         float
00641           gamma,
00642           tau;
00643 
00644         if (alpha == 0.0f)
00645           alpha=1.0f;
00646         beta=mwcReadPseudoRandomValue(r);
00647         gamma=sqrt(-2.0f*log(alpha));
00648         sigma=gamma*cospi((2.0f*beta));
00649         tau=gamma*sinpi((2.0f*beta));
00650         noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
00651                       QuantumRange*TauGaussian*tau);        
00652         break;
00653       }
00654 
00655 
00656     case ImpulseNoise:
00657     {
00658       if (alpha < (SigmaImpulse/2.0f))
00659         noise=0.0f;
00660       else
00661         if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
00662           noise=(float)QuantumRange;
00663         else
00664           noise=(float)pixel;
00665       break;
00666     }
00667     case LaplacianNoise:
00668     {
00669       if (alpha <= 0.5f)
00670         {
00671           if (alpha <= MagickEpsilon)
00672             noise=(float) (pixel-QuantumRange);
00673           else
00674             noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
00675               0.5f);
00676           break;
00677         }
00678       beta=1.0f-alpha;
00679       if (beta <= (0.5f*MagickEpsilon))
00680         noise=(float) (pixel+QuantumRange);
00681       else
00682         noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
00683       break;
00684     }
00685     case MultiplicativeGaussianNoise:
00686     {
00687       sigma=1.0f;
00688       if (alpha > MagickEpsilon)
00689         sigma=sqrt(-2.0f*log(alpha));
00690       beta=mwcReadPseudoRandomValue(r);
00691       noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
00692         cospi((float) (2.0f*beta))/2.0f);
00693       break;
00694     }
00695     case PoissonNoise:
00696     {
00697       float 
00698         poisson;
00699       unsigned int i;
00700       poisson=exp(-SigmaPoisson*QuantumScale*pixel);
00701       for (i=0; alpha > poisson; i++)
00702       {
00703         beta=mwcReadPseudoRandomValue(r);
00704         alpha*=beta;
00705       }
00706       noise=(float) (QuantumRange*i/SigmaPoisson);
00707       break;
00708     }
00709     case RandomNoise:
00710     {
00711       noise=(float) (QuantumRange*SigmaRandom*alpha);
00712       break;
00713     }
00714 
00715     };
00716     return noise;
00717   }
00718 
00719   __kernel
00720   void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
00721                     ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
00722                     ,const ChannelType channel 
00723                     ,const NoiseType noise_type, const float attenuate
00724                     ,const unsigned int seed0, const unsigned int seed1
00725                                         ,const unsigned int numRandomNumbersPerPixel) {
00726 
00727         mwc64x_state_t rng;
00728         rng.seed0 = seed0;
00729         rng.seed1 = seed1;
00730 
00731         uint span = pixelsPerWorkItem * numRandomNumbersPerPixel;       // length of RNG substream each workitem will use
00732         uint offset = span * get_local_size(0) * get_group_id(0);       // offset of this workgroup's RNG substream (in master stream);
00733 
00734         MWC64X_SeedStreams(&rng, offset, span);                                         // Seed the RNG streams
00735 
00736         uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0);   // pixel to process
00737 
00738         uint count = pixelsPerWorkItem;
00739 
00740         while (count > 0) {
00741                 if (pos < inputPixelCount) {
00742                         CLPixelType p = inputImage[pos];
00743 
00744                         if ((channel&RedChannel)!=0) {
00745                           setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
00746                         }
00747     
00748                         if ((channel&GreenChannel)!=0) {
00749                           setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
00750                         }
00751 
00752                         if ((channel&BlueChannel)!=0) {
00753                           setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
00754                         }
00755 
00756                         if ((channel & OpacityChannel) != 0) {
00757                           setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
00758                         }
00759 
00760                         filteredImage[pos] = p;
00761                         //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
00762                 }
00763                 pos += get_local_size(0);
00764                 --count;
00765         }
00766   }
00767   )
00768 
00769 /*
00770 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00771 %                                                                             %
00772 %                                                                             %
00773 %                                                                             %
00774 %    B l u r                                                                  %
00775 %                                                                             %
00776 %                                                                             %
00777 %                                                                             %
00778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00779 */
00780 
00781     STRINGIFY(
00782       /*
00783       Reduce image noise and reduce detail levels by row
00784       im: input pixels filtered_in  filtered_im: output pixels
00785       filter : convolve kernel  width: convolve kernel size
00786       channel : define which channel is blured
00787       is_RGBA_BGRA : define the input is RGBA or BGRA
00788       */
00789       __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
00790                          const ChannelType channel, __constant float *filter,
00791                          const unsigned int width, 
00792                          const unsigned int imageColumns, const unsigned int imageRows,
00793                          __local CLPixelType *temp)
00794       {
00795         const int x = get_global_id(0);  
00796         const int y = get_global_id(1);  
00797 
00798         const int columns = imageColumns;  
00799 
00800         const unsigned int radius = (width-1)/2;
00801         const int wsize = get_local_size(0);  
00802         const unsigned int loadSize = wsize+width;
00803 
00804         //load chunk only for now
00805         //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
00806         //wait_group_events(1,&e);
00807 
00808         //parallel load and clamp
00809         /*
00810         int count = 0;
00811         for (int i=0; i < loadSize; i=i+wsize)
00812         {
00813           int currentX = x + wsize*(count++);
00814 
00815           int localId = get_local_id(0);
00816 
00817           if ((localId+i) > loadSize)
00818             break;
00819 
00820           temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
00821 
00822           if (y==0 && get_group_id(0) == 0)
00823           {
00824             printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
00825           }
00826         }
00827         */
00828 
00829         //group coordinate
00830         const int groupX=get_local_size(0)*get_group_id(0);
00831         const int groupY=get_local_size(1)*get_group_id(1);
00832 
00833         //parallel load and clamp
00834         for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
00835         {
00836           //int cx = ClampToCanvas(groupX+i, columns);
00837           temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
00838 
00839           /*if (0 && y==0 && get_group_id(1) == 0)
00840           {
00841             printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
00842           }*/
00843         }
00844 
00845         // barrier        
00846         barrier(CLK_LOCAL_MEM_FENCE);
00847 
00848         // only do the work if this is not a patched item
00849         if (get_global_id(0) < columns) 
00850         {
00851           // compute
00852           float4 result = (float4) 0;
00853 
00854           int i = 0;
00855           
00856           \n #ifndef UFACTOR   \n 
00857           \n #define UFACTOR 8 \n 
00858           \n #endif                  \n 
00859 
00860           for ( ; i+UFACTOR < width; ) 
00861           {
00862             \n #pragma unroll UFACTOR\n
00863             for (int j=0; j < UFACTOR; j++, i++)
00864             {
00865               result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
00866             }
00867           }
00868 
00869           for ( ; i < width; i++)
00870           {
00871             result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
00872           }
00873 
00874           result.x = ClampToQuantum(result.x);
00875           result.y = ClampToQuantum(result.y);
00876           result.z = ClampToQuantum(result.z);
00877           result.w = ClampToQuantum(result.w);
00878 
00879           // write back to global
00880           filtered_im[y*columns+x] = result;
00881         }
00882       }
00883     )
00884 
00885     STRINGIFY(
00886       /*
00887       Reduce image noise and reduce detail levels by line
00888       im: input pixels filtered_in  filtered_im: output pixels
00889       filter : convolve kernel  width: convolve kernel size
00890       channel : define which channel is blured\
00891       is_RGBA_BGRA : define the input is RGBA or BGRA
00892       */
00893       __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
00894                                 const ChannelType channel, __constant float *filter,
00895                                 const unsigned int width, 
00896                                 const unsigned int imageColumns, const unsigned int imageRows,
00897                                 __local float4 *temp)
00898       {
00899         const int x = get_global_id(0);  
00900         const int y = get_global_id(1);
00901 
00902         //const int columns = get_global_size(0);
00903         //const int rows = get_global_size(1);  
00904         const int columns = imageColumns;  
00905         const int rows = imageRows;  
00906 
00907         unsigned int radius = (width-1)/2;
00908         const int wsize = get_local_size(1);  
00909         const unsigned int loadSize = wsize+width;
00910 
00911         //group coordinate
00912         const int groupX=get_local_size(0)*get_group_id(0);
00913         const int groupY=get_local_size(1)*get_group_id(1);
00914         //notice that get_local_size(0) is 1, so
00915         //groupX=get_group_id(0);
00916         
00917         //parallel load and clamp
00918         for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
00919         {
00920           temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
00921         }
00922         
00923         // barrier        
00924         barrier(CLK_LOCAL_MEM_FENCE);
00925 
00926         // only do the work if this is not a patched item
00927         if (get_global_id(1) < rows)
00928         {
00929           // compute
00930           float4 result = (float4) 0;
00931 
00932           int i = 0;
00933           
00934           \n #ifndef UFACTOR   \n 
00935           \n #define UFACTOR 8 \n 
00936           \n #endif                  \n 
00937           
00938           for ( ; i+UFACTOR < width; ) 
00939           {
00940             \n #pragma unroll UFACTOR \n
00941             for (int j=0; j < UFACTOR; j++, i++)
00942             {
00943               result+=filter[i]*temp[i+get_local_id(1)];
00944             }
00945           }
00946 
00947           for ( ; i < width; i++)
00948           {
00949             result+=filter[i]*temp[i+get_local_id(1)];
00950           }
00951 
00952           result.x = ClampToQuantum(result.x);
00953           result.y = ClampToQuantum(result.y);
00954           result.z = ClampToQuantum(result.z);
00955           result.w = ClampToQuantum(result.w);
00956 
00957           // write back to global
00958           filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
00959         }
00960 
00961       }
00962     )
00963 
00964 /*
00965 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00966 %                                                                             %
00967 %                                                                             %
00968 %                                                                             %
00969 %    C o m p o s i t e                                                        %
00970 %                                                                             %
00971 %                                                                             %
00972 %                                                                             %
00973 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
00974 */
00975 
00976   STRINGIFY(
00977     static inline float ColorDodge(const float Sca,
00978       const float Sa,const float Dca,const float Da)
00979     {
00980       /*
00981         Oct 2004 SVG specification.
00982       */
00983       if ((Sca*Da+Dca*Sa) >= Sa*Da)
00984         return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
00985       return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
00986 
00987 
00988       /*
00989         New specification, March 2009 SVG specification.  This specification was
00990         also wrong of non-overlap cases.
00991       */
00992       /*
00993       if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
00994         return(Sca*(1.0-Da));
00995       if (fabs(Sca-Sa) < MagickEpsilon)
00996         return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
00997       return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
00998       */
00999 
01000       /*
01001         Working from first principles using the original formula:
01002 
01003            f(Sc,Dc) = Dc/(1-Sc)
01004 
01005         This works correctly! Looks like the 2004 model was right but just
01006         required a extra condition for correct handling.
01007       */
01008 
01009       /*
01010       if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
01011         return(Sca*(1.0-Da)+Dca*(1.0-Sa));
01012       if (fabs(Sca-Sa) < MagickEpsilon)
01013         return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
01014       return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
01015       */
01016     }
01017 
01018     static inline void CompositeColorDodge(const float4 *p,
01019       const float4 *q,float4 *composite) {
01020 
01021       float 
01022       Da,
01023       gamma,
01024       Sa;
01025 
01026       Sa=1.0f-QuantumScale*getOpacityF4(*p);  /* simplify and speed up equations */
01027       Da=1.0f-QuantumScale*getOpacityF4(*q);
01028       gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
01029       setOpacityF4(composite, QuantumRange*(1.0-gamma));
01030       gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
01031       setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
01032         getRedF4(*q)*Da,Da));
01033       setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
01034         getGreenF4(*q)*Da,Da));
01035       setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
01036         getBlueF4(*q)*Da,Da));
01037     }
01038   )
01039 
01040   STRINGIFY(
01041     static inline void MagickPixelCompositePlus(const float4 *p,
01042       const float alpha,const float4 *q,
01043       const float beta,float4 *composite)
01044     {
01045       float 
01046         gamma;
01047 
01048       float
01049         Da,
01050         Sa;
01051       /*
01052         Add two pixels with the given opacities.
01053       */
01054       Sa=1.0-QuantumScale*alpha;
01055       Da=1.0-QuantumScale*beta;
01056       gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
01057       setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
01058       gamma=PerceptibleReciprocal(gamma);
01059       setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
01060       setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
01061       setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
01062     }
01063   )
01064 
01065   STRINGIFY(
01066     static inline void MagickPixelCompositeBlend(const float4 *p,
01067       const float alpha,const float4 *q,
01068       const float beta,float4 *composite)
01069     {
01070       MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
01071       (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
01072       (QuantumRange-getOpacityF4(*q))),composite);
01073     }
01074   )
01075   
01076   STRINGIFY(
01077     __kernel 
01078     void Composite(__global CLPixelType *image,
01079                    const unsigned int imageWidth, 
01080                    const unsigned int imageHeight,
01081                    const unsigned int imageMatte,
01082                    const __global CLPixelType *compositeImage,
01083                    const unsigned int compositeWidth, 
01084                    const unsigned int compositeHeight,
01085                    const unsigned int compositeMatte,
01086                    const unsigned int compose,
01087                    const ChannelType channel, 
01088                    const float destination_dissolve,
01089                    const float source_dissolve) {
01090 
01091       uint2 index;
01092       index.x = get_global_id(0);
01093       index.y = get_global_id(1);
01094 
01095 
01096       if (index.x >= imageWidth
01097         || index.y >= imageHeight) {
01098           return;
01099       }
01100       const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
01101       float4 destination;
01102       setRedF4(&destination,getRed(inputPixel));
01103       setGreenF4(&destination,getGreen(inputPixel));
01104       setBlueF4(&destination,getBlue(inputPixel));
01105 
01106       
01107       const CLPixelType compositePixel 
01108         = compositeImage[index.y*imageWidth+index.x];
01109       float4 source;
01110       setRedF4(&source,getRed(compositePixel));
01111       setGreenF4(&source,getGreen(compositePixel));
01112       setBlueF4(&source,getBlue(compositePixel));
01113 
01114       if (imageMatte != 0) {
01115         setOpacityF4(&destination,getOpacity(inputPixel));
01116       }
01117       else {
01118         setOpacityF4(&destination,0.0f);
01119       }
01120 
01121       if (compositeMatte != 0) {
01122         setOpacityF4(&source,getOpacity(compositePixel));
01123       }
01124       else {
01125         setOpacityF4(&source,0.0f);
01126       }
01127 
01128       float4 composite=destination;
01129 
01130       CompositeOperator op = (CompositeOperator)compose;
01131       switch (op) {
01132       case ColorDodgeCompositeOp:
01133         CompositeColorDodge(&source,&destination,&composite);
01134         break;
01135       case BlendCompositeOp:
01136         MagickPixelCompositeBlend(&source,source_dissolve,&destination,
01137             destination_dissolve,&composite);
01138         break;
01139       default:
01140         // unsupported operators
01141         break;
01142       };
01143 
01144       CLPixelType outputPixel;
01145       setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
01146       setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
01147       setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
01148       setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
01149       image[index.y*imageWidth+index.x] = outputPixel;
01150     }
01151   )
01152 
01153 /*
01154 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01155 %                                                                             %
01156 %                                                                             %
01157 %                                                                             %
01158 %    C o n t r a s t                                                          %
01159 %                                                                             %
01160 %                                                                             %
01161 %                                                                             %
01162 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01163 */
01164 
01165   STRINGIFY(
01166 
01167   static inline float3 ConvertRGBToHSB(CLPixelType pixel) {
01168     float3 HueSaturationBrightness;
01169     HueSaturationBrightness.x = 0.0f; // Hue
01170     HueSaturationBrightness.y = 0.0f; // Saturation
01171     HueSaturationBrightness.z = 0.0f; // Brightness
01172 
01173     float r=(float) getRed(pixel);
01174     float g=(float) getGreen(pixel);
01175     float b=(float) getBlue(pixel);
01176 
01177     float tmin=MagickMin(MagickMin(r,g),b);
01178     float tmax= MagickMax(MagickMax(r,g),b);
01179 
01180     if (tmax!=0.0f) {
01181       float delta=tmax-tmin;
01182       HueSaturationBrightness.y=delta/tmax;
01183       HueSaturationBrightness.z=QuantumScale*tmax;
01184 
01185       if (delta != 0.0f) {
01186   HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
01187   HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
01188         HueSaturationBrightness.x/=6.0f;
01189         HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
01190       }
01191     }
01192     return HueSaturationBrightness;
01193   }
01194 
01195   static inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
01196 
01197     float hue = HueSaturationBrightness.x;
01198     float brightness = HueSaturationBrightness.z;
01199     float saturation = HueSaturationBrightness.y;
01200    
01201     CLPixelType rgb;
01202 
01203     if (saturation == 0.0f) {
01204       setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
01205       setGreen(&rgb,getRed(rgb));
01206       setBlue(&rgb,getRed(rgb));
01207     }
01208     else {
01209 
01210       float h=6.0f*(hue-floor(hue));
01211       float f=h-floor(h);
01212       float p=brightness*(1.0f-saturation);
01213       float q=brightness*(1.0f-saturation*f);
01214       float t=brightness*(1.0f-(saturation*(1.0f-f)));
01215  
01216       float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
01217       float clamped_t = ClampToQuantum(QuantumRange*t);
01218       float clamped_p = ClampToQuantum(QuantumRange*p);
01219       float clamped_q = ClampToQuantum(QuantumRange*q);     
01220       int ih = (int)h;
01221       setRed(&rgb, (ih == 1)?clamped_q:
01222         (ih == 2 || ih == 3)?clamped_p:
01223         (ih == 4)?clamped_t:
01224                  clampedBrightness);
01225  
01226       setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
01227         (ih == 3)?clamped_q:
01228         (ih == 4 || ih == 5)?clamped_p:
01229                  clamped_t);
01230 
01231       setBlue(&rgb, (ih == 2)?clamped_t:
01232         (ih == 3 || ih == 4)?clampedBrightness:
01233         (ih == 5)?clamped_q:
01234                  clamped_p);
01235     }
01236     return rgb;
01237   }
01238 
01239   __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
01240   {
01241 
01242     const int sign = sharpen!=0?1:-1;
01243     const int x = get_global_id(0);  
01244     const int y = get_global_id(1);
01245     const int columns = get_global_size(0);
01246     const int c = x + y * columns;
01247 
01248     CLPixelType pixel = im[c];
01249     float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
01250     float brightness = HueSaturationBrightness.z;
01251     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
01252     brightness = clamp(brightness,0.0f,1.0f);
01253     HueSaturationBrightness.z = brightness;
01254 
01255     CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
01256     filteredPixel.w = pixel.w;
01257     im[c] = filteredPixel;
01258   }
01259   )
01260 
01261 /*
01262 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01263 %                                                                             %
01264 %                                                                             %
01265 %                                                                             %
01266 %    C o n t r a s t S t r e t c h                                            %
01267 %                                                                             %
01268 %                                                                             %
01269 %                                                                             %
01270 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01271 */
01272 
01273     STRINGIFY(
01274     /*
01275     */
01276     __kernel void Histogram(__global CLPixelType * restrict im,
01277       const ChannelType channel, 
01278       const int method,
01279       const int colorspace,
01280       __global uint4 * restrict histogram)
01281       {
01282         const int x = get_global_id(0);  
01283         const int y = get_global_id(1);  
01284         const int columns = get_global_size(0);  
01285         const int c = x + y * columns;
01286         if ((channel & SyncChannels) != 0)
01287         {
01288           float intensity = GetPixelIntensity(method, colorspace,im[c]);
01289           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
01290           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
01291         }
01292         else
01293         {
01294           // for equalizing, we always need all channels?
01295           // otherwise something more
01296         }
01297       }
01298     )
01299 
01300     STRINGIFY(
01301     /*
01302     */
01303     __kernel void ContrastStretch(__global CLPixelType * restrict im,
01304       const ChannelType channel,  
01305       __global CLPixelType * restrict stretch_map,
01306       const float4 white, const float4 black)
01307       {
01308         const int x = get_global_id(0);  
01309         const int y = get_global_id(1);  
01310         const int columns = get_global_size(0);  
01311         const int c = x + y * columns;
01312 
01313         uint ePos;
01314         CLPixelType oValue, eValue;
01315         CLQuantum red, green, blue, opacity;
01316 
01317         //read from global
01318         oValue=im[c];
01319 
01320         if ((channel & RedChannel) != 0)
01321         {
01322           if (getRedF4(white) != getRedF4(black))
01323           {
01324             ePos = ScaleQuantumToMap(getRed(oValue)); 
01325             eValue = stretch_map[ePos];
01326             red = getRed(eValue);
01327           }
01328         }
01329 
01330         if ((channel & GreenChannel) != 0)
01331         {
01332           if (getGreenF4(white) != getGreenF4(black))
01333           {
01334             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01335             eValue = stretch_map[ePos];
01336             green = getGreen(eValue);
01337           }
01338         }
01339 
01340         if ((channel & BlueChannel) != 0)
01341         {
01342           if (getBlueF4(white) != getBlueF4(black))
01343           {
01344             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01345             eValue = stretch_map[ePos];
01346             blue = getBlue(eValue);
01347           }
01348         }
01349 
01350         if ((channel & OpacityChannel) != 0)
01351         {
01352           if (getOpacityF4(white) != getOpacityF4(black))
01353           {
01354             ePos = ScaleQuantumToMap(getOpacity(oValue)); 
01355             eValue = stretch_map[ePos];
01356             opacity = getOpacity(eValue);
01357           }
01358         }
01359 
01360         //write back
01361         im[c]=(CLPixelType)(blue, green, red, opacity);
01362 
01363       }
01364     )
01365 
01366 /*
01367 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01368 %                                                                             %
01369 %                                                                             %
01370 %                                                                             %
01371 %    C o n v o l v e                                                          %
01372 %                                                                             %
01373 %                                                                             %
01374 %                                                                             %
01375 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01376 */
01377 
01378   STRINGIFY(
01379     __kernel 
01380     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
01381     const unsigned int imageWidth, const unsigned int imageHeight,
01382     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01383     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
01384 
01385       int2 blockID;
01386       blockID.x = get_group_id(0);
01387       blockID.y = get_group_id(1);
01388 
01389       // image area processed by this workgroup
01390       int2 imageAreaOrg;
01391       imageAreaOrg.x = blockID.x * get_local_size(0);
01392       imageAreaOrg.y = blockID.y * get_local_size(1);
01393 
01394       int2 midFilterDimen;
01395       midFilterDimen.x = (filterWidth-1)/2;
01396       midFilterDimen.y = (filterHeight-1)/2;
01397 
01398       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
01399 
01400       // dimension of the local cache
01401       int2 cachedAreaDimen;
01402       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
01403       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
01404 
01405       // cache the pixels accessed by this workgroup in local memory
01406       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
01407       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
01408       int groupSize = get_local_size(0) * get_local_size(1);
01409       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
01410 
01411         int2 cachedAreaIndex;
01412         cachedAreaIndex.x = i % cachedAreaDimen.x;
01413         cachedAreaIndex.y = i / cachedAreaDimen.x;
01414 
01415         int2 imagePixelIndex;
01416         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
01417 
01418         // only support EdgeVirtualPixelMethod through ClampToCanvas
01419         // TODO: implement other virtual pixel method
01420         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
01421         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
01422 
01423         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
01424       }
01425 
01426       // cache the filter
01427       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
01428         filterCache[i] = filter[i];
01429       }
01430       barrier(CLK_LOCAL_MEM_FENCE);
01431 
01432 
01433       int2 imageIndex;
01434       imageIndex.x = imageAreaOrg.x + get_local_id(0);
01435       imageIndex.y = imageAreaOrg.y + get_local_id(1);
01436 
01437       // if out-of-range, stops here and quit
01438       if (imageIndex.x >= imageWidth
01439         || imageIndex.y >= imageHeight) {
01440           return;
01441       }
01442 
01443       int filterIndex = 0;
01444       float4 sum = (float4)0.0f;
01445       float gamma = 0.0f;
01446       if (((channel & OpacityChannel) == 0) || (matte == 0)) {
01447         int cacheIndexY = get_local_id(1);
01448         for (int j = 0; j < filterHeight; j++) {
01449           int cacheIndexX = get_local_id(0);
01450           for (int i = 0; i < filterWidth; i++) {
01451             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01452             float f = filterCache[filterIndex];
01453 
01454             sum.x += f * p.x;
01455             sum.y += f * p.y;
01456             sum.z += f * p.z; 
01457             sum.w += f * p.w;
01458 
01459             gamma += f;
01460             filterIndex++;
01461             cacheIndexX++;
01462           }
01463           cacheIndexY++;
01464         }
01465       }
01466       else {
01467         int cacheIndexY = get_local_id(1);
01468         for (int j = 0; j < filterHeight; j++) {
01469           int cacheIndexX = get_local_id(0);
01470           for (int i = 0; i < filterWidth; i++) {
01471 
01472             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
01473             float alpha = QuantumScale*(QuantumRange-p.w);
01474             float f = filterCache[filterIndex];
01475             float g = alpha * f;
01476 
01477             sum.x += g*p.x;
01478             sum.y += g*p.y;
01479             sum.z += g*p.z;
01480             sum.w += f*p.w;
01481 
01482             gamma += g;
01483             filterIndex++;
01484             cacheIndexX++;
01485           }
01486           cacheIndexY++;
01487         }
01488         gamma = PerceptibleReciprocal(gamma);
01489         sum.xyz = gamma*sum.xyz;
01490       }
01491       CLPixelType outputPixel;
01492       outputPixel.x = ClampToQuantum(sum.x);
01493       outputPixel.y = ClampToQuantum(sum.y);
01494       outputPixel.z = ClampToQuantum(sum.z);
01495       outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01496 
01497       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01498     }
01499   )
01500 
01501   STRINGIFY(
01502     __kernel 
01503     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
01504                   const uint imageWidth, const uint imageHeight,
01505                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
01506                   const uint matte, const ChannelType channel) {
01507 
01508       int2 imageIndex;
01509       imageIndex.x = get_global_id(0);
01510       imageIndex.y = get_global_id(1);
01511 
01512       /*
01513       unsigned int imageWidth = get_global_size(0);
01514       unsigned int imageHeight = get_global_size(1);
01515       */
01516       if (imageIndex.x >= imageWidth
01517           || imageIndex.y >= imageHeight)
01518           return;
01519 
01520       int2 midFilterDimen;
01521       midFilterDimen.x = (filterWidth-1)/2;
01522       midFilterDimen.y = (filterHeight-1)/2;
01523 
01524       int filterIndex = 0;
01525       float4 sum = (float4)0.0f;
01526       float gamma = 0.0f;
01527       if (((channel & OpacityChannel) == 0) || (matte == 0)) {
01528         for (int j = 0; j < filterHeight; j++) {
01529           int2 inputPixelIndex;
01530           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01531           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01532           for (int i = 0; i < filterWidth; i++) {
01533             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01534             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01535         
01536             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01537             float f = filter[filterIndex];
01538 
01539             sum.x += f * p.x;
01540             sum.y += f * p.y;
01541             sum.z += f * p.z; 
01542             sum.w += f * p.w;
01543 
01544             gamma += f;
01545 
01546             filterIndex++;
01547           }
01548         }
01549       }
01550       else {
01551 
01552         for (int j = 0; j < filterHeight; j++) {
01553           int2 inputPixelIndex;
01554           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
01555           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
01556           for (int i = 0; i < filterWidth; i++) {
01557             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
01558             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
01559         
01560             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
01561             float alpha = QuantumScale*(QuantumRange-p.w);
01562             float f = filter[filterIndex];
01563             float g = alpha * f;
01564 
01565             sum.x += g*p.x;
01566             sum.y += g*p.y;
01567             sum.z += g*p.z;
01568             sum.w += f*p.w;
01569 
01570             gamma += g;
01571 
01572 
01573             filterIndex++;
01574           }
01575         }
01576         gamma = PerceptibleReciprocal(gamma);
01577         sum.xyz = gamma*sum.xyz;
01578       }
01579 
01580       CLPixelType outputPixel;
01581       outputPixel.x = ClampToQuantum(sum.x);
01582       outputPixel.y = ClampToQuantum(sum.y);
01583       outputPixel.z = ClampToQuantum(sum.z);
01584       outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
01585 
01586       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
01587     }
01588   )
01589 
01590 /*
01591 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01592 %                                                                             %
01593 %                                                                             %
01594 %                                                                             %
01595 %     D e s p e c k l e                                                       %
01596 %                                                                             %
01597 %                                                                             %
01598 %                                                                             %
01599 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01600 */
01601 
01602   STRINGIFY(
01603 
01604   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01605   , const unsigned int imageWidth, const unsigned int imageHeight
01606   , const int2 offset, const int polarity, const int matte) {
01607 
01608     int x = get_global_id(0);
01609     int y = get_global_id(1);
01610 
01611     CLPixelType v = inputImage[y*imageWidth+x];
01612 
01613     int2 neighbor;
01614     neighbor.y = y + offset.y;
01615     neighbor.x = x + offset.x;
01616 
01617     int2 clampedNeighbor;
01618     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01619     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01620 
01621     CLPixelType r = (clampedNeighbor.x == neighbor.x
01622                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01623     :(CLPixelType)0;
01624 
01625     int sv[4];
01626     sv[0] = (int)v.x;
01627     sv[1] = (int)v.y;
01628     sv[2] = (int)v.z;
01629     sv[3] = (int)v.w;
01630 
01631     int sr[4];
01632     sr[0] = (int)r.x;
01633     sr[1] = (int)r.y;
01634     sr[2] = (int)r.z;
01635     sr[3] = (int)r.w;
01636 
01637     if (polarity > 0) {
01638       \n #pragma unroll 4\n
01639       for (unsigned int i = 0; i < 4; i++) {
01640         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
01641       }
01642     }
01643     else {
01644       \n #pragma unroll 4\n
01645       for (unsigned int i = 0; i < 4; i++) {
01646         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
01647       }
01648 
01649     }
01650 
01651     v.x = (CLQuantum)sv[0];
01652     v.y = (CLQuantum)sv[1];
01653     v.z = (CLQuantum)sv[2];
01654 
01655     if (matte!=0)
01656       v.w = (CLQuantum)sv[3];
01657 
01658     outputImage[y*imageWidth+x] = v;
01659 
01660     }
01661 
01662 
01663   )
01664 
01665 
01666 
01667   STRINGIFY(
01668 
01669   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
01670   , const unsigned int imageWidth, const unsigned int imageHeight
01671   , const int2 offset, const int polarity, const int matte) {
01672 
01673     int x = get_global_id(0);
01674     int y = get_global_id(1);
01675 
01676     CLPixelType v = inputImage[y*imageWidth+x];
01677 
01678     int2 neighbor, clampedNeighbor;
01679 
01680     neighbor.y = y + offset.y;
01681     neighbor.x = x + offset.x;
01682     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01683     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01684 
01685     CLPixelType r = (clampedNeighbor.x == neighbor.x
01686       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01687     :(CLPixelType)0;
01688 
01689 
01690     neighbor.y = y - offset.y;
01691     neighbor.x = x - offset.x;
01692     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
01693     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
01694 
01695     CLPixelType s = (clampedNeighbor.x == neighbor.x
01696       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
01697     :(CLPixelType)0;
01698 
01699 
01700     int sv[4];
01701     sv[0] = (int)v.x;
01702     sv[1] = (int)v.y;
01703     sv[2] = (int)v.z;
01704     sv[3] = (int)v.w;
01705 
01706     int sr[4];
01707     sr[0] = (int)r.x;
01708     sr[1] = (int)r.y;
01709     sr[2] = (int)r.z;
01710     sr[3] = (int)r.w;
01711 
01712     int ss[4];
01713     ss[0] = (int)s.x;
01714     ss[1] = (int)s.y;
01715     ss[2] = (int)s.z;
01716     ss[3] = (int)s.w;
01717 
01718     if (polarity > 0) {
01719       \n #pragma unroll 4\n
01720       for (unsigned int i = 0; i < 4; i++) {
01721         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
01722         //
01723         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01724         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01725         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
01726       }
01727     }
01728     else {
01729       \n #pragma unroll 4\n
01730       for (unsigned int i = 0; i < 4; i++) {
01731         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
01732         //
01733         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01734         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
01735       }
01736     }
01737 
01738     v.x = (CLQuantum)sv[0];
01739     v.y = (CLQuantum)sv[1];
01740     v.z = (CLQuantum)sv[2];
01741 
01742     if (matte!=0)
01743       v.w = (CLQuantum)sv[3];
01744 
01745     outputImage[y*imageWidth+x] = v;
01746 
01747     }
01748 
01749 
01750   )
01751 
01752 /*
01753 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01754 %                                                                             %
01755 %                                                                             %
01756 %                                                                             %
01757 %     E q u a l i z e                                                         %
01758 %                                                                             %
01759 %                                                                             %
01760 %                                                                             %
01761 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01762 */
01763 
01764     STRINGIFY(
01765     /*
01766     */
01767     __kernel void Equalize(__global CLPixelType * restrict im,
01768       const ChannelType channel,  
01769       __global CLPixelType * restrict equalize_map,
01770       const float4 white, const float4 black)
01771       {
01772         const int x = get_global_id(0);  
01773         const int y = get_global_id(1);  
01774         const int columns = get_global_size(0);  
01775         const int c = x + y * columns;
01776 
01777         uint ePos;
01778         CLPixelType oValue, eValue;
01779         CLQuantum red, green, blue, opacity;
01780 
01781         //read from global
01782         oValue=im[c];
01783 
01784         if ((channel & SyncChannels) != 0)
01785         {
01786           if (getRedF4(white) != getRedF4(black))
01787           {
01788             ePos = ScaleQuantumToMap(getRed(oValue)); 
01789             eValue = equalize_map[ePos];
01790             red = getRed(eValue);
01791             ePos = ScaleQuantumToMap(getGreen(oValue)); 
01792             eValue = equalize_map[ePos];
01793             green = getRed(eValue);
01794             ePos = ScaleQuantumToMap(getBlue(oValue)); 
01795             eValue = equalize_map[ePos];
01796             blue = getRed(eValue);
01797             ePos = ScaleQuantumToMap(getOpacity(oValue)); 
01798             eValue = equalize_map[ePos];
01799             opacity = getRed(eValue);
01800  
01801             //write back
01802             im[c]=(CLPixelType)(blue, green, red, opacity);
01803           }
01804 
01805         }
01806 
01807         // for equalizing, we always need all channels?
01808         // otherwise something more
01809 
01810      }
01811     )
01812 
01813 /*
01814 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01815 %                                                                             %
01816 %                                                                             %
01817 %                                                                             %
01818 %     F u n c t i o n                                                         %
01819 %                                                                             %
01820 %                                                                             %
01821 %                                                                             %
01822 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01823 */
01824 
01825   STRINGIFY(
01826 
01827     /*
01828     apply FunctionImageChannel(braightness-contrast)
01829     */
01830     CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
01831         const unsigned int number_parameters,
01832         __constant float *parameters)
01833       {
01834         float4 result = (float4) 0.0f;
01835         switch (function)
01836         {
01837         case PolynomialFunction:
01838           {
01839             for (unsigned int i=0; i < number_parameters; i++)
01840               result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
01841             result *= (float4)QuantumRange;
01842             break;
01843           }
01844         case SinusoidFunction:
01845           {
01846             float  freq,phase,ampl,bias;
01847             freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01848             phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
01849             ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
01850             bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01851             result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
01852               (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
01853             result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
01854               (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
01855             result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
01856               (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
01857             result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
01858               (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
01859             break;
01860           }
01861         case ArcsinFunction:
01862           {
01863             float  width,range,center,bias;
01864             width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01865             center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01866             range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01867             bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01868 
01869             result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
01870             result.x = range/MagickPI*asin(result.x)+bias;
01871             result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
01872             result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
01873 
01874             result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
01875             result.y = range/MagickPI*asin(result.y)+bias;
01876             result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
01877             result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
01878 
01879             result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
01880             result.z = range/MagickPI*asin(result.z)+bias;
01881             result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
01882             result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
01883 
01884 
01885             result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
01886             result.w = range/MagickPI*asin(result.w)+bias;
01887             result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
01888             result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
01889 
01890             result *= (float4)QuantumRange;
01891             break;
01892           }
01893         case ArctanFunction:
01894           {
01895             float slope,range,center,bias;
01896             slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
01897             center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
01898             range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
01899             bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
01900             result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
01901             result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
01902             break;
01903           }
01904         case UndefinedFunction:
01905           break;
01906         }
01907         return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
01908           ClampToQuantum(result.z), ClampToQuantum(result.w));
01909       }
01910     )
01911 
01912     STRINGIFY(
01913     /*
01914     Improve brightness / contrast of the image
01915     channel : define which channel is improved
01916     function : the function called to enchance the brightness contrast
01917     number_parameters : numbers of parameters 
01918     parameters : the parameter
01919     */
01920     __kernel void ComputeFunction(__global CLPixelType *im,
01921                                         const ChannelType channel, const MagickFunction function,
01922                                         const unsigned int number_parameters, __constant float *parameters)
01923       {
01924         const int x = get_global_id(0);  
01925         const int y = get_global_id(1);  
01926         const int columns = get_global_size(0);  
01927         const int c = x + y * columns;
01928         im[c] = ApplyFunction(im[c], function, number_parameters, parameters); 
01929       }
01930     )
01931 
01932 /*
01933 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01934 %                                                                             %
01935 %                                                                             %
01936 %                                                                             %
01937 %     G r a y s c a l e                                                       %
01938 %                                                                             %
01939 %                                                                             %
01940 %                                                                             %
01941 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
01942 */
01943 
01944   STRINGIFY(
01945   __kernel void Grayscale(__global CLPixelType *im, 
01946     const int method, const int colorspace)
01947   {
01948 
01949     const int x = get_global_id(0);  
01950     const int y = get_global_id(1);
01951     const int columns = get_global_size(0);
01952     const int c = x + y * columns;
01953 
01954     CLPixelType pixel = im[c];
01955 
01956     float
01957         blue,
01958         green,
01959         intensity,
01960         red;
01961 
01962     red=(float)getRed(pixel);
01963     green=(float)getGreen(pixel);
01964     blue=(float)getBlue(pixel);
01965 
01966     intensity=0.0;
01967 
01968     CLPixelType filteredPixel;
01969  
01970     switch (method)
01971     {
01972       case AveragePixelIntensityMethod:
01973         {
01974           intensity=(red+green+blue)/3.0;
01975           break;
01976         }
01977       case BrightnessPixelIntensityMethod:
01978         {
01979           intensity=MagickMax(MagickMax(red,green),blue);
01980           break;
01981         }
01982       case LightnessPixelIntensityMethod:
01983         {
01984           intensity=(MagickMin(MagickMin(red,green),blue)+
01985               MagickMax(MagickMax(red,green),blue))/2.0;
01986           break;
01987         }
01988       case MSPixelIntensityMethod:
01989         {
01990           intensity=(float) (((float) red*red+green*green+
01991                 blue*blue)/(3.0*QuantumRange));
01992           break;
01993         }
01994       case Rec601LumaPixelIntensityMethod:
01995         {
01996           /*
01997           if (colorspace == RGBColorspace)
01998           {
01999             red=EncodePixelGamma(red);
02000             green=EncodePixelGamma(green);
02001             blue=EncodePixelGamma(blue);
02002           }
02003           */
02004           intensity=0.298839*red+0.586811*green+0.114350*blue;
02005           break;
02006         }
02007       case Rec601LuminancePixelIntensityMethod:
02008         {
02009           /*
02010           if (image->colorspace == sRGBColorspace)
02011           {
02012             red=DecodePixelGamma(red);
02013             green=DecodePixelGamma(green);
02014             blue=DecodePixelGamma(blue);
02015           }
02016           */
02017           intensity=0.298839*red+0.586811*green+0.114350*blue;
02018           break;
02019         }
02020       case Rec709LumaPixelIntensityMethod:
02021       default:
02022         {
02023           /*
02024           if (image->colorspace == RGBColorspace)
02025           {
02026             red=EncodePixelGamma(red);
02027             green=EncodePixelGamma(green);
02028             blue=EncodePixelGamma(blue);
02029           }
02030           */
02031           intensity=0.212656*red+0.715158*green+0.072186*blue;
02032           break;
02033         }
02034       case Rec709LuminancePixelIntensityMethod:
02035         {
02036           /*
02037           if (image->colorspace == sRGBColorspace)
02038           {
02039             red=DecodePixelGamma(red);
02040             green=DecodePixelGamma(green);
02041             blue=DecodePixelGamma(blue);
02042           }
02043           */
02044           intensity=0.212656*red+0.715158*green+0.072186*blue;
02045           break;
02046         }
02047       case RMSPixelIntensityMethod:
02048         {
02049           intensity=(float) (sqrt((float) red*red+green*green+
02050                 blue*blue)/sqrt(3.0));
02051           break;
02052         }
02053 
02054     }
02055 
02056     setGray(&filteredPixel, ClampToQuantum(intensity));
02057 
02058     filteredPixel.w = pixel.w;
02059 
02060     im[c] = filteredPixel;
02061   }
02062   )
02063 
02064 /*
02065 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02066 %                                                                             %
02067 %                                                                             %
02068 %                                                                             %
02069 %     L o c a l C o n t r a s t                                               %
02070 %                                                                             %
02071 %                                                                             %
02072 %                                                                             %
02073 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02074 */
02075 
02076     STRINGIFY(
02077       static inline int mirrorBottom(int value)
02078       {
02079           return (value < 0) ? - (value) : value;
02080       }
02081       static inline int mirrorTop(int value, int width)
02082       {
02083           return (value >= width) ? (2 * width - value - 1) : value;
02084       }
02085 
02086       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
02087           const int radius, 
02088           const int imageWidth,
02089           const int imageHeight)
02090       {
02091         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
02092 
02093         int x = get_local_id(0);
02094         int y = get_global_id(1);
02095 
02096         if ((x >= imageWidth) || (y >= imageHeight))
02097           return;
02098 
02099         global CLPixelType *src = srcImage + y * imageWidth;
02100 
02101         for (int i = x; i < imageWidth; i += get_local_size(0)) {
02102             float sum = 0.0f;
02103             float weight = 1.0f;
02104 
02105             int j = i - radius;
02106             while ((j + 7) < i) {
02107                 for (int k = 0; k < 8; ++k) // Unroll 8x
02108                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
02109                 weight += 8.0f;
02110                 j+=8;
02111             }
02112             while (j < i) {
02113                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
02114                 weight += 1.0f;
02115                 ++j;
02116             }
02117 
02118             while ((j + 7) < radius + i) {
02119                 for (int k = 0; k < 8; ++k) // Unroll 8x
02120                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
02121                 weight -= 8.0f;
02122                 j+=8;
02123             }
02124             while (j < radius + i) {
02125                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
02126                 weight -= 1.0f;
02127                 ++j;
02128             }
02129 
02130             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
02131         }
02132       }
02133     )
02134 
02135     STRINGIFY(
02136       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
02137           const int radius, 
02138           const float strength,
02139           const int imageWidth,
02140           const int imageHeight)
02141       {
02142         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
02143 
02144         int x = get_global_id(0);
02145         int y = get_global_id(1);
02146 
02147         if ((x >= imageWidth) || (y >= imageHeight))
02148                 return;
02149 
02150         global float *src = blurImage + x;
02151 
02152         float sum = 0.0f;
02153         float weight = 1.0f;
02154 
02155         int j = y - radius;
02156         while ((j + 7) < y) {
02157             for (int k = 0; k < 8; ++k) // Unroll 8x
02158                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
02159             weight += 8.0f;
02160             j+=8;
02161         }
02162         while (j < y) {
02163             sum += weight * src[mirrorBottom(j) * imageWidth];
02164             weight += 1.0f;
02165             ++j;
02166         }
02167 
02168         while ((j + 7) < radius + y) {
02169             for (int k = 0; k < 8; ++k) // Unroll 8x
02170                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
02171             weight -= 8.0f;
02172             j+=8;
02173         }
02174         while (j < radius + y) {
02175             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
02176             weight -= 1.0f;
02177             ++j;
02178         }
02179 
02180         CLPixelType pixel = srcImage[x + y * imageWidth];
02181         float srcVal = dot(RGB, convert_float4(pixel));
02182         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
02183         mult = (srcVal + mult) / srcVal;
02184 
02185         pixel.x = ClampToQuantum(pixel.x * mult);
02186         pixel.y = ClampToQuantum(pixel.y * mult);
02187         pixel.z = ClampToQuantum(pixel.z * mult);
02188 
02189         dstImage[x + y * imageWidth] = pixel;
02190       }
02191     )
02192 
02193 /*
02194 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02195 %                                                                             %
02196 %                                                                             %
02197 %                                                                             %
02198 %     M o d u l a t e                                                         %
02199 %                                                                             %
02200 %                                                                             %
02201 %                                                                             %
02202 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02203 */
02204 
02205   STRINGIFY(
02206 
02207   static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
02208     float *hue, float *saturation, float *lightness)
02209   {
02210   float
02211     c,
02212     tmax,
02213     tmin;
02214 
02215   /*
02216      Convert RGB to HSL colorspace.
02217      */
02218   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
02219   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
02220 
02221   c=tmax-tmin;
02222 
02223   *lightness=(tmax+tmin)/2.0;
02224   if (c <= 0.0)
02225   {
02226     *hue=0.0;
02227     *saturation=0.0;
02228     return;
02229   }
02230 
02231   if (tmax == (QuantumScale*red))
02232   {
02233     *hue=(QuantumScale*green-QuantumScale*blue)/c;
02234     if ((QuantumScale*green) < (QuantumScale*blue))
02235       *hue+=6.0;
02236   }
02237   else
02238     if (tmax == (QuantumScale*green))
02239       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
02240     else
02241       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
02242 
02243   *hue*=60.0/360.0;
02244   if (*lightness <= 0.5)
02245     *saturation=c/(2.0*(*lightness));
02246   else
02247     *saturation=c/(2.0-2.0*(*lightness));
02248   }
02249 
02250   static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
02251       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02252   {
02253     float
02254       b,
02255       c,
02256       g,
02257       h,
02258       tmin,
02259       r,
02260       x;
02261 
02262     /*
02263        Convert HSL to RGB colorspace.
02264        */
02265     h=hue*360.0;
02266     if (lightness <= 0.5)
02267       c=2.0*lightness*saturation;
02268     else
02269       c=(2.0-2.0*lightness)*saturation;
02270     tmin=lightness-0.5*c;
02271     h-=360.0*floor(h/360.0);
02272     h/=60.0;
02273     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
02274     switch ((int) floor(h) % 6)
02275     {
02276       case 0:
02277         {
02278           r=tmin+c;
02279           g=tmin+x;
02280           b=tmin;
02281           break;
02282         }
02283       case 1:
02284         {
02285           r=tmin+x;
02286           g=tmin+c;
02287           b=tmin;
02288           break;
02289         }
02290       case 2:
02291         {
02292           r=tmin;
02293           g=tmin+c;
02294           b=tmin+x;
02295           break;
02296         }
02297       case 3:
02298         {
02299           r=tmin;
02300           g=tmin+x;
02301           b=tmin+c;
02302           break;
02303         }
02304       case 4:
02305         {
02306           r=tmin+x;
02307           g=tmin;
02308           b=tmin+c;
02309           break;
02310         }
02311       case 5:
02312         {
02313           r=tmin+c;
02314           g=tmin;
02315           b=tmin+x;
02316           break;
02317         }
02318       default:
02319         {
02320           r=0.0;
02321           g=0.0;
02322           b=0.0;
02323         }
02324     }
02325     *red=ClampToQuantum(QuantumRange*r);
02326     *green=ClampToQuantum(QuantumRange*g);
02327     *blue=ClampToQuantum(QuantumRange*b);
02328   }
02329 
02330   static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 
02331     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
02332   {
02333     float
02334       hue,
02335       lightness,
02336       saturation;
02337 
02338     /*
02339     Increase or decrease color lightness, saturation, or hue.
02340     */
02341     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
02342     hue+=0.5*(0.01*percent_hue-1.0);
02343     while (hue < 0.0)
02344       hue+=1.0;
02345     while (hue >= 1.0)
02346       hue-=1.0;
02347     saturation*=0.01*percent_saturation;
02348     lightness*=0.01*percent_lightness;
02349     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
02350   }
02351 
02352   __kernel void Modulate(__global CLPixelType *im, 
02353     const float percent_brightness, 
02354     const float percent_hue, 
02355     const float percent_saturation, 
02356     const int colorspace)
02357   {
02358 
02359     const int x = get_global_id(0);  
02360     const int y = get_global_id(1);
02361     const int columns = get_global_size(0);
02362     const int c = x + y * columns;
02363 
02364     CLPixelType pixel = im[c];
02365 
02366     CLQuantum
02367         blue,
02368         green,
02369         red;
02370 
02371     red=getRed(pixel);
02372     green=getGreen(pixel);
02373     blue=getBlue(pixel);
02374 
02375     switch (colorspace)
02376     {
02377       case HSLColorspace:
02378       default:
02379         {
02380           ModulateHSL(percent_hue, percent_saturation, percent_brightness, 
02381               &red, &green, &blue);
02382         }
02383 
02384     }
02385 
02386     CLPixelType filteredPixel;
02387    
02388     setRed(&filteredPixel, red);
02389     setGreen(&filteredPixel, green);
02390     setBlue(&filteredPixel, blue);
02391     filteredPixel.w = pixel.w;
02392 
02393     im[c] = filteredPixel;
02394   }
02395   )
02396 
02397 /*
02398 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02399 %                                                                             %
02400 %                                                                             %
02401 %                                                                             %
02402 %     M o t i o n B l u r                                                     %
02403 %                                                                             %
02404 %                                                                             %
02405 %                                                                             %
02406 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02407 */
02408 
02409   STRINGIFY(
02410     __kernel 
02411     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
02412                     const unsigned int imageWidth, const unsigned int imageHeight,
02413                     const __global float *filter, const unsigned int width, const __global int2* offset,
02414                     const float4 bias,
02415                     const ChannelType channel, const unsigned int matte) {
02416 
02417       int2 currentPixel;
02418       currentPixel.x = get_global_id(0);
02419       currentPixel.y = get_global_id(1);
02420 
02421       if (currentPixel.x >= imageWidth
02422           || currentPixel.y >= imageHeight)
02423           return;
02424 
02425       float4 pixel;
02426       pixel.x = (float)bias.x;
02427       pixel.y = (float)bias.y;
02428       pixel.z = (float)bias.z;
02429       pixel.w = (float)bias.w;
02430 
02431       if (((channel & OpacityChannel) == 0) || (matte == 0)) {
02432         
02433         for (int i = 0; i < width; i++) {
02434           // only support EdgeVirtualPixelMethod through ClampToCanvas
02435           // TODO: implement other virtual pixel method
02436           int2 samplePixel = currentPixel + offset[i];
02437           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02438           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02439           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02440 
02441           pixel.x += (filter[i] * (float)samplePixelValue.x);
02442           pixel.y += (filter[i] * (float)samplePixelValue.y);
02443           pixel.z += (filter[i] * (float)samplePixelValue.z);
02444           pixel.w += (filter[i] * (float)samplePixelValue.w);
02445         }
02446 
02447         CLPixelType outputPixel;
02448         outputPixel.x = ClampToQuantum(pixel.x);
02449         outputPixel.y = ClampToQuantum(pixel.y);
02450         outputPixel.z = ClampToQuantum(pixel.z);
02451         outputPixel.w = ClampToQuantum(pixel.w);
02452         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02453       }
02454       else {
02455 
02456         float gamma = 0.0f;
02457         for (int i = 0; i < width; i++) {
02458           // only support EdgeVirtualPixelMethod through ClampToCanvas
02459           // TODO: implement other virtual pixel method
02460           int2 samplePixel = currentPixel + offset[i];
02461           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
02462           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
02463 
02464           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
02465 
02466           float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
02467           float k = filter[i];
02468           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
02469           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
02470           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
02471 
02472           pixel.w += k * alpha * samplePixelValue.w;
02473 
02474           gamma+=k*alpha;
02475         }
02476         gamma = PerceptibleReciprocal(gamma);
02477         pixel.xyz = gamma*pixel.xyz;
02478 
02479         CLPixelType outputPixel;
02480         outputPixel.x = ClampToQuantum(pixel.x);
02481         outputPixel.y = ClampToQuantum(pixel.y);
02482         outputPixel.z = ClampToQuantum(pixel.z);
02483         outputPixel.w = ClampToQuantum(pixel.w);
02484         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
02485       }
02486     }
02487   )
02488 
02489 /*
02490 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02491 %                                                                             %
02492 %                                                                             %
02493 %                                                                             %
02494 %     R a d i a l B l u r                                                     %
02495 %                                                                             %
02496 %                                                                             %
02497 %                                                                             %
02498 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02499 */
02500 
02501   STRINGIFY(
02502     __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
02503                               const float4 bias,
02504                               const unsigned int channel, const unsigned int matte,
02505                               const float2 blurCenter,
02506                               __constant float *cos_theta, __constant float *sin_theta, 
02507                               const unsigned int cossin_theta_size)
02508       {
02509         const int x = get_global_id(0);  
02510         const int y = get_global_id(1);
02511         const int columns = get_global_size(0);
02512         const int rows = get_global_size(1);  
02513         unsigned int step = 1;
02514         float center_x = (float) x - blurCenter.x;
02515         float center_y = (float) y - blurCenter.y;
02516         float radius = hypot(center_x, center_y);
02517         
02518         //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
02519         float blur_radius = hypot(blurCenter.x, blurCenter.y);
02520 
02521         if (radius > MagickEpsilon)
02522         {
02523           step = (unsigned int) (blur_radius / radius);
02524           if (step == 0)
02525             step = 1;
02526           if (step >= cossin_theta_size)
02527             step = cossin_theta_size-1;
02528         }
02529 
02530         float4 result;
02531         result.x = (float)bias.x;
02532         result.y = (float)bias.y;
02533         result.z = (float)bias.z;
02534         result.w = (float)bias.w;
02535         float normalize = 0.0f;
02536 
02537         if (((channel & OpacityChannel) == 0) || (matte == 0)) {
02538           for (unsigned int i=0; i<cossin_theta_size; i+=step)
02539           {
02540             result += convert_float4(im[
02541               ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
02542                 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
02543               normalize += 1.0f;
02544           }
02545           normalize = PerceptibleReciprocal(normalize);
02546           result = result * normalize;
02547         }
02548         else {
02549           float gamma = 0.0f;
02550           for (unsigned int i=0; i<cossin_theta_size; i+=step)
02551           {
02552             float4 p = convert_float4(im[
02553               ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+ 
02554                 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
02555             
02556             float alpha = (float)(QuantumScale*(QuantumRange-p.w));
02557             result.x += alpha * p.x;
02558             result.y += alpha * p.y;
02559             result.z += alpha * p.z;
02560             result.w += p.w;
02561             gamma+=alpha;
02562             normalize += 1.0f;
02563           }
02564           gamma = PerceptibleReciprocal(gamma);
02565           normalize = PerceptibleReciprocal(normalize);
02566           result.x = gamma*result.x;
02567           result.y = gamma*result.y;
02568           result.z = gamma*result.z;
02569           result.w = normalize*result.w;
02570         }
02571         filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
02572           ClampToQuantum(result.z), ClampToQuantum(result.w)); 
02573       }
02574   )
02575 
02576 /*
02577 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02578 %                                                                             %
02579 %                                                                             %
02580 %                                                                             %
02581 %     R e s i z e                                                             %
02582 %                                                                             %
02583 %                                                                             %
02584 %                                                                             %
02585 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
02586 */
02587 
02588   STRINGIFY(
02589   // Based on Box from resize.c
02590   float BoxResizeFilter(const float x)
02591   {
02592     return 1.0f;
02593   }
02594   )
02595     
02596   STRINGIFY(
02597   // Based on CubicBC from resize.c
02598   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
02599   {
02600     /*
02601     Cubic Filters using B,C determined values:
02602     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
02603     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
02604     Spline              B = 1   C = 0    B-Spline Gaussian approximation
02605     Hermite             B = 0   C = 0    B-Spline interpolator
02606 
02607     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
02608     Graphics Computer Graphics, Volume 22, Number 4, August 1988
02609     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
02610     Mitchell.pdf.
02611 
02612     Coefficents are determined from B,C values:
02613     P0 = (  6 - 2*B       )/6 = coeff[0]
02614     P1 =         0
02615     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
02616     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
02617     Q0 = (      8*B +24*C )/6 = coeff[3]
02618     Q1 = (    -12*B -48*C )/6 = coeff[4]
02619     Q2 = (      6*B +30*C )/6 = coeff[5]
02620     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
02621 
02622     which are used to define the filter:
02623 
02624     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
02625     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
02626 
02627     which ensures function is continuous in value and derivative (slope).
02628     */
02629     if (x < 1.0)
02630       return(resizeFilterCoefficients[0]+x*(x*
02631       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
02632     if (x < 2.0)
02633       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
02634       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
02635     return(0.0);
02636   }
02637   )
02638 
02639   STRINGIFY(
02640   float Sinc(const float x)
02641   {
02642     if (x != 0.0f)
02643     {
02644       const float alpha=(float) (MagickPI*x);
02645       return sinpi(x)/alpha;
02646     }
02647     return(1.0f);
02648   }
02649   )
02650 
02651   STRINGIFY(
02652   float Triangle(const float x)
02653   {
02654     /*
02655     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
02656     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
02657     for Sinc().
02658     */
02659     return ((x<1.0f)?(1.0f-x):0.0f);
02660   }
02661   )
02662 
02663 
02664   STRINGIFY(
02665   float Hanning(const float x)
02666   {
02667     /*
02668     Cosine window function:
02669       0.5+0.5*cos(pi*x).
02670     */
02671     const float cosine=cos((MagickPI*x));
02672     return(0.5f+0.5f*cosine);
02673   }
02674   )
02675 
02676   STRINGIFY(
02677   float Hamming(const float x)
02678   {
02679     /*
02680       Offset cosine window function:
02681        .54 + .46 cos(pi x).
02682     */
02683     const float cosine=cos((MagickPI*x));
02684     return(0.54f+0.46f*cosine);
02685   }
02686   )
02687 
02688   STRINGIFY(
02689   float Blackman(const float x)
02690   {
02691     /*
02692       Blackman: 2nd order cosine windowing function:
02693         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
02694 
02695       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
02696       five flops.
02697     */
02698     const float cosine=cos((MagickPI*x));
02699     return(0.34f+cosine*(0.5f+cosine*0.16f));
02700   }
02701   )
02702 
02703 
02704 
02705 
02706   STRINGIFY(
02707   static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
02708   {
02709     switch (filterType)
02710     {
02711     /* Call Sinc even for SincFast to get better precision on GPU 
02712        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
02713     case SincWeightingFunction:
02714     case SincFastWeightingFunction:
02715       return Sinc(x);
02716     case CubicBCWeightingFunction:
02717       return CubicBC(x,filterCoefficients);
02718     case BoxWeightingFunction:
02719       return BoxResizeFilter(x);
02720     case TriangleWeightingFunction:
02721       return Triangle(x);
02722     case HanningWeightingFunction:
02723       return Hanning(x);
02724     case HammingWeightingFunction:
02725       return Hamming(x);
02726     case BlackmanWeightingFunction:
02727       return Blackman(x);
02728 
02729     default:
02730       return 0.0f;
02731     }
02732   }
02733   )
02734 
02735 
02736   STRINGIFY(
02737   static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
02738            , const ResizeWeightingFunctionType resizeWindowType
02739            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
02740   {
02741     float scale;
02742     float xBlur = fabs(x/resizeFilterBlur);
02743     if (resizeWindowSupport < MagickEpsilon
02744         || resizeWindowType == BoxWeightingFunction)
02745     {
02746       scale = 1.0f;
02747     }
02748     else
02749     {
02750       scale = resizeFilterScale;
02751       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
02752     }
02753     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
02754     return weight;
02755   }
02756 
02757   )
02758 
02759   ;
02760   const char* accelerateKernels2 =
02761 
02762   STRINGIFY(
02763 
02764   static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02765     return (numWorkItems/pixelPerWorkgroup);
02766   }
02767 
02768   // returns the index of the pixel for the current workitem to compute.
02769   // returns -1 if this workitem doesn't need to participate in any computation
02770   static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
02771     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
02772     int pixelIndex = itemID/numWorkItemsPerPixel;
02773     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
02774     return pixelIndex;
02775   }
02776  
02777   )
02778 
02779   STRINGIFY(
02780  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
02781  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
02782   , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
02783   , const int resizeFilterType, const int resizeWindowType
02784   , const __global float* resizeFilterCubicCoefficients
02785   , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
02786   , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
02787   , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
02788 
02789 
02790     // calculate the range of resized image pixels computed by this workgroup
02791     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
02792     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
02793     const unsigned int actualNumPixelToCompute = stopX - startX;
02794 
02795     // calculate the range of input image pixels to cache
02796     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
02797     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02798     scale = PerceptibleReciprocal(scale);
02799 
02800     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
02801     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
02802 
02803     // cache the input pixels into local memory
02804     const unsigned int y = get_global_id(1);
02805     event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
02806     wait_group_events(1,&e);
02807 
02808     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02809     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02810     {
02811 
02812       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
02813       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
02814       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
02815 
02816       // determine which resized pixel computed by this workitem
02817       const unsigned int itemID = get_local_id(0);
02818       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
02819       
02820       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
02821 
02822       float4 filteredPixel = (float4)0.0f;
02823       float density = 0.0f;
02824       float gamma = 0.0f;
02825       // -1 means this workitem doesn't participate in the computation
02826       if (pixelIndex != -1) {
02827 
02828         // x coordinated of the resized pixel computed by this workitem
02829         const int x = chunkStartX + pixelIndex;
02830 
02831         // calculate how many steps required for this pixel
02832         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
02833         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
02834         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
02835         const unsigned int n = stop - start;
02836 
02837         // calculate how many steps this workitem will contribute
02838         unsigned int numStepsPerWorkItem = n / numItems;
02839         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
02840 
02841         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
02842         if (startStep < n) {
02843           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
02844 
02845           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
02846           if (matte == 0) {
02847 
02848             for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
02849               float4 cp = convert_float4(inputImageCache[cacheIndex]);
02850 
02851               float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
02852                 , (ResizeWeightingFunctionType)resizeWindowType
02853                 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
02854 
02855               filteredPixel += ((float4)weight)*cp;
02856               density+=weight;
02857             }
02858 
02859 
02860           }
02861           else {
02862             for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
02863               CLPixelType p = inputImageCache[cacheIndex];
02864 
02865               float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
02866                 , (ResizeWeightingFunctionType)resizeWindowType
02867                 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
02868 
02869               float alpha = weight * QuantumScale * GetPixelAlpha(p);
02870               float4 cp = convert_float4(p);
02871 
02872               filteredPixel.x += alpha * cp.x;
02873               filteredPixel.y += alpha * cp.y;
02874               filteredPixel.z += alpha * cp.z;
02875               filteredPixel.w += weight * cp.w;
02876 
02877               density+=weight;
02878               gamma+=alpha;
02879             }
02880           }
02881         }
02882       }
02883 
02884     // initialize the accumulators to zero
02885     if (itemID < actualNumPixelInThisChunk) {
02886       outputPixelCache[itemID] = (float4)0.0f;
02887       densityCache[itemID] = 0.0f;
02888       if (matte != 0)
02889         gammaCache[itemID] = 0.0f;
02890     }
02891     barrier(CLK_LOCAL_MEM_FENCE);
02892 
02893     // accumulatte the filtered pixel value and the density
02894     for (unsigned int i = 0; i < numItems; i++) {
02895       if (pixelIndex != -1) {
02896         if (itemID%numItems == i) {
02897           outputPixelCache[pixelIndex]+=filteredPixel;
02898           densityCache[pixelIndex]+=density;
02899           if (matte!=0) {
02900             gammaCache[pixelIndex]+=gamma;
02901           }
02902         }
02903       }
02904       barrier(CLK_LOCAL_MEM_FENCE);
02905     }
02906 
02907     if (itemID < actualNumPixelInThisChunk) {
02908       if (matte==0) {
02909         float density = densityCache[itemID];
02910         float4 filteredPixel = outputPixelCache[itemID];
02911         if (density!= 0.0f && density != 1.0)
02912         {
02913           density = PerceptibleReciprocal(density);
02914           filteredPixel *= (float4)density;
02915         }
02916         filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
02917                                                                        , ClampToQuantum(filteredPixel.y)
02918                                                                        , ClampToQuantum(filteredPixel.z)
02919                                                                        , ClampToQuantum(filteredPixel.w));
02920       }
02921       else {
02922         float density = densityCache[itemID];
02923         float gamma = gammaCache[itemID];
02924         float4 filteredPixel = outputPixelCache[itemID];
02925 
02926         if (density!= 0.0f && density != 1.0) {
02927           density = PerceptibleReciprocal(density);
02928           filteredPixel *= (float4)density;
02929           gamma *= density;
02930         }
02931         gamma = PerceptibleReciprocal(gamma);
02932 
02933         CLPixelType fp;
02934         fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
02935           , ClampToQuantum(gamma*filteredPixel.y)
02936           , ClampToQuantum(gamma*filteredPixel.z)
02937           , ClampToQuantum(filteredPixel.w));
02938 
02939         filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
02940 
02941       }
02942     }
02943 
02944     } // end of chunking loop
02945   }
02946   )
02947 
02948 
02949   STRINGIFY(
02950  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
02951  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
02952   , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
02953   , const int resizeFilterType, const int resizeWindowType
02954   , const __global float* resizeFilterCubicCoefficients
02955   , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
02956   , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
02957   , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
02958 
02959 
02960     // calculate the range of resized image pixels computed by this workgroup
02961     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
02962     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
02963     const unsigned int actualNumPixelToCompute = stopY - startY;
02964 
02965     // calculate the range of input image pixels to cache
02966     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
02967     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
02968     scale = PerceptibleReciprocal(scale);
02969 
02970     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
02971     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
02972 
02973     // cache the input pixels into local memory
02974     const unsigned int x = get_global_id(0);
02975     event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
02976     wait_group_events(1,&e);
02977 
02978     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
02979     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
02980     {
02981 
02982       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
02983       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
02984       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
02985 
02986       // determine which resized pixel computed by this workitem
02987       const unsigned int itemID = get_local_id(1);
02988       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
02989       
02990       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
02991 
02992       float4 filteredPixel = (float4)0.0f;
02993       float density = 0.0f;
02994       float gamma = 0.0f;
02995       // -1 means this workitem doesn't participate in the computation
02996       if (pixelIndex != -1) {
02997 
02998         // x coordinated of the resized pixel computed by this workitem
02999         const int y = chunkStartY + pixelIndex;
03000 
03001         // calculate how many steps required for this pixel
03002         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
03003         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
03004         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
03005         const unsigned int n = stop - start;
03006 
03007         // calculate how many steps this workitem will contribute
03008         unsigned int numStepsPerWorkItem = n / numItems;
03009         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
03010 
03011         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
03012         if (startStep < n) {
03013           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
03014 
03015           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
03016           if (matte == 0) {
03017 
03018             for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
03019               float4 cp = convert_float4(inputImageCache[cacheIndex]);
03020 
03021               float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
03022                 , (ResizeWeightingFunctionType)resizeWindowType
03023                 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
03024 
03025               filteredPixel += ((float4)weight)*cp;
03026               density+=weight;
03027             }
03028 
03029 
03030           }
03031           else {
03032             for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
03033               CLPixelType p = inputImageCache[cacheIndex];
03034 
03035               float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
03036                 , (ResizeWeightingFunctionType)resizeWindowType
03037                 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
03038 
03039               float alpha = weight * QuantumScale * GetPixelAlpha(p);
03040               float4 cp = convert_float4(p);
03041 
03042               filteredPixel.x += alpha * cp.x;
03043               filteredPixel.y += alpha * cp.y;
03044               filteredPixel.z += alpha * cp.z;
03045               filteredPixel.w += weight * cp.w;
03046 
03047               density+=weight;
03048               gamma+=alpha;
03049             }
03050          }
03051       }
03052     }
03053 
03054     // initialize the accumulators to zero
03055     if (itemID < actualNumPixelInThisChunk) {
03056       outputPixelCache[itemID] = (float4)0.0f;
03057       densityCache[itemID] = 0.0f;
03058       if (matte != 0)
03059         gammaCache[itemID] = 0.0f;
03060     }
03061     barrier(CLK_LOCAL_MEM_FENCE);
03062 
03063     // accumulatte the filtered pixel value and the density
03064     for (unsigned int i = 0; i < numItems; i++) {
03065       if (pixelIndex != -1) {
03066         if (itemID%numItems == i) {
03067           outputPixelCache[pixelIndex]+=filteredPixel;
03068           densityCache[pixelIndex]+=density;
03069           if (matte!=0) {
03070             gammaCache[pixelIndex]+=gamma;
03071           }
03072         }
03073       }
03074       barrier(CLK_LOCAL_MEM_FENCE);
03075     }
03076 
03077     if (itemID < actualNumPixelInThisChunk) {
03078       if (matte==0) {
03079         float density = densityCache[itemID];
03080         float4 filteredPixel = outputPixelCache[itemID];
03081         if (density!= 0.0f && density != 1.0)
03082         {
03083           density = PerceptibleReciprocal(density);
03084           filteredPixel *= (float4)density;
03085         }
03086         filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
03087                                                                        , ClampToQuantum(filteredPixel.y)
03088                                                                        , ClampToQuantum(filteredPixel.z)
03089                                                                        , ClampToQuantum(filteredPixel.w));
03090       }
03091       else {
03092         float density = densityCache[itemID];
03093         float gamma = gammaCache[itemID];
03094         float4 filteredPixel = outputPixelCache[itemID];
03095 
03096         if (density!= 0.0f && density != 1.0) {
03097           density = PerceptibleReciprocal(density);
03098           filteredPixel *= (float4)density;
03099           gamma *= density;
03100         }
03101         gamma = PerceptibleReciprocal(gamma);
03102 
03103         CLPixelType fp;
03104         fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
03105           , ClampToQuantum(gamma*filteredPixel.y)
03106           , ClampToQuantum(gamma*filteredPixel.z)
03107           , ClampToQuantum(filteredPixel.w));
03108 
03109         filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
03110 
03111       }
03112     }
03113 
03114     } // end of chunking loop
03115   }
03116   )
03117 
03118 /*
03119 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
03120 %                                                                             %
03121 %                                                                             %
03122 %                                                                             %
03123 %     U n s h a r p M a s k                                                   %
03124 %                                                                             %
03125 %                                                                             %
03126 %                                                                             %
03127 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
03128 */
03129 
03130     STRINGIFY(
03131     __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage, 
03132           const __global float4 *blurRowData, __global CLPixelType *filtered_im,
03133           const unsigned int imageColumns, const unsigned int imageRows, 
03134           __local float4* cachedData, __local float* cachedFilter,
03135           const ChannelType channel, const __global float *filter, const unsigned int width, 
03136           const float gain, const float threshold)
03137     {
03138       const unsigned int radius = (width-1)/2;
03139 
03140       // cache the pixel shared by the workgroup
03141       const int groupX = get_group_id(0);
03142       const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
03143       const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
03144 
03145       if (groupStartY >= 0
03146           && groupStopY < imageRows) {
03147         event_t e = async_work_group_strided_copy(cachedData
03148                                                 ,blurRowData+groupStartY*imageColumns+groupX
03149                                                 ,groupStopY-groupStartY,imageColumns,0);
03150         wait_group_events(1,&e);
03151       }
03152       else {
03153         for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
03154           cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
03155         }
03156         barrier(CLK_LOCAL_MEM_FENCE);
03157       }
03158       // cache the filter as well
03159       event_t e = async_work_group_copy(cachedFilter,filter,width,0);
03160       wait_group_events(1,&e);
03161 
03162       // only do the work if this is not a patched item
03163       //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
03164       const int cy = get_global_id(1);
03165 
03166       if (cy < imageRows) {
03167         float4 blurredPixel = (float4) 0.0f;
03168 
03169         int i = 0;
03170 
03171         \n #ifndef UFACTOR   \n 
03172           \n #define UFACTOR 8 \n 
03173           \n #endif                  \n 
03174 
03175           for ( ; i+UFACTOR < width; ) 
03176           {
03177             \n #pragma unroll UFACTOR \n
03178               for (int j=0; j < UFACTOR; j++, i++)
03179               {
03180                 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
03181               }
03182           }
03183 
03184         for ( ; i < width; i++)
03185         {
03186           blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
03187         }
03188 
03189         blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
03190                                       ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
03191 
03192         float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
03193         float4 outputPixel = inputImagePixel - blurredPixel;
03194 
03195         float quantumThreshold = QuantumRange*threshold;
03196 
03197         int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
03198         outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
03199 
03200         //write back
03201         filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
03202                                                             ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
03203 
03204       }
03205     }
03206     )
03207 
03208 
03209 
03210     STRINGIFY(
03211       __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
03212                          __constant float *filter,
03213                          const unsigned int width, 
03214                          const unsigned int imageColumns, const unsigned int imageRows,
03215                          __local float4 *pixels, 
03216                          const float gain, const float threshold, const unsigned int justBlur)
03217       {
03218         const int x = get_global_id(0);
03219         const int y = get_global_id(1);
03220 
03221         const unsigned int radius = (width - 1) / 2;
03222                                 
03223                 int row = y - radius;
03224                 int baseRow = get_group_id(1) * get_local_size(1) - radius;
03225                 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
03226                                 
03227                 while (row < endRow) {
03228                         int srcy =  (row < 0) ? -row : row;                     // mirror pad
03229                         srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
03230                                         
03231                         float4 value = 0.0f;
03232                                         
03233                         int ix = x - radius;
03234                         int i = 0;
03235 
03236                         while (i + 7 < width) {
03237                                 for (int j = 0; j < 8; ++j) {           // unrolled
03238                                         int srcx = ix + j;
03239                                         srcx = (srcx < 0) ? -srcx : srcx;
03240                                         srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
03241                                         value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
03242                                 }
03243                                 ix += 8;
03244                                 i += 8;
03245                         }
03246 
03247                         while (i < width) {
03248                                 int srcx = (ix < 0) ? -ix : ix;                 // mirror pad
03249                                 srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
03250                                 value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
03251                                 ++i;
03252                                 ++ix;
03253                         }       
03254                         pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
03255                         row += get_local_size(1);
03256                 }
03257                                 
03258                         
03259                 barrier(CLK_LOCAL_MEM_FENCE);
03260 
03261                                                 
03262                 const int px = get_local_id(0);
03263                 const int py = get_local_id(1);
03264                 const int prp = get_local_size(0);
03265                 float4 value = (float4)(0.0f);
03266                         
03267                 int i = 0;
03268                 while (i + 7 < width) {                 // unrolled
03269                         value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
03270                         value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
03271                         value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
03272                         value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
03273                         value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
03274                         value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
03275                         value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
03276                         value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
03277                         i += 8;
03278                 }
03279                 while (i < width) {
03280                         value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
03281                         ++i;
03282                 }
03283                 if ((x < imageColumns) && (y < imageRows)) {
03284                         if (justBlur == 0) {            // apply sharpening
03285                                 float4 srcPixel = convert_float4(im[x + y * imageColumns]);
03286                                 float4 diff = srcPixel - value;
03287 
03288                                 float quantumThreshold = QuantumRange*threshold;
03289 
03290                                 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
03291                                 value = select(srcPixel + diff * gain, srcPixel, mask);
03292                         }
03293                         filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
03294                 }
03295         }
03296         )
03297 
03298         STRINGIFY(
03299                 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
03300                                         const float threshold,
03301                                         const int passes,
03302                                         const int imageWidth,
03303                                         const int imageHeight)
03304         {
03305                 const int pad = (1 << (passes - 1));
03306                 const int tileSize = 64;
03307                 const int tileRowPixels = 64;
03308                 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
03309 
03310                 CLPixelType stage[16];
03311 
03312                 local float buffer[64 * 64];
03313 
03314                 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
03315                 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
03316 
03317                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03318                         stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
03319                 }
03320 
03321 
03322                 for (int channel = 0; channel < 3; ++channel) {
03323                         // Load LDS
03324                         switch (channel) {
03325                         case 0:
03326                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03327                                         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
03328                                 break;
03329                         case 1:
03330                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03331                                         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
03332                                 break;
03333                         case 2:
03334                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03335                                         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
03336                                 break;
03337                         }
03338 
03339 
03340                         // Process
03341 
03342                         float tmp[16];
03343                         float accum[16];
03344                         float pixel;
03345 
03346                         for (int pass = 0; pass < passes; ++pass) {
03347                                 const int radius = 1 << pass;
03348                                 const int x = get_local_id(0);
03349                                 const float thresh = threshold * noise[pass];
03350 
03351                                 if (pass == 0)
03352                                         accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
03353 
03354                                 // Snapshot input
03355 
03356                                 // Apply horizontal hat
03357                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03358                                         const int offset = i * tileRowPixels;
03359                                         if (pass == 0)
03360                                                 tmp[i / 4] = buffer[x + offset];                // snapshot input on first pass
03361                                         pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
03362                                         barrier(CLK_LOCAL_MEM_FENCE);
03363                                         buffer[x + offset] = pixel;
03364                                 }
03365                                 barrier(CLK_LOCAL_MEM_FENCE);
03366                                 // Apply vertical hat
03367                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03368                                         pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
03369                                         float delta = tmp[i / 4] - pixel;
03370                                         tmp[i / 4] = pixel;                                                     // hold output in tmp until all workitems are done
03371                                         if (delta < -thresh)
03372                                                 delta += thresh;
03373                                         else if (delta > thresh)
03374                                                 delta -= thresh;
03375                                         else
03376                                                 delta = 0;
03377                                         accum[i / 4] += delta;
03378 
03379                                 }
03380                                 barrier(CLK_LOCAL_MEM_FENCE);
03381                                 if (pass < passes - 1)
03382                                         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03383                                                 buffer[x + i * tileRowPixels] = tmp[i / 4];             // store lowpass for next pass
03384                                 else  // last pass
03385                                         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03386                                                 accum[i / 4] += tmp[i / 4];                                                     // add the lowpass signal back to output
03387                                 barrier(CLK_LOCAL_MEM_FENCE);
03388                         }
03389 
03390                         switch (channel) {
03391                         case 0:
03392                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03393                                         stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
03394                                 break;
03395                         case 1:
03396                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03397                                         stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
03398                                 break;
03399                         case 2:
03400                                 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
03401                                         stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
03402                                 break;
03403                         }
03404 
03405                         barrier(CLK_LOCAL_MEM_FENCE);
03406                 }
03407 
03408                 // Write from stage to output
03409 
03410                 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx  < imageWidth)) {
03411                         //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
03412                         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
03413                                 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
03414                                         dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
03415                                 }
03416                         }
03417                 }
03418         }
03419         )
03420   ;
03421 
03422 #endif // MAGICKCORE_OPENCL_SUPPORT
03423 
03424 #if defined(__cplusplus) || defined(c_plusplus)
03425 }
03426 #endif
03427 
03428 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H

Generated on 2 Sep 2019 for MagickCore by  doxygen 1.6.1