ExaFMM: include/kernel.h Source File

ExaFMM 1
Fast-multipole Method for exascale systems
00001 /*
00002 Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
00003 
00004 Permission is hereby granted, free of charge, to any person obtaining a copy
00005 of this software and associated documentation files (the "Software"), to deal
00006 in the Software without restriction, including without limitation the rights
00007 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00008 copies of the Software, and to permit persons to whom the Software is
00009 furnished to do so, subject to the following conditions:
00010 
00011 The above copyright notice and this permission notice shall be included in
00012 all copies or substantial portions of the Software.
00013 
00014 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00015 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00016 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00017 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00018 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00019 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00020 THE SOFTWARE.
00021 */
00022 #ifndef kernel_h
00023 #define kernel_h
00024 #include "sort.h"
00025 #define ODDEVEN(n) ((((n) & 1) == 1) ? -1 : 1)
00026 
00027 const int  P2 = P * P;                                          //!< P^2
00028 const int  P4 = P2 * P2;                                        //!< P^4
00029 
00030 //! Unified CPU/GPU kernel class
00031 class KernelBase : public Sort {
00032 protected:
00033   vect                 X0;                                      //!< Center of root cell
00034   real                 R0;                                      //!< Radius of root cell
00035   C_iter               Ci0;                                     //!< icells.begin()
00036   C_iter               Cj0;                                     //!< jcells.begin()
00037 
00038   int                  ATOMS;                                   //!< Number of atom types in Van der Waals
00039   std::vector<real>    RSCALE;                                  //!< Scaling parameter for Van der Waals
00040   std::vector<real>    GSCALE;                                  //!< Scaling parameter for Van der Waals
00041   real                 KSIZE;                                   //!< Number of waves in Ewald summation
00042   real                 ALPHA;                                   //!< Scaling parameter for Ewald summation
00043   real                 SIGMA;                                   //!< Scaling parameter for Ewald summation
00044 
00045   std::vector<int>     keysHost;                                //!< Offsets for rangeHost
00046   std::vector<int>     rangeHost;                               //!< Offsets for sourceHost
00047   std::vector<gpureal> sourceHost;                              //!< Sources on host
00048   std::vector<gpureal> targetHost;                              //!< Targets on host
00049   std::vector<gpureal> constHost;                               //!< Constants on host
00050   Map                  sourceBegin;                             //!< Define map for offset of source cells
00051   Map                  sourceSize;                              //!< Define map for size of source cells
00052   Map                  targetBegin;                             //!< Define map for offset of target cells
00053   size_t               keysDevcSize;                            //!< Size of offsets for rangeDevc
00054   size_t               rangeDevcSize;                           //!< Size of offsets for sourceDevc
00055   size_t               sourceDevcSize;                          //!< Size of sources on device
00056   size_t               targetDevcSize;                          //!< Size of targets on device
00057   int                 *keysDevc;                                //!< Offsets for rangeDevc
00058   int                 *rangeDevc;                               //!< Offsets for sourceDevc
00059   gpureal             *sourceDevc;                              //!< Sources on device
00060   gpureal             *targetDevc;                              //!< Targets on device
00061 
00062   real *factorial;                                              //!< Factorial
00063   real *prefactor;                                              //!< \f$ \sqrt{ \frac{(n - |m|)!}{(n + |m|)!} } \f$
00064   real *Anm;                                                    //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
00065   complex *Cnm;                                                 //!< M2L translation matrix \f$ C_{jn}^{km} \f$
00066 public:
00067   real NP2P;                                                    //!< Number of P2P kernel calls
00068   real NM2P;                                                    //!< Number of M2P kernel calls
00069   real NM2L;                                                    //!< Number of M2L kernel calls
00070 
00071 protected:
00072 //! Evaluate solid harmonics \f$ r^n Y_{n}^{m} \f$
00073   void evalMultipole(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
00074     const complex I(0.,1.);                                     // Imaginary unit
00075     real x = std::cos(alpha);                                   // x = cos(alpha)
00076     real y = std::sin(alpha);                                   // y = sin(alpha)
00077     real fact = 1;                                              // Initialize 2 * m + 1
00078     real pn = 1;                                                // Initialize Legendre polynomial Pn
00079     real rhom = 1;                                              // Initialize rho^m
00080     for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
00081       complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
00082       real p = pn;                                              //  Associated Legendre polynomial Pnm
00083       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
00084       int nmn = m * m;                                          //  Index of Ynm for m < 0
00085       Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^m * Ynm for m > 0
00086       Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
00087       real p1 = p;                                              //  Pnm-1
00088       p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
00089       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
00090       rhom *= rho;                                              //  rho^m
00091       real rhon = rhom;                                         //  rho^n
00092       for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
00093         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
00094         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
00095         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm
00096         Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
00097         real p2 = p1;                                           //   Pnm-2
00098         p1 = p;                                                 //   Pnm-1
00099         p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
00100         YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
00101         rhon *= rho;                                            //   Update rho^n
00102       }                                                         //  End loop over n in Ynm
00103       pn = -pn * fact * y;                                      //  Pn
00104       fact += 2;                                                //  2 * m + 1
00105     }                                                           // End loop over m in Ynm
00106   }
00107 
00108 //! Evaluate singular harmonics \f$ r^{-n-1} Y_n^m \f$
00109   void evalLocal(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
00110     const complex I(0.,1.);                                     // Imaginary unit
00111     real x = std::cos(alpha);                                   // x = cos(alpha)
00112     real y = std::sin(alpha);                                   // y = sin(alpha)
00113     real fact = 1;                                              // Initialize 2 * m + 1
00114     real pn = 1;                                                // Initialize Legendre polynomial Pn
00115     real rhom = 1.0 / rho;                                      // Initialize rho^(-m-1)
00116     for( int m=0; m!=2*P; ++m ) {                               // Loop over m in Ynm
00117       complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
00118       real p = pn;                                              //  Associated Legendre polynomial Pnm
00119       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
00120       int nmn = m * m;                                          //  Index of Ynm for m < 0
00121       Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^(-m-1) * Ynm for m > 0
00122       Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
00123       real p1 = p;                                              //  Pnm-1
00124       p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
00125       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
00126       rhom /= rho;                                              //  rho^(-m-1)
00127       real rhon = rhom;                                         //  rho^(-n-1)
00128       for( int n=m+1; n!=2*P; ++n ) {                           //  Loop over n in Ynm
00129         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
00130         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
00131         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm for m > 0
00132         Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
00133         real p2 = p1;                                           //   Pnm-2
00134         p1 = p;                                                 //   Pnm-1
00135         p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
00136         YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
00137         rhon /= rho;                                            //   rho^(-n-1)
00138       }                                                         //  End loop over n in Ynm
00139       pn = -pn * fact * y;                                      //  Pn
00140       fact += 2;                                                //  2 * m + 1
00141     }                                                           // End loop over m in Ynm
00142   }
00143 
00144 public:
00145 //! Constructor
00146   KernelBase() : X0(0), R0(0), keysDevcSize(0), rangeDevcSize(0),
00147                  sourceDevcSize(0), targetDevcSize(0),
00148                  NP2P(0), NM2P(0), NM2L(0) {}
00149 //! Destructor
00150   ~KernelBase() {}
00151 
00152 //! Set center of root cell
00153   void setX0(vect x0) {X0 = x0;}
00154 //! Set radius of root cell
00155   void setR0(real r0) {R0 = r0;}
00156 
00157 //! Get center of root cell
00158   vect getX0() const {return X0;}
00159 //! Get radius of root cell
00160   real getR0() const {return R0;}
00161 
00162 //! Set center and size of root cell
00163   void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
00164     vect xmin,xmax;                                             // Min,Max of domain
00165     B_iter B = bodies.begin();                                  // Reset body iterator
00166     xmin = xmax = B->X;                                         // Initialize xmin,xmax
00167     for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
00168       for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
00169         if     (B->X[d] < xmin[d]) xmin[d] = B->X[d];           //   Determine xmin
00170         else if(B->X[d] > xmax[d]) xmax[d] = B->X[d];           //   Determine xmax
00171       }                                                         //  End loop over each dimension
00172     }                                                           // End loop over bodies
00173     for( int d=0; d!=3; ++d ) {                                 // Loop over each dimension
00174       X0[d] = (xmax[d] + xmin[d]) / 2;                          // Calculate center of domain
00175       X0[d] = int(X0[d]+.5);                                    //  Shift center to nearest integer
00176       R0 = std::max(xmax[d] - X0[d], R0);                       //  Calculate max distance from center
00177       R0 = std::max(X0[d] - xmin[d], R0);                       //  Calculate max distance from center
00178     }                                                           // End loop over each dimension
00179     if( IMAGES != 0 ) {                                         // If periodic boundary condition
00180       if( X0[0]-R0 < x0[0]-r0 || x0[0]+r0 < X0[0]+R0            //  Check for outliers in x direction
00181        || X0[1]-R0 < x0[1]-r0 || x0[1]+r0 < X0[1]+R0            //  Check for outliers in y direction
00182        || X0[2]-R0 < x0[2]-r0 || x0[2]+r0 < X0[2]+R0 ) {        //  Check for outliers in z direction
00183         std::cout << "Error: Particles located outside periodic domain : " << std::endl;// Print error message
00184         std::cout << X0-R0 << std::endl;
00185         std::cout << X0+R0 << std::endl;
00186       }                                                         //  End if for outlier checking
00187       X0 = x0;                                                  //  Center is [0, 0, 0]
00188       R0 = r0;                                                  //  Radius is r0
00189     } else {
00190       R0 += 1e-5;                                               // Add some leeway to root radius
00191     }                                                           // Endif for periodic boundary condition
00192   }
00193 
00194 //! Precalculate M2L translation matrix
00195   void preCalculation() {
00196     const complex I(0.,1.);                                     // Imaginary unit
00197     factorial = new real  [P];                                  // Factorial
00198     prefactor = new real  [4*P2];                               // sqrt( (n - |m|)! / (n + |m|)! )
00199     Anm       = new real  [4*P2];                               // (-1)^n / sqrt( (n + m)! / (n - m)! )
00200     Cnm       = new complex [P4];                               // M2L translation matrix Cjknm
00201 
00202     factorial[0] = 1;                                           // Initialize factorial
00203     for( int n=1; n!=P; ++n ) {                                 // Loop to P
00204       factorial[n] = factorial[n-1] * n;                        //  n!
00205     }                                                           // End loop to P
00206 
00207     for( int n=0; n!=2*P; ++n ) {                               // Loop over n in Anm
00208       for( int m=-n; m<=n; ++m ) {                              //  Loop over m in Anm
00209         int nm = n*n+n+m;                                       //   Index of Anm
00210         int nabsm = abs(m);                                     //   |m|
00211         real fnmm = EPS;                                        //   Initialize (n - m)!
00212         for( int i=1; i<=n-m; ++i ) fnmm *= i;                  //   (n - m)!
00213         real fnpm = EPS;                                        //   Initialize (n + m)!
00214         for( int i=1; i<=n+m; ++i ) fnpm *= i;                  //   (n + m)!
00215         real fnma = 1.0;                                        //   Initialize (n - |m|)!
00216         for( int i=1; i<=n-nabsm; ++i ) fnma *= i;              //   (n - |m|)!
00217         real fnpa = 1.0;                                        //   Initialize (n + |m|)!
00218         for( int i=1; i<=n+nabsm; ++i ) fnpa *= i;              //   (n + |m|)!
00219         prefactor[nm] = std::sqrt(fnma/fnpa);                   //   sqrt( (n - |m|)! / (n + |m|)! )
00220         Anm[nm] = ODDEVEN(n)/std::sqrt(fnmm*fnpm);              //   (-1)^n / sqrt( (n + m)! / (n - m)! )
00221       }                                                         //  End loop over m in Anm
00222     }                                                           // End loop over n in Anm
00223 
00224     for( int j=0, jk=0, jknm=0; j!=P; ++j ) {                   // Loop over j in Cjknm
00225       for( int k=-j; k<=j; ++k, ++jk ){                         //  Loop over k in Cjknm
00226         for( int n=0, nm=0; n!=P; ++n ) {                       //   Loop over n in Cjknm
00227           for( int m=-n; m<=n; ++m, ++nm, ++jknm ) {            //    Loop over m in Cjknm
00228             const int jnkm = (j+n)*(j+n)+j+n+m-k;               //     Index C_{j+n}^{m-k}
00229             Cnm[jknm] = std::pow(I,real(abs(k-m)-abs(k)-abs(m)))//     Cjknm
00230                       * real(ODDEVEN(j)*Anm[nm]*Anm[jk]/Anm[jnkm]) * EPS;
00231           }                                                     //    End loop over m in Cjknm
00232         }                                                       //   End loop over n in Cjknm
00233       }                                                         //  End loop over in k in Cjknm
00234     }                                                           // End loop over in j in Cjknm
00235   }
00236 
00237 //! Free temporary allocations
00238   void postCalculation() {
00239     delete[] factorial;                                         // Free factorial
00240     delete[] prefactor;                                         // Free sqrt( (n - |m|)! / (n + |m|)! )
00241     delete[] Anm;                                               // Free (-1)^n / sqrt( (n + m)! / (n - m)! )
00242     delete[] Cnm;                                               // Free M2L translation matrix Cjknm
00243   }
00244 
00245 //! Set paramters for Van der Waals
00246   void setVanDerWaals(int atoms, double *rscale, double *gscale) {
00247     assert(atoms <= 16);                                        // Change GPU constant memory alloc if needed
00248     THETA = .1;                                                 // Force opening angle to be small
00249     ATOMS = atoms;                                              // Set number of atom types
00250     RSCALE.resize(ATOMS*ATOMS);                                 // Resize rscale vector
00251     GSCALE.resize(ATOMS*ATOMS);                                 // Resize gscale vector
00252     for( int i=0; i!=ATOMS*ATOMS; ++i ) {                       // Loop over scale vector
00253       RSCALE[i] = rscale[i];                                    //  Set rscale vector
00254       GSCALE[i] = gscale[i];                                    //  Set gscale vector
00255     }                                                           // End loop over scale vector
00256   }
00257 
00258 //! Set paramters for Ewald summation
00259   void setEwald(real ksize, real alpha, real sigma) {
00260     KSIZE = ksize;                                              // Set number of waves
00261     ALPHA = alpha;                                              // Set scaling parameter
00262     SIGMA = sigma;                                              // Set scaling parameter
00263   }
00264 
00265 };
00266 
00267 template<Equation equation>
00268 class Kernel : public KernelBase {
00269 public:
00270   void initialize();                                            //!< Initialize kernels
00271   void P2M(C_iter Ci) const;                                    //!< Evaluate P2M kernel on CPU
00272   void M2M(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2M kernel on CPU
00273   void M2L(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2L kernel on CPU
00274   void M2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2P kernel on CPU
00275   void P2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate P2P kernel on CPU
00276   void L2L(C_iter Ci, C_iter Cj) const;                         //!< Evaluate L2L kernel on CPU
00277   void L2P(C_iter Ci) const;                                    //!< Evaluate L2P kernel on CPU
00278   void EwaldReal(C_iter Ci, C_iter Cj) const;                   //!< Evaluate Ewald real part on CPU
00279   void EwaldWave(Bodies &bodies) const;                         //!< Evaluate Ewald wave part on CPU
00280   void P2M();                                                   //!< Evaluate P2M kernel on GPU
00281   void M2M();                                                   //!< Evaluate M2M kernel on GPU
00282   void M2L();                                                   //!< Evaluate M2L kernel on GPU
00283   void M2P();                                                   //!< Evaluate M2P kernel on GPU
00284   void P2P();                                                   //!< Evalaute P2P kernel on GPU
00285   void L2L();                                                   //!< Evaluate L2L kernel on GPU
00286   void L2P();                                                   //!< Evaluate L2P kernel on GPU
00287   void EwaldReal();                                             //!< Evaluate Ewald real part on GPU
00288   void EwaldWave();                                             //!< Evalaute Ewald wave part on GPU
00289   void finalize();                                              //!< Finalize kernels
00290 
00291   void allocate();                                              //!< Allocate GPU variables
00292   void hostToDevice();                                          //!< Copy from host to device
00293   void deviceToHost();
00294 };
00295 
00296 #endif