ExaFMM 1
Fast-multipole Method for exascale systems
|
00001 /* 00002 Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba 00003 00004 Permission is hereby granted, free of charge, to any person obtaining a copy 00005 of this software and associated documentation files (the "Software"), to deal 00006 in the Software without restriction, including without limitation the rights 00007 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00008 copies of the Software, and to permit persons to whom the Software is 00009 furnished to do so, subject to the following conditions: 00010 00011 The above copyright notice and this permission notice shall be included in 00012 all copies or substantial portions of the Software. 00013 00014 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00015 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00016 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00017 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00018 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00019 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00020 THE SOFTWARE. 00021 */ 00022 #ifndef kernel_h 00023 #define kernel_h 00024 #include "sort.h" 00025 #define ODDEVEN(n) ((((n) & 1) == 1) ? -1 : 1) 00026 00027 const int P2 = P * P; //!< P^2 00028 const int P4 = P2 * P2; //!< P^4 00029 00030 //! Unified CPU/GPU kernel class 00031 class KernelBase : public Sort { 00032 protected: 00033 vect X0; //!< Center of root cell 00034 real R0; //!< Radius of root cell 00035 C_iter Ci0; //!< icells.begin() 00036 C_iter Cj0; //!< jcells.begin() 00037 00038 int ATOMS; //!< Number of atom types in Van der Waals 00039 std::vector<real> RSCALE; //!< Scaling parameter for Van der Waals 00040 std::vector<real> GSCALE; //!< Scaling parameter for Van der Waals 00041 real KSIZE; //!< Number of waves in Ewald summation 00042 real ALPHA; //!< Scaling parameter for Ewald summation 00043 real SIGMA; //!< Scaling parameter for Ewald summation 00044 00045 std::vector<int> keysHost; //!< Offsets for rangeHost 00046 std::vector<int> rangeHost; //!< Offsets for sourceHost 00047 std::vector<gpureal> sourceHost; //!< Sources on host 00048 std::vector<gpureal> targetHost; //!< Targets on host 00049 std::vector<gpureal> constHost; //!< Constants on host 00050 Map sourceBegin; //!< Define map for offset of source cells 00051 Map sourceSize; //!< Define map for size of source cells 00052 Map targetBegin; //!< Define map for offset of target cells 00053 size_t keysDevcSize; //!< Size of offsets for rangeDevc 00054 size_t rangeDevcSize; //!< Size of offsets for sourceDevc 00055 size_t sourceDevcSize; //!< Size of sources on device 00056 size_t targetDevcSize; //!< Size of targets on device 00057 int *keysDevc; //!< Offsets for rangeDevc 00058 int *rangeDevc; //!< Offsets for sourceDevc 00059 gpureal *sourceDevc; //!< Sources on device 00060 gpureal *targetDevc; //!< Targets on device 00061 00062 real *factorial; //!< Factorial 00063 real *prefactor; //!< \f$ \sqrt{ \frac{(n - |m|)!}{(n + |m|)!} } \f$ 00064 real *Anm; //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$ 00065 complex *Cnm; //!< M2L translation matrix \f$ C_{jn}^{km} \f$ 00066 public: 00067 real NP2P; //!< Number of P2P kernel calls 00068 real NM2P; //!< Number of M2P kernel calls 00069 real NM2L; //!< Number of M2L kernel calls 00070 00071 protected: 00072 //! Evaluate solid harmonics \f$ r^n Y_{n}^{m} \f$ 00073 void evalMultipole(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const { 00074 const complex I(0.,1.); // Imaginary unit 00075 real x = std::cos(alpha); // x = cos(alpha) 00076 real y = std::sin(alpha); // y = sin(alpha) 00077 real fact = 1; // Initialize 2 * m + 1 00078 real pn = 1; // Initialize Legendre polynomial Pn 00079 real rhom = 1; // Initialize rho^m 00080 for( int m=0; m!=P; ++m ) { // Loop over m in Ynm 00081 complex eim = std::exp(I * real(m * beta)); // exp(i * m * beta) 00082 real p = pn; // Associated Legendre polynomial Pnm 00083 int npn = m * m + 2 * m; // Index of Ynm for m > 0 00084 int nmn = m * m; // Index of Ynm for m < 0 00085 Ynm[npn] = rhom * p * prefactor[npn] * eim; // rho^m * Ynm for m > 0 00086 Ynm[nmn] = std::conj(Ynm[npn]); // Use conjugate relation for m < 0 00087 real p1 = p; // Pnm-1 00088 p = x * (2 * m + 1) * p1; // Pnm using recurrence relation 00089 YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm 00090 rhom *= rho; // rho^m 00091 real rhon = rhom; // rho^n 00092 for( int n=m+1; n!=P; ++n ) { // Loop over n in Ynm 00093 int npm = n * n + n + m; // Index of Ynm for m > 0 00094 int nmm = n * n + n - m; // Index of Ynm for m < 0 00095 Ynm[npm] = rhon * p * prefactor[npm] * eim; // rho^n * Ynm 00096 Ynm[nmm] = std::conj(Ynm[npm]); // Use conjugate relation for m < 0 00097 real p2 = p1; // Pnm-2 00098 p1 = p; // Pnm-1 00099 p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);// Pnm using recurrence relation 00100 YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative 00101 rhon *= rho; // Update rho^n 00102 } // End loop over n in Ynm 00103 pn = -pn * fact * y; // Pn 00104 fact += 2; // 2 * m + 1 00105 } // End loop over m in Ynm 00106 } 00107 00108 //! Evaluate singular harmonics \f$ r^{-n-1} Y_n^m \f$ 00109 void evalLocal(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const { 00110 const complex I(0.,1.); // Imaginary unit 00111 real x = std::cos(alpha); // x = cos(alpha) 00112 real y = std::sin(alpha); // y = sin(alpha) 00113 real fact = 1; // Initialize 2 * m + 1 00114 real pn = 1; // Initialize Legendre polynomial Pn 00115 real rhom = 1.0 / rho; // Initialize rho^(-m-1) 00116 for( int m=0; m!=2*P; ++m ) { // Loop over m in Ynm 00117 complex eim = std::exp(I * real(m * beta)); // exp(i * m * beta) 00118 real p = pn; // Associated Legendre polynomial Pnm 00119 int npn = m * m + 2 * m; // Index of Ynm for m > 0 00120 int nmn = m * m; // Index of Ynm for m < 0 00121 Ynm[npn] = rhom * p * prefactor[npn] * eim; // rho^(-m-1) * Ynm for m > 0 00122 Ynm[nmn] = std::conj(Ynm[npn]); // Use conjugate relation for m < 0 00123 real p1 = p; // Pnm-1 00124 p = x * (2 * m + 1) * p1; // Pnm using recurrence relation 00125 YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm 00126 rhom /= rho; // rho^(-m-1) 00127 real rhon = rhom; // rho^(-n-1) 00128 for( int n=m+1; n!=2*P; ++n ) { // Loop over n in Ynm 00129 int npm = n * n + n + m; // Index of Ynm for m > 0 00130 int nmm = n * n + n - m; // Index of Ynm for m < 0 00131 Ynm[npm] = rhon * p * prefactor[npm] * eim; // rho^n * Ynm for m > 0 00132 Ynm[nmm] = std::conj(Ynm[npm]); // Use conjugate relation for m < 0 00133 real p2 = p1; // Pnm-2 00134 p1 = p; // Pnm-1 00135 p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);// Pnm using recurrence relation 00136 YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative 00137 rhon /= rho; // rho^(-n-1) 00138 } // End loop over n in Ynm 00139 pn = -pn * fact * y; // Pn 00140 fact += 2; // 2 * m + 1 00141 } // End loop over m in Ynm 00142 } 00143 00144 public: 00145 //! Constructor 00146 KernelBase() : X0(0), R0(0), keysDevcSize(0), rangeDevcSize(0), 00147 sourceDevcSize(0), targetDevcSize(0), 00148 NP2P(0), NM2P(0), NM2L(0) {} 00149 //! Destructor 00150 ~KernelBase() {} 00151 00152 //! Set center of root cell 00153 void setX0(vect x0) {X0 = x0;} 00154 //! Set radius of root cell 00155 void setR0(real r0) {R0 = r0;} 00156 00157 //! Get center of root cell 00158 vect getX0() const {return X0;} 00159 //! Get radius of root cell 00160 real getR0() const {return R0;} 00161 00162 //! Set center and size of root cell 00163 void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) { 00164 vect xmin,xmax; // Min,Max of domain 00165 B_iter B = bodies.begin(); // Reset body iterator 00166 xmin = xmax = B->X; // Initialize xmin,xmax 00167 for( B=bodies.begin(); B!=bodies.end(); ++B ) { // Loop over bodies 00168 for( int d=0; d!=3; ++d ) { // Loop over each dimension 00169 if (B->X[d] < xmin[d]) xmin[d] = B->X[d]; // Determine xmin 00170 else if(B->X[d] > xmax[d]) xmax[d] = B->X[d]; // Determine xmax 00171 } // End loop over each dimension 00172 } // End loop over bodies 00173 for( int d=0; d!=3; ++d ) { // Loop over each dimension 00174 X0[d] = (xmax[d] + xmin[d]) / 2; // Calculate center of domain 00175 X0[d] = int(X0[d]+.5); // Shift center to nearest integer 00176 R0 = std::max(xmax[d] - X0[d], R0); // Calculate max distance from center 00177 R0 = std::max(X0[d] - xmin[d], R0); // Calculate max distance from center 00178 } // End loop over each dimension 00179 if( IMAGES != 0 ) { // If periodic boundary condition 00180 if( X0[0]-R0 < x0[0]-r0 || x0[0]+r0 < X0[0]+R0 // Check for outliers in x direction 00181 || X0[1]-R0 < x0[1]-r0 || x0[1]+r0 < X0[1]+R0 // Check for outliers in y direction 00182 || X0[2]-R0 < x0[2]-r0 || x0[2]+r0 < X0[2]+R0 ) { // Check for outliers in z direction 00183 std::cout << "Error: Particles located outside periodic domain : " << std::endl;// Print error message 00184 std::cout << X0-R0 << std::endl; 00185 std::cout << X0+R0 << std::endl; 00186 } // End if for outlier checking 00187 X0 = x0; // Center is [0, 0, 0] 00188 R0 = r0; // Radius is r0 00189 } else { 00190 R0 += 1e-5; // Add some leeway to root radius 00191 } // Endif for periodic boundary condition 00192 } 00193 00194 //! Precalculate M2L translation matrix 00195 void preCalculation() { 00196 const complex I(0.,1.); // Imaginary unit 00197 factorial = new real [P]; // Factorial 00198 prefactor = new real [4*P2]; // sqrt( (n - |m|)! / (n + |m|)! ) 00199 Anm = new real [4*P2]; // (-1)^n / sqrt( (n + m)! / (n - m)! ) 00200 Cnm = new complex [P4]; // M2L translation matrix Cjknm 00201 00202 factorial[0] = 1; // Initialize factorial 00203 for( int n=1; n!=P; ++n ) { // Loop to P 00204 factorial[n] = factorial[n-1] * n; // n! 00205 } // End loop to P 00206 00207 for( int n=0; n!=2*P; ++n ) { // Loop over n in Anm 00208 for( int m=-n; m<=n; ++m ) { // Loop over m in Anm 00209 int nm = n*n+n+m; // Index of Anm 00210 int nabsm = abs(m); // |m| 00211 real fnmm = EPS; // Initialize (n - m)! 00212 for( int i=1; i<=n-m; ++i ) fnmm *= i; // (n - m)! 00213 real fnpm = EPS; // Initialize (n + m)! 00214 for( int i=1; i<=n+m; ++i ) fnpm *= i; // (n + m)! 00215 real fnma = 1.0; // Initialize (n - |m|)! 00216 for( int i=1; i<=n-nabsm; ++i ) fnma *= i; // (n - |m|)! 00217 real fnpa = 1.0; // Initialize (n + |m|)! 00218 for( int i=1; i<=n+nabsm; ++i ) fnpa *= i; // (n + |m|)! 00219 prefactor[nm] = std::sqrt(fnma/fnpa); // sqrt( (n - |m|)! / (n + |m|)! ) 00220 Anm[nm] = ODDEVEN(n)/std::sqrt(fnmm*fnpm); // (-1)^n / sqrt( (n + m)! / (n - m)! ) 00221 } // End loop over m in Anm 00222 } // End loop over n in Anm 00223 00224 for( int j=0, jk=0, jknm=0; j!=P; ++j ) { // Loop over j in Cjknm 00225 for( int k=-j; k<=j; ++k, ++jk ){ // Loop over k in Cjknm 00226 for( int n=0, nm=0; n!=P; ++n ) { // Loop over n in Cjknm 00227 for( int m=-n; m<=n; ++m, ++nm, ++jknm ) { // Loop over m in Cjknm 00228 const int jnkm = (j+n)*(j+n)+j+n+m-k; // Index C_{j+n}^{m-k} 00229 Cnm[jknm] = std::pow(I,real(abs(k-m)-abs(k)-abs(m)))// Cjknm 00230 * real(ODDEVEN(j)*Anm[nm]*Anm[jk]/Anm[jnkm]) * EPS; 00231 } // End loop over m in Cjknm 00232 } // End loop over n in Cjknm 00233 } // End loop over in k in Cjknm 00234 } // End loop over in j in Cjknm 00235 } 00236 00237 //! Free temporary allocations 00238 void postCalculation() { 00239 delete[] factorial; // Free factorial 00240 delete[] prefactor; // Free sqrt( (n - |m|)! / (n + |m|)! ) 00241 delete[] Anm; // Free (-1)^n / sqrt( (n + m)! / (n - m)! ) 00242 delete[] Cnm; // Free M2L translation matrix Cjknm 00243 } 00244 00245 //! Set paramters for Van der Waals 00246 void setVanDerWaals(int atoms, double *rscale, double *gscale) { 00247 assert(atoms <= 16); // Change GPU constant memory alloc if needed 00248 THETA = .1; // Force opening angle to be small 00249 ATOMS = atoms; // Set number of atom types 00250 RSCALE.resize(ATOMS*ATOMS); // Resize rscale vector 00251 GSCALE.resize(ATOMS*ATOMS); // Resize gscale vector 00252 for( int i=0; i!=ATOMS*ATOMS; ++i ) { // Loop over scale vector 00253 RSCALE[i] = rscale[i]; // Set rscale vector 00254 GSCALE[i] = gscale[i]; // Set gscale vector 00255 } // End loop over scale vector 00256 } 00257 00258 //! Set paramters for Ewald summation 00259 void setEwald(real ksize, real alpha, real sigma) { 00260 KSIZE = ksize; // Set number of waves 00261 ALPHA = alpha; // Set scaling parameter 00262 SIGMA = sigma; // Set scaling parameter 00263 } 00264 00265 }; 00266 00267 template<Equation equation> 00268 class Kernel : public KernelBase { 00269 public: 00270 void initialize(); //!< Initialize kernels 00271 void P2M(C_iter Ci) const; //!< Evaluate P2M kernel on CPU 00272 void M2M(C_iter Ci, C_iter Cj) const; //!< Evaluate M2M kernel on CPU 00273 void M2L(C_iter Ci, C_iter Cj) const; //!< Evaluate M2L kernel on CPU 00274 void M2P(C_iter Ci, C_iter Cj) const; //!< Evaluate M2P kernel on CPU 00275 void P2P(C_iter Ci, C_iter Cj) const; //!< Evaluate P2P kernel on CPU 00276 void L2L(C_iter Ci, C_iter Cj) const; //!< Evaluate L2L kernel on CPU 00277 void L2P(C_iter Ci) const; //!< Evaluate L2P kernel on CPU 00278 void EwaldReal(C_iter Ci, C_iter Cj) const; //!< Evaluate Ewald real part on CPU 00279 void EwaldWave(Bodies &bodies) const; //!< Evaluate Ewald wave part on CPU 00280 void P2M(); //!< Evaluate P2M kernel on GPU 00281 void M2M(); //!< Evaluate M2M kernel on GPU 00282 void M2L(); //!< Evaluate M2L kernel on GPU 00283 void M2P(); //!< Evaluate M2P kernel on GPU 00284 void P2P(); //!< Evalaute P2P kernel on GPU 00285 void L2L(); //!< Evaluate L2L kernel on GPU 00286 void L2P(); //!< Evaluate L2P kernel on GPU 00287 void EwaldReal(); //!< Evaluate Ewald real part on GPU 00288 void EwaldWave(); //!< Evalaute Ewald wave part on GPU 00289 void finalize(); //!< Finalize kernels 00290 00291 void allocate(); //!< Allocate GPU variables 00292 void hostToDevice(); //!< Copy from host to device 00293 void deviceToHost(); 00294 }; 00295 00296 #endif