ExaFMM 1
Fast Multipole Method for exascale systems
include/partition.h
00001 /*
00002 Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
00003 
00004 Permission is hereby granted, free of charge, to any person obtaining a copy
00005 of this software and associated documentation files (the "Software"), to deal
00006 in the Software without restriction, including without limitation the rights
00007 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00008 copies of the Software, and to permit persons to whom the Software is
00009 furnished to do so, subject to the following conditions:
00010 
00011 The above copyright notice and this permission notice shall be included in
00012 all copies or substantial portions of the Software.
00013 
00014 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00015 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00016 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00017 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00018 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00019 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00020 THE SOFTWARE.
00021 */
00022 #ifndef partition_h
00023 #define partition_h
00024 #include "mympi.h"
00025 #include "serialfmm.h"
00026 
00027 //! Handles all the partitioning of domains
00028 template<Equation equation>
00029 class Partition : public MyMPI, public SerialFMM<equation> {
00030 private:
00031   int numCells1D;                                               //!< Number of cells in one dimension (leaf level)
00032 
00033 protected:
00034   int LEVEL;                                                    //!< Level of the MPI process binary tree
00035   std::vector<vect> XMIN;                                       //!< Minimum position vector of bodies
00036   std::vector<vect> XMAX;                                       //!< Maximum position vector of bodies
00037   int nprocs[64][2];                                            //!< Number of processes in the two split groups
00038   int offset[64][2];                                            //!< Rank offset of the two split groups
00039   int  color[64][3];                                            //!< Color of hypercube communicators
00040   int    key[64][3];                                            //!< Key of hypercube communicators
00041   MPI_Comm MPI_COMM[64][3];                                     //!< Hypercube communicators
00042 
00043 public:
00044   using Kernel<equation>::printNow;                             //!< Switch to print timings
00045   using Kernel<equation>::startTimer;                           //!< Start timer for given event
00046   using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
00047   using Kernel<equation>::sortBodies;                           //!< Sort bodies according to cell index
00048   using Kernel<equation>::X0;                                   //!< Center of root cell
00049   using Kernel<equation>::R0;                                   //!< Radius of root cell
00050   using TreeStructure<equation>::buffer;                        //!< Buffer for MPI communication & sorting
00051   using TreeStructure<equation>::getLevel;                      //!< Get level from cell index
00052   using BottomUp<equation>::getMaxLevel;                        //!< Max level for bottom up tree build
00053 
00054 private:
00055 //! Split domain according to iSplit
00056   void splitDomain(bigint iSplit, int l, int d) {
00057     real X = (iSplit + 1) * (XMAX[l][d] - XMIN[l][d]) / numCells1D + XMIN[l][d];// Coordinate corresponding to iSplit
00058     XMAX[l+1] = XMAX[l];                                        // Set XMAX for next subdivision
00059     XMIN[l+1] = XMIN[l];                                        // Set XMIN for next subdivision
00060     if( color[l+1][0] % 2 == 0 ) {                              // If on left side
00061       XMAX[l+1][d] = X;                                         //  Set max to X
00062     } else {                                                    // If on right side
00063       XMIN[l+1][d] = X;                                         //  Set min to X
00064     }                                                           // Endif for sides
00065   }
00066 
00067 //! Get global bucket data for parallel nth_element
00068   template<typename T>
00069   int getBucket(T &data, int numData, int lOffset, Bigints &send, Bigints &recv, MPI_Comm MPI_COMM0) {
00070     int maxBucket = send.size();                                // Maximum number of buckets
00071     int numBucket;                                              // Number of buckets
00072     int numSample = std::min(maxBucket/MPISIZES,numData);       // Number of local samples
00073     MPI_Datatype MPI_TYPE = getType(data[0].ICELL);             // Get MPI data type
00074     int *rcnt = new int [MPISIZES];                             // MPI recv count
00075     int *rdsp = new int [MPISIZES];                             // MPI recv displacement
00076     for( int i=0; i!=numSample; ++i ) {                         // Loop over local samples
00077       int stride = numData/numSample;                           //  Sampling stride
00078       send[i] = data[lOffset + i * stride].ICELL;               //  Put sampled data in send buffer
00079     }                                                           // End loop over samples
00080     MPI_Gather(&numSample,1,MPI_INT,                            // Gather size of sample data to rank 0
00081                rcnt,      1,MPI_INT,
00082                0,MPI_COMM0);
00083     if( MPIRANKS == 0 ) {                                       // Only rank 0 operates on gathered info
00084       numBucket = 0;                                            //  Initialize number of buckets
00085       for( int irank=0; irank!=MPISIZES; ++irank ) {            //  Loop over processes
00086         rdsp[irank] = numBucket;                                //   Calculate recv displacement
00087         numBucket += rcnt[irank];                               //   Accumulate recv count to get number of buckets
00088       }                                                         //  End loop over processes
00089       recv.resize(numBucket);                                   //  Resize recv so that end() is valid
00090     }                                                           // Endif for rank 0
00091     MPI_Gatherv(&send[0],numSample,MPI_TYPE,                    // Gather sample data to rank 0
00092                 &recv[0],rcnt,rdsp,MPI_TYPE,
00093                 0,MPI_COMM0);
00094     if( MPIRANKS == 0 ) {                                       // Only rank 0 operates on gathered info
00095       std::sort(recv.begin(),recv.end());                       //  Sort the bucket data
00096       numBucket = std::unique(recv.begin(),recv.end()) - recv.begin();// Remove duplicate bucket data
00097       recv.resize(numBucket);                                   //  Resize recv again
00098     }                                                           // Endif for rank 0
00099     MPI_Bcast(&numBucket,1,MPI_INT,0,MPI_COMM0);                // Broadcast number of buckets
00100     MPI_Bcast(&recv[0],numBucket,MPI_TYPE,0,MPI_COMM0);         // Broadcast bucket data
00101     delete[] rcnt;                                              // Delete recv count
00102     delete[] rdsp;                                              // Delete recv displacement
00103     return numBucket;                                           // Return number of buckets
00104   }
00105 
00106 protected:
00107 //! Split the MPI communicator into N-D hypercube
00108   void bisectionGetComm(int l) {
00109     for( int i=1; i>=0; --i ) {                                 // Loop over 1 and 0
00110       int pSplit = (nprocs[l][0] + i) / 2;                      //  Splitting criteria
00111       if( MPIRANK - offset[l][0] < pSplit ) {                   //  If on left side
00112         nprocs[l+1][i] = pSplit;                                //   Group size is the splitting criteria
00113         offset[l+1][i] = offset[l][0];                          //   Offset is the same as previous
00114          color[l+1][i] =  color[l][0] * 2;                      //   Color is twice the previous
00115       } else {                                                  //  If on right side
00116         nprocs[l+1][i] = nprocs[l][0] - pSplit;                 //   Group size is what remains from the left side
00117         offset[l+1][i] = offset[l][0] + pSplit;                 //   Offset is incremented by splitting criteria
00118          color[l+1][i] =  color[l][0] * 2 + 1;                  //   Color is twice the previous plus one
00119       }                                                         //  Endif for sides
00120       key[l+1][i] = MPIRANK - offset[l+1][i];                   //  Key is determined from offset
00121     }                                                           // End loop over 1 and 0
00122     key[l+1][2] = color[l+1][1] % 2;                            // The third type of key is determined from color[1]
00123     color[l+1][2] = key[l+1][1] + color[l][0] * (1 << (LEVEL - l - 1));// The corresponding color is determined from key[1]
00124     for( int i=0; i!=3; ++i ) {                                 // Loop over the three types of colors and keys
00125       MPI_Comm_split(MPI_COMM_WORLD,color[l+1][i],key[l+1][i],&MPI_COMM[l+1][i]);// Split the MPI communicator
00126     }                                                           // End loop over three types
00127 #ifdef DEBUG
00128     print("level : ",0);                                        // Print identifier
00129     print(l+1,0);                                               // Print current level
00130     print("\n",0);                                              // New line
00131     print("key   : \n",0);                                      // Print identifier
00132     print(key[l+1],0,3);                                        // Print key
00133     print("color : \n",0);                                      // Print identifier
00134     print(color[l+1],0,3);                                      // Print color
00135 #endif
00136   }
00137 
00138 //! One-to-one MPI_Alltoallv
00139   void bisectionAlltoall(Bodies &bodies, int nthLocal, int numLocal, int &newSize, int l) {
00140     startTimer("Bi Alltoall  ");                                // Start timer
00141     const int bytes = sizeof(bodies[0]);                        // Byte size of body structure
00142     int scnt[2] = {nthLocal, numLocal - nthLocal};              // Set send count for left and right sides
00143     int rcnt[2] = {0, 0};                                       // Initialize recv count
00144     MPI_Alltoall(scnt,1,MPI_INT,rcnt,1,MPI_INT,MPI_COMM[l+1][2]);// Communicate the send count to get recv count
00145     int sdsp[2] = {0, scnt[0]};                                 // Send displacement
00146     int rdsp[2] = {0, rcnt[0]};                                 // Recv displacement
00147     newSize = rcnt[0] + rcnt[1];                                // Get new size from recv count
00148     if( color[l+1][0] != color[l+1][1] ) newSize = numLocal;    // Unless it's a leftover process of an odd group
00149     buffer.resize(newSize);                                     // Resize recv buffer
00150 
00151     for(int i=0; i!=2; ++i ) {                                  // Loop over 0 and 1
00152       scnt[i] *= bytes;                                         // Multiply send count by byte size of data
00153       sdsp[i] *= bytes;                                         // Multiply send displacement by byte size of data
00154       rcnt[i] *= bytes;                                         // Multiply recv count by byte size of data
00155       rdsp[i] *= bytes;                                         // Multiply recv displacement by byte size of data
00156     }                                                           // End loop over 0 and 1
00157     MPI_Alltoallv(&bodies[0],scnt,sdsp,MPI_BYTE,                // Communicate bodies
00158                   &buffer[0],rcnt,rdsp,MPI_BYTE,                // Using same buffer as sort buffer
00159                   MPI_COMM[l+1][2]);                            // MPI_COMM[2] is for the one-to-one pair
00160     if( color[l+1][0] == color[l+1][1] ) bodies = buffer;       // Don't update if leftover process
00161     buffer.resize(bodies.size());                               // Resize sort buffer
00162     stopTimer("Bi Alltoall  ",printNow);                        // Stop timer 
00163     sortBodies(bodies,buffer);                                  // Sort bodies in ascending order
00164   }
00165 
00166 //! Scattering from leftover processes
00167   void bisectionScatter(Bodies &bodies, int nthLocal, int &newSize, int l) {
00168     startTimer("Bi Scatter   ");                                // Start timer
00169     const int bytes = sizeof(bodies[0]);                        // Byte size of body structure
00170     int numScatter = nprocs[l+1][1] - 1;                        // Number of processes to scatter to
00171     int oldSize = newSize;                                      // Size of recv buffer before communication
00172     int *scnt = new int [nprocs[l+1][1]];                       // Send count
00173     int *sdsp = new int [nprocs[l+1][1]];                       // Send displacement
00174     int rcnt;                                                   // Recv count
00175     if( key[l+1][1] == numScatter ) {                           // If this is the leftover proc to scatter from
00176       sdsp[0] = 0;                                              //  Initialize send displacement
00177       for(int i=0; i!=numScatter; ++i ) {                       //  Loop over processes to send to
00178         int begin = 0, end = nthLocal;                          //  Set begin and end of range to send
00179         splitRange(begin,end,i,numScatter);                     //  Split range into numScatter and get i-th range
00180         scnt[i] = end - begin;                                  //  Set send count based on range
00181         sdsp[i+1] = sdsp[i] + scnt[i];                          //  Set send displacement based on send count
00182       }                                                         // End loop over processes to send to
00183       scnt[numScatter] = 0;                                     // Send count to self should be 0
00184       oldSize = 0;                                              // Reset oldSize to account for sent data
00185       newSize -= sdsp[numScatter];                              // Set newSize to account for sent data
00186       buffer.erase(buffer.begin(),buffer.begin()+sdsp[numScatter]);// Erase from recv buffer the part that was sent
00187     }                                                           // Endif for leftover proc
00188     MPI_Scatter(scnt,1,MPI_INT,&rcnt,1,MPI_INT,numScatter,MPI_COMM[l+1][1]);// Scatter the send count to get recv count
00189     if( key[l+1][1] != numScatter ) {                           // If it's one of the receiving procs
00190       newSize += rcnt;                                          //  Increment newSize by the recv count
00191       buffer.resize(newSize);                                   //  Resize the recv buffer
00192     }                                                           // Endif for receiving procs
00193     for(int i=0; i!= nprocs[l+1][1]; ++i ) {                    // Loop over group of processes
00194       scnt[i] *= bytes;                                         //  Multiply send count by byte size of data
00195       sdsp[i] *= bytes;                                         //  Multiply send displacement by byte size of data
00196     }                                                           // End loop over group of processes
00197     rcnt *= bytes;                                              // Multiply recv count by byte size of data
00198     MPI_Scatterv(&bodies[0],      scnt,sdsp,MPI_BYTE,           // Communicate bodies via MPI_Scatterv
00199                  &buffer[oldSize],rcnt,     MPI_BYTE,           // Offset recv buffer by oldSize
00200                  numScatter,MPI_COMM[l+1][1]);                  // MPI_COMM[1] is used for scatter
00201     bodies = buffer;                                            // Copy recv buffer to bodies
00202     if( key[l+1][1] != numScatter ) sortBodies(bodies,buffer);  // Sort bodies in ascending order
00203     delete[] scnt;                                              // Delete send count
00204     delete[] sdsp;                                              // Delete send displacement
00205     stopTimer("Bi Scatter   ",printNow);                        // Stop timer 
00206   }
00207 
00208 //! Gathering to leftover processes
00209   void bisectionGather(Bodies &bodies, int nthLocal, int numLocal, int &newSize, int l) {
00210     startTimer("Bi Gather    ");                                // Start timer
00211     const int bytes = sizeof(bodies[0]);                        // Byte size of body structure
00212     int numGather = nprocs[l+1][0] - 1;                         // Number of processes to gather to
00213     int oldSize = newSize;                                      // Size of recv buffer before communication
00214     int scnt = 0;                                               // Send count (leftover proc sends nothing)
00215     int *rcnt = new int [nprocs[l+1][0]];                       // Recv count
00216     int *rdsp = new int [nprocs[l+1][0]];                       // Recv displacement
00217     if( key[l+1][0] != 0 ) {                                    // If this is not the leftover proc
00218       int begin = 0, end = numLocal - nthLocal;                 //  Set begin and end of range to send
00219       splitRange(begin,end,key[l+1][0]-1,nprocs[l+1][0]);       //  Split range into nprocs[0]
00220       scnt = end - begin;                                       //  Set send count based on range
00221       newSize -= scnt;                                          //  Set newSize to account for sent data
00222       buffer.erase(buffer.begin(),buffer.begin()+scnt);         //  Erase from recv buffer the part that was sent
00223     }                                                           // Endif for leftover proc
00224     MPI_Barrier(MPI_COMM[l+1][0]);                              // Sync processes
00225     MPI_Gather(&scnt,1,MPI_INT,rcnt,1,MPI_INT,0,MPI_COMM[l+1][0]);// Gather the send count to get recv count
00226     if( key[l+1][0] == 0 ) {                                    // If this is the leftover proc
00227       rdsp[0] = 0;                                              //  Initialize the recv displacement
00228       for(int i=0; i!=numGather; ++i ) {                        //  Loop over processes to gather from
00229         rdsp[i+1] = rdsp[i] + rcnt[i];                          //   Set recv displacement based on recv count
00230       }                                                         //  End loop over processes
00231       newSize += rdsp[numGather] + rcnt[numGather];             //  Increment newSize by the recv count
00232       buffer.resize(newSize);                                   //  Resize recv buffer
00233     }                                                           // Endif for leftover proc
00234     scnt *= bytes;                                              // Multiply send count by byte size of data
00235     for(int i=0; i!= nprocs[l+1][0]; ++i ) {                    // Loop over group of processes
00236       rcnt[i] *= bytes;                                         //  Multiply recv count by byte size of data
00237       rdsp[i] *= bytes;                                         //  Multiply recv displacement by byte size of data
00238     }                                                           // End loop over group of processes
00239     MPI_Gatherv(&bodies[0],      scnt,     MPI_BYTE,            // Communicate bodies via MPI_Gatherv
00240                 &buffer[oldSize],rcnt,rdsp,MPI_BYTE,            // Offset recv buffer by oldSize
00241                 0,MPI_COMM[l+1][0]);                            // MPI_COMM[0] is used for gather
00242     bodies = buffer;                                            // Copy recv buffer to bodies
00243     delete[] rcnt;                                              // Delete recv count
00244     delete[] rdsp;                                              // Delete recv displacement
00245     if( key[l+1][0] == 0 ) sortBodies(bodies,buffer);           // Sort bodies in ascending order
00246     stopTimer("Bi Gather    ",printNow);                        // Stop timer 
00247   }
00248 
00249 public:
00250 //! Constructor
00251   Partition() : SerialFMM<equation>() {
00252     LEVEL = int(log(MPISIZE) / M_LN2 - 1e-5) + 1;               // Level of the process binary tree
00253     if(MPISIZE == 1) LEVEL = 0;                                 // Level is 0 for a serial execution
00254     XMIN.resize(LEVEL+1);                                       // Minimum position vector at each level
00255     XMAX.resize(LEVEL+1);                                       // Maximum position vector at each level
00256     startTimer("Split comm   ");                                // Start timer
00257     nprocs[0][0] = nprocs[0][1] = MPISIZE;                      // Initialize number of processes in groups
00258     offset[0][0] = offset[0][1] = 0;                            // Initialize rank offset of groups
00259      color[0][0] =  color[0][1] =  color[0][2] = 0;             // Initialize color of communicators
00260        key[0][0] =    key[0][1] =    key[0][2] = 0;             // Initialize key of communicators
00261     for( int l=0; l!=LEVEL; ++l ) {                             // Loop over levels of N-D hypercube communication
00262       bisectionGetComm(l);                                      //  Split the MPI communicator for that level
00263     }                                                           // End loop over levels of N-D hypercube communication
00264     stopTimer("Split comm   ",printNow);                        // Stop timer 
00265   }
00266 //! Destructor
00267   ~Partition() {}
00268 
00269 //! Set bounds of domain to be partitioned
00270   void setGlobDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
00271     numCells1D = 1 << getMaxLevel(bodies);                      // Set number of cells in one dimension at leaf level
00272     B_iter B = bodies.begin();                                  // Reset body iterator
00273     XMIN[0] = XMAX[0] = B->X;                                   // Initialize xmin,xmax
00274     MPI_Datatype MPI_TYPE = getType(XMIN[0][0]);                // Get MPI data type
00275     for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
00276       for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
00277         if     (B->X[d] < XMIN[0][d]) XMIN[0][d] = B->X[d];     //   Determine xmin
00278         else if(B->X[d] > XMAX[0][d]) XMAX[0][d] = B->X[d];     //   Determine xmax
00279       }                                                         //  End loop over each dimension
00280     }                                                           // End loop over bodies
00281     vect X;                                                     // Recv buffer
00282     MPI_Allreduce(XMAX[0],X,3,MPI_TYPE,MPI_MAX,MPI_COMM_WORLD); // Reduce global maximum
00283     XMAX[0] = X;                                                // Get data from buffer
00284     MPI_Allreduce(XMIN[0],X,3,MPI_TYPE,MPI_MIN,MPI_COMM_WORLD); // Reduce global minimum
00285     XMIN[0] = X;                                                // Get data from buffer
00286     for( int d=0; d!=3; ++d ) {                                 // Loop over each dimension
00287       X0[d] = (XMAX[0][d] + XMIN[0][d]) / 2;                    //  Calculate center of domain
00288       X0[d] = int(X0[d]+.5);                                    //  Shift center to nearest integer
00289       R0 = std::max(XMAX[0][d] - X0[d], R0);                    //  Calculate max distance from center
00290       R0 = std::max(X0[d] - XMIN[0][d], R0);                    //  Calculate max distance from center
00291     }                                                           // End loop over each dimension
00292     if( IMAGES != 0 ) {                                         // If periodic boundary condition
00293       if( X0[0]-R0 < x0[0]-r0 || x0[0]+r0 < X0[0]+R0            //  Check for outliers in x direction
00294        || X0[1]-R0 < x0[1]-r0 || x0[1]+r0 < X0[1]+R0            //  Check for outliers in y direction
00295        || X0[2]-R0 < x0[2]-r0 || x0[2]+r0 < X0[2]+R0 ) {        //  Check for outliers in z direction
00296         std::cout << "Error: Particles located outside periodic domain @ rank "
00297                   << MPIRANK << std::endl;                      //   Print error message
00298       }                                                         //  End if for outlier checking
00299       X0 = x0;                                                  //  Set center to x0 (default [0, 0, 0])
00300       R0 = r0;                                                  //  Set radius to r0 (default M_PI)
00301     } else {                                                    // If not periodic boundary condition
00302       R0 += 1e-5;                                               //  Add some leeway to root radius
00303     }                                                           // Endif for periodic boundary condition
00304     XMAX[0] = X0 + R0;                                          // Reposition global maximum
00305     XMIN[0] = X0 - R0;                                          // Reposition global minimum
00306   }
00307 
00308 //! Turn positions into indices of bins
00309   void binBodies(Bodies &bodies, int d) {
00310     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
00311       B->ICELL = bigint((B->X[d] - XMIN[0][d])                  // Bin body positions into integers
00312         / (XMAX[0][d] - XMIN[0][d]) * numCells1D);
00313     }                                                           // End loop over bodies
00314   }
00315 
00316 //! Split bodies according to iSplit
00317   int splitBodies(Bodies &bodies, bigint iSplit) {
00318     int nth = 0;                                                // Initialize splitting index
00319     while( nth < int(bodies.size()) && bodies[nth].ICELL <= iSplit ) nth++;// Determine end index of left side
00320     return nth;                                                 // Return end index of left side
00321   }
00322 
00323 //! Send bodies to next rank (round robin)
00324   void shiftBodies(Bodies &bodies) {
00325     int newSize;                                                // New number of bodies
00326     int oldSize = bodies.size();                                // Current number of bodies
00327     const int bytes = sizeof(bodies[0]);                        // Byte size of body structure
00328     const int isend = (MPIRANK + 1          ) % MPISIZE;        // Send to next rank (wrap around)
00329     const int irecv = (MPIRANK - 1 + MPISIZE) % MPISIZE;        // Receive from previous rank (wrap around)
00330     MPI_Request sreq,rreq;                                      // Send, recv request handles
00331 
00332     MPI_Isend(&oldSize,1,MPI_INT,irecv,0,MPI_COMM_WORLD,&sreq); // Send current number of bodies
00333     MPI_Irecv(&newSize,1,MPI_INT,isend,0,MPI_COMM_WORLD,&rreq); // Receive new number of bodies
00334     MPI_Wait(&sreq,MPI_STATUS_IGNORE);                          // Wait for send to complete
00335     MPI_Wait(&rreq,MPI_STATUS_IGNORE);                          // Wait for recv to complete
00336 
00337     buffer.resize(newSize);                                     // Resize buffer to new number of bodies
00338     MPI_Isend(&bodies[0],oldSize*bytes,MPI_BYTE,irecv,          // Send bodies to next rank
00339               1,MPI_COMM_WORLD,&sreq);
00340     MPI_Irecv(&buffer[0],newSize*bytes,MPI_BYTE,isend,          // Receive bodies from previous rank
00341               1,MPI_COMM_WORLD,&rreq);
00342     MPI_Wait(&sreq,MPI_STATUS_IGNORE);                          // Wait for send to complete
00343     MPI_Wait(&rreq,MPI_STATUS_IGNORE);                          // Wait for recv to complete
00344     bodies = buffer;                                            // Copy bodies from buffer
00345   }
00346 
00347 //! Parallel global nth_element on distributed memory
00348   template<typename T, typename T2>
00349   T2 nth_element(T &data, T2 n, MPI_Comm MPI_COMM0=0) {
00350     if( MPI_COMM0 == 0 ) {                                      // If MPI_COMM is not specified
00351       MPI_Comm_split(MPI_COMM_WORLD,0,MPIRANK,&MPI_COMM0);      //  Create an artificial MPI_COMM
00352     }
00353     MPI_Comm_size(MPI_COMM0,&MPISIZES);                         // Get number of MPI processes for split comm
00354     MPI_Comm_rank(MPI_COMM0,&MPIRANKS);                         // Get index of current MPI process for split comm
00355     int numData = data.size();                                  // Total size of data to perform nth_element
00356     int maxBucket = 1000;                                       // Maximum number of buckets
00357     int numBucket;                                              // Number of buckets
00358     int lOffset = 0;                                            // Local offset of region being considered
00359     MPI_Datatype MPI_TYPE = getType(n);                         // Get MPI data type
00360     int *rcnt = new int [MPISIZES];                             // MPI recv count
00361     Bigints send(maxBucket);                                    // MPI send buffer for data
00362     Bigints recv(maxBucket);                                    // MPI recv buffer for data
00363     T2 gOffset = 0;                                             // Global offset of region being considered
00364     T2 *isend = new T2 [maxBucket];                             // MPI send buffer for index
00365     T2 *irecv = new T2 [maxBucket];                             // MPI recv buffer for index
00366     T2 *iredu = new T2 [maxBucket];                             // Local scan of index buffer
00367     numBucket = getBucket(data,numData,lOffset,send,recv,MPI_COMM0);// Get global bucket data
00368 
00369     while( numBucket > 1 ) {                                    // While there are multiple candidates
00370       int ic=0, nth=0;                                          //  Initialize counters
00371       for( int i=0; i!=maxBucket; ++i ) isend[i] = 0;           //  Initialize bucket counter
00372       for( int i=0; i!=numData; ++i ) {                         //  Loop over range of data
00373         while( data[lOffset + i].ICELL > recv[ic] && ic < numBucket-1 ) ++ic;// Set counter to current bucket
00374         isend[ic]++;                                            //   Increment bucket counter
00375       }                                                         //  End loop over data
00376       MPI_Reduce(isend,irecv,numBucket,MPI_TYPE,                //  Reduce bucket counter
00377                  MPI_SUM,0,MPI_COMM0);
00378       if( MPIRANKS == 0 ) {                                     //  Only rank 0 operates on reduced data
00379         iredu[0] = 0;                                           //   Initialize global scan index
00380         for( int i=0; i!=numBucket-1; ++i ) {                   //   Loop over buckets
00381           iredu[i+1] = iredu[i] + irecv[i];                     //    Increment global scan index
00382         }                                                       //   End loop over buckets
00383         nth = 0;                                                //   Initialize index for bucket containing nth element
00384         while( nth < numBucket && n - gOffset > iredu[nth] ) ++nth;// Find index for bucket containing nth element
00385         --nth;                                                  //   Account for overshoot (do while?)
00386         if( nth == -1 ) nth = 0;                                //   If nth is -1 don't split
00387         gOffset += iredu[nth];                                  //   Increment global offset of region being considered
00388       }                                                         //  Endif for rank 0
00389       MPI_Bcast(&nth,1,MPI_INT,0,MPI_COMM0);                    //  Broadcast index for bucket containing nth element
00390       MPI_Bcast(&gOffset,1,MPI_TYPE,0,MPI_COMM0);               //  Broadcast global offset
00391       iredu[0] = 0;                                             //  Initialize local scan index
00392       for( int i=0; i!=numBucket-1; ++i ) {                     //  Loop over buckets
00393         iredu[i+1] = iredu[i] + isend[i];                       //   Increment local scan index
00394       }                                                         //  End loop over buckets
00395       if( nth == numBucket-1 ) {                                //  If nth is last bucket
00396         numData = numData-iredu[nth];                           //   Region of interest is that bucket
00397       } else {                                                  //  If nth is not the last bucket
00398         numData = iredu[nth+1]-iredu[nth];                      //   Region of interest is that bucket
00399       }                                                         //  Endif for last bucket
00400       lOffset += iredu[nth];                                    //  Increment local offset to that bucket
00401       numBucket = getBucket(data,numData,lOffset,send,recv,MPI_COMM0);// Get global bucket data
00402     }                                                           // End while loop
00403     delete[] rcnt;                                              // Delete recv count
00404     delete[] isend;                                             // Delete send buffer for index
00405     delete[] irecv;                                             // Delete recv buffer for index
00406     delete[] iredu;                                             // Delete local scan of index buffer
00407     return recv[0]-1;                                           // Return nth element
00408   }
00409 
00410 //! Partitioning by recursive bisection
00411   void bisection(Bodies &bodies) {
00412     startTimer("Bin bodies   ");                                // Start timer
00413     MPI_Datatype MPI_TYPE = getType(bodies[0].ICELL);           // Get MPI data type
00414     int newSize;                                                // New size of recv buffer
00415     bigint numLocal = bodies.size();                            // Local data size
00416     bigint numGlobal;                                           // Global data size
00417     MPI_Allreduce(&numLocal,&numGlobal,1,MPI_TYPE,MPI_SUM,MPI_COMM_WORLD);// Reduce Global data size
00418     bigint nthGlobal = (numGlobal * (nprocs[0][0] / 2)) / nprocs[0][0];// Split at nth global element
00419     binBodies(bodies,2);                                        // Bin bodies into leaf level cells
00420     buffer.resize(numLocal);                                    // Resize sort buffer
00421     stopTimer("Bin bodies   ",printNow);                        // Stop timer 
00422     sortBodies(bodies,buffer);                                  // Sort bodies in ascending order
00423     startTimer("Split bodies ");                                // Start timer
00424     bigint iSplit = nth_element(bodies,nthGlobal);              // Get cell index of nth global element
00425     int nthLocal = splitBodies(bodies,iSplit);                  // Split bodies based on iSplit
00426     stopTimer("Split bodies ",printNow);                        // Stop timer 
00427     for( int l=0; l!=LEVEL; ++l ) {                             // Loop over levels of N-D hypercube communication
00428       splitDomain(iSplit,l,2-l%3);                              //  Split the domain according to iSplit
00429       bisectionAlltoall(bodies,nthLocal,numLocal,newSize,l);    //  Communicate bodies by one-to-one MPI_Alltoallv
00430       if( nprocs[l][0] % 2 == 1 && nprocs[l][0] != 1 && nprocs[l+1][0] <= nprocs[l+1][1] )// If scatter is necessary
00431         bisectionScatter(bodies,nthLocal,newSize,l);            //  Communicate bodies by scattering from leftover proc
00432       if( nprocs[l][0] % 2 == 1 && nprocs[l][0] != 1 && nprocs[l+1][0] >= nprocs[l+1][1] )// If gather is necessary
00433         bisectionGather(bodies,nthLocal,numLocal,newSize,l);    //  Communicate bodies by gathering to leftover proc
00434 #ifdef DEBUG
00435       for(int j=0; j!=MPISIZE; ++j) {                           //  Loop over ranks
00436         MPI_Barrier(MPI_COMM_WORLD);                            //   Sync processes
00437         usleep(WAIT);                                           //   Wait for "WAIT" milliseconds
00438         if( MPIRANK == j ) {                                    //   If it's my turn to print
00439           std::cout << "MPIRANK : " << j << std::endl;          //    Print rank
00440           for(int i=0; i!=9; ++i) std::cout << bodies[numLocal/10*i].I << " ";// Print sampled body indices
00441           std::cout << std::endl;                               //    New line
00442         }                                                       //   Endif for my turn
00443       }                                                         //  End loop over ranks
00444 #endif
00445       startTimer("Bin bodies   ");                              //  Start timer
00446       numLocal = newSize;                                       //  Update local data size
00447       MPI_Allreduce(&numLocal,&numGlobal,1,MPI_TYPE,MPI_SUM,MPI_COMM[l+1][0]);// Reduce global data size
00448       nthGlobal = (numGlobal * (nprocs[l+1][0] / 2)) / nprocs[l+1][0];//  Split at nth global element
00449       binBodies(bodies,2-(l+1)%3);                              //  Bin bodies into leaf level cells
00450       buffer.resize(numLocal);                                  //  Resize sort buffer
00451       stopTimer("Bin bodies   ",printNow);                      //  Stop timer 
00452       sortBodies(bodies,buffer);                                //  Sort bodies in ascending order
00453       startTimer("Split bodies ");                              //  Start timer
00454       iSplit = nth_element(bodies,nthGlobal,MPI_COMM[l+1][0]);  //  Get cell index of nth global element
00455       nthLocal = splitBodies(bodies,iSplit);                    //  Split bodies based on iSplit
00456       stopTimer("Split bodies ",printNow);                      //  Stop timer 
00457     }                                                           // End loop over levels of N-D hypercube communication
00458   }
00459 
00460 //! Partition by recursive octsection
00461   void octsection(Bodies &bodies) {
00462     startTimer("Partition    ");                                // Start timer
00463     int byte = sizeof(bodies[0]);                               // Byte size of body structure
00464     int level = 0;                                              // Level of local root cell (0 for serial execution)
00465     if( MPISIZE > 1 ) level = int(log(MPISIZE-1) / M_LN2 / 3) + 1;// Level of local root cell for parallel execution
00466     BottomUp<equation>::setIndex(bodies,level);                 // Set index of bodies for that level
00467     buffer.resize(bodies.size());                               // Resize sort buffer
00468     stopTimer("Partition    ");                                 // Stop timer 
00469     sortBodies(bodies,buffer);                                  // Sort bodies in ascending order
00470     startTimer("Partition    ");                                // Start timer
00471     int *scnt = new int [MPISIZE];                              // Send count
00472     int *sdsp = new int [MPISIZE];                              // Send displacement
00473     int *rcnt = new int [MPISIZE];                              // Recv count
00474     int *rdsp = new int [MPISIZE];                              // Recv displacement
00475     for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
00476       scnt[i] = 0;                                              //  Initialize send counts
00477     }                                                           // End loop over ranks
00478     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
00479       int index = B->ICELL - ((1 << 3*level) - 1) / 7;          //  Get levelwise index
00480       int irank = index / (int(pow(8,level)) / MPISIZE);        //  Get rank which the cell belongs to
00481       scnt[irank]++;                                            //  Fill send count bucket
00482     }                                                           // End loop over bodies
00483     MPI_Alltoall(scnt,1,MPI_INT,rcnt,1,MPI_INT,MPI_COMM_WORLD); // Communicate send count to get recv count
00484     sdsp[0] = rdsp[0] = 0;                                      // Initialize send/recv displacements
00485     for( int irank=0; irank!=MPISIZE-1; ++irank ) {             // Loop over ranks
00486       sdsp[irank+1] = sdsp[irank] + scnt[irank];                //  Set send displacement based on send count
00487       rdsp[irank+1] = rdsp[irank] + rcnt[irank];                //  Set recv displacement based on recv count
00488     }                                                           // End loop over ranks
00489     buffer.resize(rdsp[MPISIZE-1]+rcnt[MPISIZE-1]);             // Resize recv buffer
00490     for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks
00491       scnt[irank] *= byte;                                      //  Multiply send count by byte size of data
00492       sdsp[irank] *= byte;                                      //  Multiply send displacement by byte size of data
00493       rcnt[irank] *= byte;                                      //  Multiply recv count by byte size of data
00494       rdsp[irank] *= byte;                                      //  Multiply recv displacement by byte size of data
00495     }                                                           // End loop over ranks
00496     MPI_Alltoallv(&bodies[0],scnt,sdsp,MPI_BYTE,&buffer[0],rcnt,rdsp,MPI_BYTE,MPI_COMM_WORLD);// Communicate bodies
00497     bodies = buffer;                                            // Copy recv buffer to bodies
00498     for( int l=0; l!=LEVEL; ++l ) {                             // Loop over level of N-D hypercube
00499       int d = 2 - l % 3;                                        //  Dimension of subdivision
00500       XMIN[l+1] = XMIN[l];                                      //  Set XMIN for next subdivision
00501       XMAX[l+1] = XMAX[l];                                      //  Set XMAX for next subdivision
00502       if( (MPIRANK >> (LEVEL - l - 1)) % 2 ) {                  //  If on right side
00503         XMIN[l+1][d] = (XMAX[l][d]+XMIN[l][d]) / 2;             //   Set XMIN to midpoint
00504       } else {                                                  //  If on left side
00505         XMAX[l+1][d] = (XMAX[l][d]+XMIN[l][d]) / 2;             //   Set XMAX to midpoint
00506       }                                                         //  Endif for side
00507     }                                                           // End loop over levels
00508     delete[] scnt;                                              // Delete send count
00509     delete[] sdsp;                                              // Delete send displacement
00510     delete[] rcnt;                                              // Delete recv count
00511     delete[] rdsp;                                              // Delete recv displacement
00512     stopTimer("Partition    ",printNow);                        // Stop timer 
00513   }
00514 
00515 //! Send bodies back to where they came from
00516   void unpartition(Bodies &bodies) {
00517     startTimer("Unpartition  ");                                // Start timer
00518     int byte = sizeof(bodies[0]);                               // Byte size of body structure
00519     int *scnt = new int [MPISIZE];                              // Send count
00520     int *sdsp = new int [MPISIZE];                              // Send displacement
00521     int *rcnt = new int [MPISIZE];                              // Recv count
00522     int *rdsp = new int [MPISIZE];                              // Recv displacement
00523     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
00524       B->ICELL = B->IPROC;                                      //  Copy process rank to cell index for sorting
00525     }                                                           // End loop over bodies
00526     buffer.resize(bodies.size());                               // Resize sort buffer
00527     stopTimer("Unpartition  ");                                 // Stop timer 
00528     sortBodies(bodies,buffer);                                  // Sort bodies in ascending order
00529     startTimer("Unpartition  ");                                // Start timer
00530     for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
00531       scnt[i] = 0;                                              //  Initialize send counts
00532     }                                                           // End loop over ranks
00533     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
00534       int irank = B->IPROC;                                     //  Get rank which the body belongs to
00535       scnt[irank]++;                                            //  Fill send count bucket
00536     }                                                           // End loop over bodies
00537     MPI_Alltoall(scnt,1,MPI_INT,rcnt,1,MPI_INT,MPI_COMM_WORLD); // Communicate send count to get recv count
00538     sdsp[0] = rdsp[0] = 0;                                      // Initialize send/recv displacements
00539     for( int irank=0; irank!=MPISIZE-1; ++irank ) {             // Loop over ranks
00540       sdsp[irank+1] = sdsp[irank] + scnt[irank];                //  Set send displacement based on send count
00541       rdsp[irank+1] = rdsp[irank] + rcnt[irank];                //  Set recv displacement based on recv count
00542     }                                                           // End loop over ranks
00543     buffer.resize(rdsp[MPISIZE-1]+rcnt[MPISIZE-1]);             // Resize recv buffer
00544     for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks
00545       scnt[irank] *= byte;                                      //  Multiply send count by byte size of data
00546       sdsp[irank] *= byte;                                      //  Multiply send displacement by byte size of data
00547       rcnt[irank] *= byte;                                      //  Multiply recv count by byte size of data
00548       rdsp[irank] *= byte;                                      //  Multiply recv displacement by byte size of data
00549     }                                                           // End loop over ranks
00550     MPI_Alltoallv(&bodies[0],scnt,sdsp,MPI_BYTE,&buffer[0],rcnt,rdsp,MPI_BYTE,MPI_COMM_WORLD);// Communicate bodies
00551     bodies = buffer;                                            // Copy recv buffer to bodies
00552     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
00553       assert(B->IPROC == MPIRANK);                              //  Make sure bodies are in the right rank
00554     }                                                           // End loop over bodies
00555     delete[] scnt;                                              // Delete send count
00556     delete[] sdsp;                                              // Delete send displacement
00557     delete[] rcnt;                                              // Delete recv count
00558     delete[] rdsp;                                              // Delete recv displacement
00559     stopTimer("Unpartition  ",printNow);                        // Stop timer 
00560   }
00561 };
00562 
00563 #endif
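
Usage note: the listing above is only the partitioning layer; it is driven by an application that owns the Bodies container. The sketch below shows one plausible calling sequence (set the global domain, redistribute by octsection, then return bodies to their home ranks). It is not part of partition.h, and it assumes a few things this file does not show: that Laplace is a valid Equation value, that the MyMPI base class initializes MPI in its constructor and finalizes it in its destructor, and that Bodies/B_iter come from types.h with the X and IPROC fields used above.

// example_partition.cxx : hypothetical driver, not shipped with ExaFMM
#include <cstdlib>       // drand48, srand48
#include "partition.h"   // pulls in mympi.h (MPI) and serialfmm.h

int main() {
  Bodies bodies(10000);                                 // Body container (assumed from types.h)
  Partition<Laplace> FMM;                               // MyMPI base assumed to call MPI_Init
  int mpirank;                                          // Rank of this process
  MPI_Comm_rank(MPI_COMM_WORLD,&mpirank);               // Valid once MPI is initialized
  srand48(mpirank);                                     // Rank-dependent seed so ranks differ
  for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {// Fill bodies with random positions
    for( int d=0; d!=3; ++d ) B->X[d] = 2 * drand48() - 1;// Uniform in [-1,1] per dimension
    B->IPROC = mpirank;                                 //  Record rank of origin for unpartition()
  }
  FMM.setGlobDomain(bodies);                            // Global bounding box via MPI_Allreduce
  FMM.octsection(bodies);                               // Redistribute bodies by recursive octsection
  // ... build the local tree and evaluate the FMM here ...
  FMM.unpartition(bodies);                              // Send each body back to the rank in B->IPROC
  return 0;                                             // MyMPI destructor assumed to call MPI_Finalize
}

bisection(bodies) could be substituted for octsection(bodies) in the same sequence when the process count is not a power of eight, since its scatter/gather steps absorb the leftover process of an odd group.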