45 MPI_Initialized(&flag);
48#ifndef GRID_COMMS_THREADS
53 MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
55 MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
58 if( (
nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
62 if( (
nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
83 int ierr=MPI_Cart_shift(
communicator,dim,shift,&source,&dest);
105 MPI_Comm optimal_comm;
116 MPI_Comm_free(&optimal_comm);
131 for(
int d=0;d<parent_ndimension;d++){
145 for(
int d=0;d<processors.
size();d++) {
146 childsize *= processors[d];
148 int Nchild = Nparent/childsize;
149 assert (childsize * Nchild == Nparent);
156 ccoor[d] = parent_processor_coor[d] % processors[d];
157 scoor[d] = parent_processor_coor[d] / processors[d];
158 ssize[d] = parent_processors[d] / processors[d];
159 if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d];
165 Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
166 Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
174 int ierr= MPI_Comm_split(parent.
communicator,srank,crank,&comm_split);
179 int ierr = MPI_Comm_dup (parent.
communicator,&comm_split);
196 MPI_Comm_free(&comm_split);
200 for(
int d=0;d<processors.
size();d++){
201 std::cout << d<<
" " <<
_processor_coor[d] <<
" " << ccoor[d]<<std::endl;
204 for(
int d=0;d<processors.
size();d++){
232 std::cout <<
"InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
237 std::cout << std::endl;
252 int MPI_is_finalised;
253 MPI_Finalized(&MPI_is_finalised);
261#ifdef USE_GRID_REDUCTION
274 int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,
communicator);
280 int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,
communicator);
286 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,
communicator);
291 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,
communicator);
296 int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,
communicator);
300 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,
communicator);
305 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,
communicator);
311 int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,
communicator);
317 int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,
communicator);
323 int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,
communicator);
329 int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,
communicator);
349 int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,
communicator,&rrq);
354 ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,
communicator,&xrq);
360 int nreq=list.size();
364 std::vector<MPI_Status> status(nreq);
365 int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
377 std::vector<MpiCommsRequest_t> reqs(0);
388 ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
389 recv,bytes,MPI_CHAR,from, from,
401 std::vector<CommsRequest_t> list;
403 offbytes +=
StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
410 if ( grank == MPI_UNDEFINED )
return true;
414#ifdef ACCELERATOR_AWARE_MPI
422 int xbytes,
int rbytes,
int dir)
427 void *xmit,
void *xmit_comp,
429 void *recv,
void *recv_comp,
431 int xbytes,
int rbytes,
int dir)
434 int commdir=dir%ncomm;
447 double off_node_bytes=0.0;
454 ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,
communicator_halo[commdir],&rrq);
457 off_node_bytes+=rbytes;
472 ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,
communicator_halo[commdir],&xrq);
475 off_node_bytes+=xbytes;
484 return off_node_bytes;
489 int nreq=list.size();
494 std::vector<MPI_Status> status(nreq);
495 int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
539 int xbytes,
int rbytes,
int dir)
546 int commdir=dir%ncomm;
559 double off_node_bytes=0.0;
562 void * host_recv = NULL;
563 void * host_xmit = NULL;
574 host_recv = this->HostBufferMalloc(rbytes);
575 ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,
communicator_halo[commdir],&rrq);
578 srq.PacketType = InterNodeRecv;
581 srq.host_buf = host_recv;
582 srq.device_buf = recv;
584 off_node_bytes+=rbytes;
593 host_xmit = this->HostBufferMalloc(xbytes);
602 srq.PacketType = InterNodeXmit;
605 srq.host_buf = host_xmit;
606 srq.device_buf = xmit;
609 srq.commdir = commdir;
614 return off_node_bytes;
627 for(
int idx = 0; idx<list.size();idx++){
629 if ( list[idx].PacketType==InterNodeRecv ) {
633 int ierr = MPI_Test(&list[idx].req,&flag,&status);
639 list[idx].PacketType=InterNodeReceiveHtoD;
656 for(
int idx = 0; idx<list.size();idx++){
658 if ( list[idx].PacketType==InterNodeXmit ) {
662 void *host_xmit = list[idx].host_buf;
663 uint32_t xbytes = list[idx].bytes;
664 int dest = list[idx].dest;
665 int tag = list[idx].tag;
666 int commdir = list[idx].commdir;
674 int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,
communicator_halo[commdir],&xrq);
679 list[idx].PacketType=InterNodeXmitISend;
691 void *xmit,
void *xmit_comp,
693 void *recv,
void *recv_comp,
695 int xbytes,
int rbytes,
int dir)
698 int commdir=dir%ncomm;
711 double off_node_bytes=0.0;
714 void * host_xmit = NULL;
737 srq.PacketType = IntraNodeRecv;
741 srq.device_buf = xmit;
760 srq.PacketType = IntraNodeXmit;
764 srq.device_buf = xmit;
773 return off_node_bytes;
779 std::vector<MPI_Status> status;
780 std::vector<MPI_Request> MpiRequests;
782 for(
int r=0;r<list.size();r++){
784 if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
788 int nreq=MpiRequests.size();
791 status.resize(MpiRequests.size());
792 int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]);
804 this->HostBufferFreeAll();
831 int ierr=MPI_Bcast(data,
851 int ierr= MPI_Bcast(data,
883 assert(words == iwords);
884 assert(bytes == ibytes);
885 MPI_Type_contiguous(ibytes,MPI_BYTE,&
object);
886 MPI_Type_commit(&
object);
887 MPI_Alltoall(in,iwords,
object,out,iwords,
object,
communicator);
888 MPI_Type_free(&
object);
void acceleratorCopySynchronise(void)
int acceleratorIsCommunicable(void *ptr)
int acceleratorEventIsComplete(acceleratorEvent_t ev)
acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes)
acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes)
acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes)
AcceleratorVector< int, MaxDims > Coordinate
void Grid_quiesce_nodes(void)
void Grid_unquiesce_nodes(void)
#define NAMESPACE_BEGIN(A)
accelerator_inline void resize(size_type sz)
accelerator_inline size_type size(void) const
void StencilBarrier(void)
void GlobalXOR(uint32_t &)
double StencilSendToRecvFromPrepare(std::vector< CommsRequest_t > &list, void *xmit, int xmit_to_rank, int do_xmit, void *recv, int recv_from_rank, int do_recv, int xbytes, int rbytes, int dir)
Coordinate _processor_coor
void ProcessorCoorFromRank(int rank, Coordinate &coor)
CartesianCommunicator(const Coordinate &processors, const CartesianCommunicator &parent, int &srank)
void StencilSendToRecvFromComplete(std::vector< CommsRequest_t > &waitall, int i)
virtual ~CartesianCommunicator()
void Broadcast(int root, void *data, int bytes)
void StencilSendToRecvFromPollDtoH(std::vector< CommsRequest_t > &list)
Coordinate _shm_processors
static void Init(int *argc, char ***argv)
double StencilSendToRecvFrom(void *xmit, int xmit_to_rank, int do_xmit, void *recv, int recv_from_rank, int do_recv, int bytes, int dir)
void AllToAll(int dim, std::vector< T > &in, std::vector< T > &out)
static int RankWorld(void)
void StencilSendToRecvFromPollIRecv(std::vector< CommsRequest_t > &list)
double StencilSendToRecvFromBegin(std::vector< CommsRequest_t > &list, void *xmit, void *xmit_comp, int xmit_to_rank, int do_xmit, void *recv, void *recv_comp, int recv_from_rank, int do_recv, int xbytes, int rbytes, int dir)
static void BarrierWorld(void)
void CommsComplete(std::vector< MpiCommsRequest_t > &list)
void GlobalSumVector(RealF *, int N)
void SendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, int recv_from_rank, int bytes)
int RankFromProcessorCoor(Coordinate &coor)
static Grid_MPI_Comm communicator_world
static void BroadcastWorld(int root, void *data, int bytes)
void GlobalSumP2P(obj &o)
void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base)
void SendToRecvFromBegin(std::vector< MpiCommsRequest_t > &list, void *xmit, int dest, void *recv, int from, int bytes, int dir)
void ShiftedRanks(int dim, int shift, int &source, int &dest)
unsigned long _ndimension
Grid_MPI_Comm communicator
std::vector< Grid_MPI_Comm > communicator_halo
static bool StepLog(const char *name)
static void SharedMemoryAllocate(uint64_t bytes, int flags)
static void Init(Grid_MPI_Comm comm)
static uint64_t MAX_MPI_SHM_BYTES
static void OptimalCommunicator(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims)
void * ShmBufferTranslate(int rank, void *local_p)
std::vector< int > ShmRanks
void SetCommunicator(Grid_MPI_Comm comm)