Grid 0.7.0
SharedMemoryMPI.cc
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/communicator/SharedMemory.cc
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: Christoph Lehner <christoph@lhnr.de>
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License along
23 with this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25
26 See the full license in the file "LICENSE" in the top level distribution directory
27*************************************************************************************/
28/* END LEGAL */
29
30#define Mheader "SharedMemoryMpi: "
31
#include <Grid/GridCore.h>
#include <pwd.h>
#include <cstring>
34
35#ifdef GRID_CUDA
36#include <cuda_runtime_api.h>
37#endif
38#ifdef GRID_HIP
39#include <hip/hip_runtime_api.h>
40#endif
41#ifdef GRID_SYCL
42#ifdef ACCELERATOR_AWARE_MPI
43#define GRID_SYCL_LEVEL_ZERO_IPC
44#define SHM_SOCKETS
45#else
46#ifdef HAVE_NUMAIF_H
47 #warning " Using NUMAIF "
48#include <numaif.h>
49#endif
50#endif
51#include <syscall.h>
52#endif
53
54#include <sys/socket.h>
55#include <sys/un.h>
56
58
59#ifdef SHM_SOCKETS
60
/*
 * Barbaric extra intranode communication route in case we need sockets to pass FDs
 * Forced by level_zero not being nicely designed
 */
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
// Pass file descriptors between ranks on one node over AF_UNIX datagram
// sockets using SCM_RIGHTS ancillary data.
class UnixSockets {
public:
  // Create this rank's datagram socket and bind it at /tmp/GridUnixSocket.<rank>.
  static void Open(int rank)
  {
    sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);

    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
    unlink(sa_un.sun_path); // remove any stale socket left by a previous run
    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
      perror("bind failure");
      exit(EXIT_FAILURE);
    }
  }

  // Block until a descriptor arrives on our socket; return the received fd,
  // or -1 on error.
  static int RecvFileDescriptor(void)
  {
    int n;
    int fd;
    char buf[1];
    struct iovec iov;
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cms[CMSG_SPACE(sizeof(int))];

    // One dummy payload byte; the fd itself travels in the control data.
    iov.iov_base = buf;
    iov.iov_len = 1;

    memset(&msg, 0, sizeof msg);
    msg.msg_name = 0;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    msg.msg_control = (caddr_t)cms;
    msg.msg_controllen = sizeof cms;

    if((n=recvmsg(sock, &msg, 0)) < 0) {
      perror("recvmsg failed");
      return -1;
    }
    if(n == 0){
      perror("recvmsg returned 0");
      return -1;
    }
    cmsg = CMSG_FIRSTHDR(&msg);

    // The kernel installed a fresh descriptor for us in the cmsg payload.
    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));

    return fd;
  }

  // Send fildes to the rank whose socket is /tmp/GridUnixSocket.<xmit_to_rank>.
  static void SendFileDescriptor(int fildes,int xmit_to_rank)
  {
    struct msghdr msg;
    struct iovec iov;
    struct cmsghdr *cmsg = NULL;
    char ctrl[CMSG_SPACE(sizeof(int))];
    char data = ' ';

    memset(&msg, 0, sizeof(struct msghdr));
    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
    iov.iov_base = &data;
    iov.iov_len = sizeof(data);

    // FIX(review): bounded snprintf instead of sprintf.
    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank);

    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);

    msg.msg_name = (void *)&sa_un;
    msg.msg_namelen = sizeof(sa_un);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_controllen = CMSG_SPACE(sizeof(int));
    msg.msg_control = ctrl;

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));

    *((int *) CMSG_DATA(cmsg)) = fildes;

    // FIX(review): surface (but do not abort on) send failures; the return
    // value was silently discarded before.
    if (sendmsg(sock, &msg, 0) < 0) perror("sendmsg failed");
  };
};
159#endif
160
161
162/*Construct from an MPI communicator*/
164{
165 assert(_ShmSetup==0);
166 WorldComm = comm;
167 MPI_Comm_rank(WorldComm,&WorldRank);
168 MPI_Comm_size(WorldComm,&WorldSize);
169 // WorldComm, WorldSize, WorldRank
170
172 // Split into groups that can share memory
174#ifndef GRID_MPI3_SHM_NONE
175 MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
176#else
177 MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
178#endif
179
180 MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
181 MPI_Comm_size(WorldShmComm ,&WorldShmSize);
182
183 if ( WorldRank == 0) {
184 std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
185 std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
186 }
187 // WorldShmComm, WorldShmSize, WorldShmRank
188
189 // WorldNodes
191 assert( (WorldNodes * WorldShmSize) == WorldSize );
192
193
194 // FIXME: Check all WorldShmSize are the same ?
195
197 // find world ranks in our SHM group (i.e. which ranks are on our node)
199 MPI_Group WorldGroup, ShmGroup;
200 MPI_Comm_group (WorldComm, &WorldGroup);
201 MPI_Comm_group (WorldShmComm, &ShmGroup);
202
203 std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
204
205 WorldShmRanks.resize(WorldSize);
206 MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
207
209 // Identify who is in my group and nominate the leader
211 int g=0;
212 std::vector<int> MyGroup;
213 MyGroup.resize(WorldShmSize);
214 for(int rank=0;rank<WorldSize;rank++){
215 if(WorldShmRanks[rank]!=MPI_UNDEFINED){
216 assert(g<WorldShmSize);
217 MyGroup[g++] = rank;
218 }
219 }
220
221 std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
222 int myleader = MyGroup[0];
223
224 std::vector<int> leaders_1hot(WorldSize,0);
225 std::vector<int> leaders_group(WorldNodes,0);
226 leaders_1hot [ myleader ] = 1;
227
229 // global sum leaders over comm world
231 int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
232 assert(ierr==0);
233
235 // find the group leaders world rank
237 int group=0;
238 for(int l=0;l<WorldSize;l++){
239 if(leaders_1hot[l]){
240 leaders_group[group++] = l;
241 }
242 }
243
245 // Identify the node of the group in which I (and my leader) live
247 WorldNode=-1;
248 for(int g=0;g<WorldNodes;g++){
249 if (myleader == leaders_group[g]){
250 WorldNode=g;
251 }
252 }
253 assert(WorldNode!=-1);
254 _ShmSetup=1;
255}
// Gray encode support: return the reflected binary (Gray) code of `binary`,
// i.e. successive integers differ in exactly one bit.
int BinaryToGray (int binary) {
  int gray = (binary>>1)^binary;
  return gray;
}
// Return log2(TwoToPower) when TwoToPower is an exact power of two no
// larger than 2^MAXLOG2; otherwise return -1.
int Log2Size(int TwoToPower,int MAXLOG2)
{
  for(int exponent=0; exponent<=MAXLOG2; exponent++){
    if ( TwoToPower == (0x1<<exponent) ) {
      return exponent;
    }
  }
  return -1;
}
273{
275 // Look and see if it looks like an HPE 8600 based on hostname conventions
277 const int namelen = _POSIX_HOST_NAME_MAX;
278 char name[namelen];
279 int R;
280 int I;
281 int N;
282 gethostname(name,namelen);
283 int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
284
285 if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
286 else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
287}
288
290{
292 // Assert power of two shm_size.
295 assert(log2size != -1);
296
298 // Identify the hypercube coordinate of this node using hostname
300 // n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
301 // i runs 0..7 3 bits
302 // r runs 0..3 2 bits
303 // 2^10 = 1024 nodes
304 const int maxhdim = 10;
305 std::vector<int> HyperCubeCoords(maxhdim,0);
306 std::vector<int> RootHyperCubeCoords(maxhdim,0);
307 int R;
308 int I;
309 int N;
310 const int namelen = _POSIX_HOST_NAME_MAX;
311 char name[namelen];
312
313 // Parse ICE-XA hostname to get hypercube location
314 gethostname(name,namelen);
315 int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
316 assert(nscan==3);
317
318 int nlo = N%9;
319 int nhi = N/9;
320 uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
321 uint32_t rootcoor = hypercoor;
322
324 // Print debug info
326 for(int d=0;d<maxhdim;d++){
327 HyperCubeCoords[d] = (hypercoor>>d)&0x1;
328 }
329
330 std::string hname(name);
331 // std::cout << "hostname "<<hname<<std::endl;
332 // std::cout << "R " << R << " I " << I << " N "<< N
333 // << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
334
336 // broadcast node 0's base coordinate for this partition.
338 MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
339 hypercoor=hypercoor-rootcoor;
340 assert(hypercoor<WorldSize);
341 assert(hypercoor>=0);
342
344 // Printing
346 for(int d=0;d<maxhdim;d++){
347 HyperCubeCoords[d] = (hypercoor>>d)&0x1;
348 }
349
351 // Identify subblock of ranks on node spreading across dims
352 // in a maximally symmetrical way
354 int ndimension = processors.size();
355 Coordinate processor_coor(ndimension);
356 Coordinate WorldDims = processors;
357 Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
358 Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
359 Coordinate HyperCoor(ndimension);
360
361 GetShmDims(WorldDims,ShmDims);
362 SHM = ShmDims;
363
365 // Establish torus of processes and nodes with sub-blockings
367 for(int d=0;d<ndimension;d++){
368 NodeDims[d] = WorldDims[d]/ShmDims[d];
369 }
371 // Map Hcube according to physical lattice
372 // must partition. Loop over dims and find out who would join.
374 int hcoor = hypercoor;
375 for(int d=0;d<ndimension;d++){
376 int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
377 int msk = (0x1<<bits)-1;
378 HyperCoor[d]=hcoor & msk;
379 HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
380 hcoor = hcoor >> bits;
381 }
383 // Check processor counts match
385 int Nprocessors=1;
386 for(int i=0;i<ndimension;i++){
387 Nprocessors*=processors[i];
388 }
389 assert(WorldSize==Nprocessors);
390
392 // Establish mapping between lexico physics coord and WorldRank
394 int rank;
395
396 Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
397
398 for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
399
400 Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
401 for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
402 Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
403
405 // Build the new communicator
407 int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
408 assert(ierr==0);
409}
411{
413 // Identify subblock of ranks on node spreading across dims
414 // in a maximally symmetrical way
416 int ndimension = processors.size();
417 Coordinate processor_coor(ndimension);
418 Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
419 Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
420
421 GetShmDims(WorldDims,ShmDims);
422 SHM=ShmDims;
423
425 // Establish torus of processes and nodes with sub-blockings
427 for(int d=0;d<ndimension;d++){
428 NodeDims[d] = WorldDims[d]/ShmDims[d];
429 }
430
432 // Check processor counts match
434 int Nprocessors=1;
435 for(int i=0;i<ndimension;i++){
436 Nprocessors*=processors[i];
437 }
438 assert(WorldSize==Nprocessors);
439
441 // Establish mapping between lexico physics coord and WorldRank
443 int rank;
444
445 Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
446 Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
447 for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
448 Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
449
451 // Build the new communicator
453 int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
454 assert(ierr==0);
455}
456
457// SHMGET
459#ifdef GRID_MPI3_SHMGET
460void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
461{
462 std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
463 assert(_ShmSetup==1);
464 assert(_ShmAlloc==0);
465
467 // allocate the shared windows for our group
469 MPI_Barrier(WorldShmComm);
471 std::vector<int> shmids(WorldShmSize);
472
473 if ( WorldShmRank == 0 ) {
474 for(int r=0;r<WorldShmSize;r++){
475 size_t size = bytes;
476 key_t key = IPC_PRIVATE;
477 int flags = IPC_CREAT | SHM_R | SHM_W;
478#ifdef SHM_HUGETLB
479 if (Hugepages) flags|=SHM_HUGETLB;
480#endif
481 if ((shmids[r]= shmget(key,size, flags)) ==-1) {
482 int errsv = errno;
483 printf("Errno %d\n",errsv);
484 printf("key %d\n",key);
485 printf("size %ld\n",size);
486 printf("flags %d\n",flags);
487 perror("shmget");
488 exit(1);
489 }
490 }
491 }
492 MPI_Barrier(WorldShmComm);
493 MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
494 MPI_Barrier(WorldShmComm);
495
496 for(int r=0;r<WorldShmSize;r++){
497 WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
498 if (WorldShmCommBufs[r] == (uint64_t *)-1) {
499 perror("Shared memory attach failure");
500 shmctl(shmids[r], IPC_RMID, NULL);
501 exit(2);
502 }
503 }
504 MPI_Barrier(WorldShmComm);
506 // Mark for clean up
508 for(int r=0;r<WorldShmSize;r++){
509 shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
510 }
511 MPI_Barrier(WorldShmComm);
512
513 _ShmAlloc=1;
514 _ShmAllocBytes = bytes;
515}
516#endif
517
519// Hugetlbfs mapping intended
521#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
522void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
523{
524 void * ShmCommBuf ;
525 assert(_ShmSetup==1);
526 assert(_ShmAlloc==0);
527
529 // allocate the pointer array for shared windows for our group
531 MPI_Barrier(WorldShmComm);
533
535 // TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
536 // The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
537 // e.g. DGX1, supermicro board,
539 // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
540
542 // Each MPI rank should allocate our own buffer
544#ifndef ACCELERATOR_AWARE_MPI
545 // printf("Host buffer allocate for GPU non-aware MPI\n");
546 HostCommBuf= malloc(bytes);
547#endif
548 ShmCommBuf = acceleratorAllocDevice(bytes);
549 if (ShmCommBuf == (void *)NULL ) {
550 std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
551 exit(EXIT_FAILURE);
552 }
553 if ( WorldRank == 0 ){
554 std::cout << Mheader " acceleratorAllocDevice "<< bytes
555 << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
556 }
557 SharedMemoryZero(ShmCommBuf,bytes);
558 if ( WorldRank == 0 ){
559 std::cout<< Mheader "Setting up IPC"<<std::endl;
560 }
562 // Loop over ranks/gpu's on our node
564#ifdef SHM_SOCKETS
565 UnixSockets::Open(WorldShmRank);
566#endif
567 for(int r=0;r<WorldShmSize;r++){
568
569 MPI_Barrier(WorldShmComm);
570
571#ifndef GRID_MPI3_SHM_NONE
573 // If it is me, pass around the IPC access key
575 void * thisBuf = ShmCommBuf;
576 if(!Stencil_force_mpi) {
577#ifdef GRID_SYCL_LEVEL_ZERO_IPC
578 typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
579
580 auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
581 auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
582
583 ze_ipc_mem_handle_t ihandle;
584 clone_mem_t handle;
585
586 if ( r==WorldShmRank ) {
587 auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
588 if ( err != ZE_RESULT_SUCCESS ) {
589 std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
590 exit(EXIT_FAILURE);
591 }
592 memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
593 handle.pid = getpid();
594 memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
595#ifdef SHM_SOCKETS
596 for(int rr=0;rr<WorldShmSize;rr++){
597 if(rr!=r){
598 UnixSockets::SendFileDescriptor(handle.fd,rr);
599 }
600 }
601#endif
602 }
603#endif
604#ifdef GRID_CUDA
605 cudaIpcMemHandle_t handle;
606 if ( r==WorldShmRank ) {
607 auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
608 if ( err != cudaSuccess) {
609 std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
610 exit(EXIT_FAILURE);
611 }
612 }
613#endif
614#ifdef GRID_HIP
615 hipIpcMemHandle_t handle;
616 if ( r==WorldShmRank ) {
617 auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
618 if ( err != hipSuccess) {
619 std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
620 exit(EXIT_FAILURE);
621 }
622 }
623#endif
624
626 // Share this IPC handle across the Shm Comm
628 {
629 MPI_Barrier(WorldShmComm);
630 int ierr=MPI_Bcast(&handle,
631 sizeof(handle),
632 MPI_BYTE,
633 r,
635 assert(ierr==0);
636 }
637
639 // If I am not the source, overwrite thisBuf with remote buffer
641
642#ifdef GRID_SYCL_LEVEL_ZERO_IPC
643 if ( r!=WorldShmRank ) {
644 thisBuf = nullptr;
645 int myfd;
646#ifdef SHM_SOCKETS
647 myfd=UnixSockets::RecvFileDescriptor();
648#else
649 // std::cout<<"mapping seeking remote pid/fd "
650 // <<handle.pid<<"/"
651 // <<handle.fd<<std::endl;
652
653 int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
654 // std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
655 // int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
656 myfd = syscall(438,pidfd,handle.fd,0);
657 int err_t = errno;
658 if (myfd < 0) {
659 fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
660 perror("pidfd_getfd failed ");
661 assert(0);
662 }
663#endif
664 // std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
665 memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
666 memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
667
668 auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
669 if ( err != ZE_RESULT_SUCCESS ) {
670 std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
671 std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
672 exit(EXIT_FAILURE);
673 }
674 assert(thisBuf!=nullptr);
675 }
676#endif
677#ifdef GRID_CUDA
678 if ( r!=WorldShmRank ) {
679 auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
680 if ( err != cudaSuccess) {
681 std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
682 exit(EXIT_FAILURE);
683 }
684 }
685#endif
686#ifdef GRID_HIP
687 if ( r!=WorldShmRank ) {
688 auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
689 if ( err != hipSuccess) {
690 std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
691 exit(EXIT_FAILURE);
692 }
693 }
694#endif
696 // Save a copy of the device buffers
698 }
699 WorldShmCommBufs[r] = thisBuf;
700#else
701 WorldShmCommBufs[r] = ShmCommBuf;
702#endif
703 MPI_Barrier(WorldShmComm);
704 }
705
706 _ShmAllocBytes=bytes;
707 _ShmAlloc=1;
708}
709
710#else
711#ifdef GRID_MPI3_SHMMMAP
712void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
713{
714 std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
715 assert(_ShmSetup==1);
716 assert(_ShmAlloc==0);
718 // allocate the shared windows for our group
720 MPI_Barrier(WorldShmComm);
722
724 // Hugetlbfs and others map filesystems as mappable huge pages
726 char shm_name [NAME_MAX];
727 for(int r=0;r<WorldShmSize;r++){
728
729 sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
730 int fd=open(shm_name,O_RDWR|O_CREAT,0666);
731 if ( fd == -1) {
732 printf("open %s failed\n",shm_name);
733 perror("open hugetlbfs");
734 exit(0);
735 }
736 int mmap_flag = MAP_SHARED ;
737#ifdef MAP_POPULATE
738 mmap_flag|=MAP_POPULATE;
739#endif
740#ifdef MAP_HUGETLB
741 if ( flags ) mmap_flag |= MAP_HUGETLB;
742#endif
743 void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
744 if ( ptr == (void *)MAP_FAILED ) {
745 printf("mmap %s failed\n",shm_name);
746 perror("failed mmap"); assert(0);
747 }
748 assert(((uint64_t)ptr&0x3F)==0);
749 close(fd);
750 WorldShmCommBufs[r] =ptr;
751 // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
752 }
753 std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
754 _ShmAlloc=1;
755 _ShmAllocBytes = bytes;
756};
757#endif // MMAP
758
759#ifdef GRID_MPI3_SHM_NONE
760void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
761{
762 std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
763 assert(_ShmSetup==1);
764 assert(_ShmAlloc==0);
766 // allocate the shared windows for our group
768 MPI_Barrier(WorldShmComm);
770
772 // Hugetlbf and others map filesystems as mappable huge pages
774 char shm_name [NAME_MAX];
775 assert(WorldShmSize == 1);
776 for(int r=0;r<WorldShmSize;r++){
777
778 int fd=-1;
779 int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
780#ifdef MAP_POPULATE
781 mmap_flag|=MAP_POPULATE;
782#endif
783#ifdef MAP_HUGETLB
784 if ( flags ) mmap_flag |= MAP_HUGETLB;
785#endif
786 void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
787 if ( ptr == (void *)MAP_FAILED ) {
788 printf("mmap %s failed\n",shm_name);
789 perror("failed mmap"); assert(0);
790 }
791 assert(((uint64_t)ptr&0x3F)==0);
792 close(fd);
793 WorldShmCommBufs[r] =ptr;
794 // std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
795 }
796 _ShmAlloc=1;
797 _ShmAllocBytes = bytes;
798};
799#endif // MMAP
800
801#ifdef GRID_MPI3_SHMOPEN
803// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
804// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
805// the posix shm virtual file system
807void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
808{
809 std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
810 assert(_ShmSetup==1);
811 assert(_ShmAlloc==0);
812 MPI_Barrier(WorldShmComm);
814
815 char shm_name [NAME_MAX];
816 if ( WorldShmRank == 0 ) {
817 for(int r=0;r<WorldShmSize;r++){
818
819 size_t size = bytes;
820
821 struct passwd *pw = getpwuid (getuid());
822 sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
823
824 shm_unlink(shm_name);
825 int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
826 if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
827 ftruncate(fd, size);
828
829 int mmap_flag = MAP_SHARED;
830#ifdef MAP_POPULATE
831 mmap_flag |= MAP_POPULATE;
832#endif
833#ifdef MAP_HUGETLB
834 if (flags) mmap_flag |= MAP_HUGETLB;
835#endif
836 void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
837
838 if ( ptr == (void * )MAP_FAILED ) {
839 perror("failed mmap");
840 assert(0);
841 }
842 assert(((uint64_t)ptr&0x3F)==0);
843
844 WorldShmCommBufs[r] =ptr;
845 close(fd);
846 }
847 }
848
849 MPI_Barrier(WorldShmComm);
850
851 if ( WorldShmRank != 0 ) {
852 for(int r=0;r<WorldShmSize;r++){
853
854 size_t size = bytes ;
855
856 struct passwd *pw = getpwuid (getuid());
857 sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
858
859 int fd=shm_open(shm_name,O_RDWR,0666);
860 if ( fd<0 ) { perror("failed shm_open"); assert(0); }
861
862 void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
863 if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
864 assert(((uint64_t)ptr&0x3F)==0);
865 WorldShmCommBufs[r] =ptr;
866
867 close(fd);
868 }
869 }
870 _ShmAlloc=1;
871 _ShmAllocBytes = bytes;
872}
873#endif
874#endif // End NVCC case for GPU device buffers
875
877// Routines accessing shared memory should route through for GPU safety
879void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
880{
881#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
882 acceleratorMemSet(dest,0,bytes);
883#else
884 bzero(dest,bytes);
885#endif
886}
887//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
888//{
889//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
890// acceleratorCopyToDevice(src,dest,bytes);
891//#else
892// bcopy(src,dest,bytes);
893//#endif
894//}
896// Global shared functionality finished
897// Now move to per communicator functionality
900{
901 int rank, size;
902 MPI_Comm_rank(comm,&rank);
903 MPI_Comm_size(comm,&size);
904 ShmRanks.resize(size);
905
907 // Split into groups that can share memory
909#ifndef GRID_MPI3_SHM_NONE
910 MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
911#else
912 MPI_Comm_split(comm, rank, 0, &ShmComm);
913#endif
914 MPI_Comm_rank(ShmComm ,&ShmRank);
915 MPI_Comm_size(ShmComm ,&ShmSize);
916 ShmCommBufs.resize(ShmSize);
917
919 // Map ShmRank to WorldShmRank and use the right buffer
921 assert (GlobalSharedMemory::ShmAlloc()==1);
923 for(int r=0;r<ShmSize;r++){
924
925 uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
926
927 MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
928
930 // std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
931 }
933
934#ifndef ACCELERATOR_AWARE_MPI
935 host_heap_size = heap_size;
936 HostCommBuf= GlobalSharedMemory::HostCommBuf;
937 HostBufferFreeAll();
938#endif
939
941 // find comm ranks in our SHM group (i.e. which ranks are on our node)
943 MPI_Group FullGroup, ShmGroup;
944 MPI_Comm_group (comm , &FullGroup);
945 MPI_Comm_group (ShmComm, &ShmGroup);
946
947 std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
948 MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
949
950#ifdef GRID_SHM_FORCE_MPI
951 // Hide the shared memory path between ranks
952 {
953 for(int r=0;r<size;r++){
954 if ( r!=rank ) {
955 ShmRanks[r] = MPI_UNDEFINED;
956 }
957 }
958 }
959#endif
960
961 // SharedMemoryTest();
962}
963
964// On node barrier
967{
968 MPI_Barrier (ShmComm);
969}
970
971// Test the shared memory is working
974{
975 ShmBarrier();
976 uint64_t check[3];
977 uint64_t magic = 0x5A5A5A;
978 if ( ShmRank == 0 ) {
979 for(uint64_t r=0;r<ShmSize;r++){
981 check[1]=r;
982 check[2]=magic;
983 acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
984 }
985 }
986 ShmBarrier();
987 for(uint64_t r=0;r<ShmSize;r++){
988 acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
989 assert(check[0]==GlobalSharedMemory::WorldNode);
990 assert(check[1]==r);
991 assert(check[2]==magic);
992 }
993 ShmBarrier();
994 std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
995}
996
998{
999 int gpeer = ShmRanks[rank];
1000 if (gpeer == MPI_UNDEFINED){
1001 return NULL;
1002 } else {
1003 return ShmCommBufs[gpeer];
1004 }
1005}
1006void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
1007{
1008 int gpeer = ShmRanks[rank];
1009 assert(gpeer!=ShmRank); // never send to self
1010 // std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
1011 if (gpeer == MPI_UNDEFINED){
1012 return NULL;
1013 } else {
1014 uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
1015 uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
1016 // std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
1017 return (void *) remote;
1018 }
1019}
// NOTE(review): headerless body (signature dropped by extraction) —
// presumably SharedMemory's destructor/cleanup; confirm against full source.
// Frees the node communicator, but only if MPI is still live: calling
// MPI_Comm_free after MPI_Finalize is erroneous.
1021{
1022 int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
1023 if ( !MPI_is_finalised ) {
1024 MPI_Comm_free(&ShmComm);
1025 }
1026};
1027
1029
void * acceleratorAllocDevice(size_t bytes)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
void acceleratorMemSet(void *base, int value, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
bool Stencil_force_mpi
#define GRID_SHM_PATH
Definition Config.h:104
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
Out accelerator_inline binary(Input1 src_1, Input2 src_2, Operation op)
GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE")
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
int BinaryToGray(int binary)
#define Mheader
int Log2Size(int TwoToPower, int MAXLOG2)
int Grid_MPI_Comm
accelerator_inline size_type size(void) const
Definition Coordinate.h:52
static void SharedMemoryAllocate(uint64_t bytes, int flags)
static const int MAXLOG2RANKSPERNODE
static int WorldShmSize
static void OptimalCommunicatorHypercube(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims)
static Grid_MPI_Comm WorldComm
static uint64_t _ShmAllocBytes
static std::vector< int > WorldShmRanks
static uint64_t ShmAllocBytes(void)
static void Init(Grid_MPI_Comm comm)
static void GetShmDims(const Coordinate &WorldDims, Coordinate &ShmDims)
static int _ShmAlloc
static void SharedMemoryZero(void *dest, size_t bytes)
static int _ShmSetup
static int HPEhypercube
static int WorldNodes
static int WorldShmRank
static void OptimalCommunicatorSharedMemory(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims)
static std::vector< void * > WorldShmCommBufs
static int ShmAlloc(void)
static Grid_MPI_Comm WorldShmComm
static void OptimalCommunicator(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims)
void * ShmBufferTranslate(int rank, void *local_p)
std::vector< int > ShmRanks
void ShmBarrier(void)
void * ShmBuffer(int rank)
void ShmBufferFreeAll(void)
void SharedMemoryTest(void)
void SetCommunicator(Grid_MPI_Comm comm)
std::vector< void * > ShmCommBufs
Grid_MPI_Comm ShmComm