#define Mheader "SharedMemoryMpi: "

#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#endif
#endif

#warning " Using NUMAIF "

#include <sys/socket.h>
#include <sys/un.h>
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
static void Open(int rank)
{
  // socket() returns -1 on failure; descriptor 0 is legal, so test >= 0
  sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock >= 0);

  struct sockaddr_un sa_un = { 0 };
  sa_un.sun_family = AF_UNIX;
  snprintf(sa_un.sun_path, sizeof(sa_un.sun_path), sock_path_fmt, rank);
  unlink(sa_un.sun_path);
  if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
    perror("bind failure");
    exit(EXIT_FAILURE);
  }
}
static int RecvFileDescriptor(void)
{
  int n;
  int fd;
  char buf[1];
  struct iovec iov;
  struct msghdr msg;
  struct cmsghdr *cmsg;
  char cms[CMSG_SPACE(sizeof(int))];

  // A one-byte payload must be received along with the ancillary data
  iov.iov_base = buf;
  iov.iov_len  = 1;

  memset(&msg, 0, sizeof msg);
  msg.msg_iov        = &iov;
  msg.msg_iovlen     = 1;
  msg.msg_control    = (caddr_t)cms;
  msg.msg_controllen = sizeof cms;

  if ((n = recvmsg(sock, &msg, 0)) < 0) {
    perror("recvmsg failed");
    return -1;
  }
  if (n == 0) {
    perror("recvmsg returned 0");
    return -1;
  }
  cmsg = CMSG_FIRSTHDR(&msg);
  memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
  return fd;
}
static void SendFileDescriptor(int fildes, int xmit_to_rank)
{
  struct msghdr msg;
  struct iovec iov;
  struct cmsghdr *cmsg = NULL;
  char ctrl[CMSG_SPACE(sizeof(int))];
  char data = ' ';

  memset(&msg, 0, sizeof(struct msghdr));
  memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
  iov.iov_base = &data;
  iov.iov_len  = sizeof(data);

  snprintf(sock_path, sizeof(sock_path), sock_path_fmt, xmit_to_rank);

  struct sockaddr_un sa_un = { 0 };
  sa_un.sun_family = AF_UNIX;
  snprintf(sa_un.sun_path, sizeof(sa_un.sun_path), sock_path_fmt, xmit_to_rank);

  msg.msg_name       = (void *)&sa_un;
  msg.msg_namelen    = sizeof(sa_un);
  msg.msg_iov        = &iov;
  msg.msg_iovlen     = 1;
  msg.msg_controllen = CMSG_SPACE(sizeof(int));
  msg.msg_control    = ctrl;

  cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type  = SCM_RIGHTS;
  cmsg->cmsg_len   = CMSG_LEN(sizeof(int));

  // Place the descriptor in the ancillary data; the kernel dup()s it
  // into the receiving process
  *((int *)CMSG_DATA(cmsg)) = fildes;

  sendmsg(sock, &msg, 0);
}
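// Usage sketch for the pair above (illustrative only; 'my_rank', 'peer' and
// 'fd' are placeholders, not names from this file). Each rank first binds its
// own datagram socket, after which descriptors can be pushed point to point.
// Note the integer that arrives need not equal the integer sent: SCM_RIGHTS
// installs a fresh descriptor in the receiver's table.
//
//   UnixSockets::Open(my_rank);                        // bind /tmp/GridUnixSocket.<my_rank>
//   UnixSockets::SendFileDescriptor(fd, peer);         // SCM_RIGHTS transfer to rank 'peer'
//   int received = UnixSockets::RecvFileDescriptor();  // pick up a descriptor sent to us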
#ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &WorldShmComm);
  std::cout << Mheader " World communicator of size " << WorldSize << std::endl;
  MPI_Group WorldGroup, ShmGroup;

  std::vector<int> world_ranks(WorldSize);
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  std::vector<int> MyGroup;

  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];

  std::vector<int> leaders_1hot(WorldSize,0);
  leaders_1hot[myleader] = 1;

  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
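  // Leader election by one-hot allreduce: every rank sets a 1 at the world
  // rank of its group's smallest member, and the MPI_SUM reduction leaves a
  // 1 in exactly the leaders' slots on every rank. For example, four ranks
  // on two nodes with groups {0,1} and {2,3} give leaders_1hot = {1,0,1,0}.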
  leaders_group[group++] = l;

  if (myleader == leaders_group[g]){

  for(int i=0;i<=MAXLOG2;i++){
    if ( (0x1<<i) == TwoToPower ) {
  const int namelen = _POSIX_HOST_NAME_MAX;
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N);

  assert(log2size != -1);
  const int maxhdim = 10;
  std::vector<int> HyperCubeCoords(maxhdim,0);
  std::vector<int> RootHyperCubeCoords(maxhdim,0);

  const int namelen = _POSIX_HOST_NAME_MAX;
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N);
  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo;
  uint32_t rootcoor  = hypercoor;
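  // The packing follows the hostname convention parsed above ("r<R>i<I>n<N>",
  // apparently a rack/chassis/node naming scheme): rack in bits 8 and up,
  // chassis in bits 5-7, and the node number's high and low fields below
  // that. Physically adjacent nodes then differ only in the low-order bits
  // of hypercoor, which is what the hypercube embedding below exploits.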
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }

  std::string hname(name);
  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
  hypercoor = hypercoor - rootcoor;
  // Note: hypercoor is unsigned, so this assert can never fire; the
  // subtraction assumes the root's coordinate is the smallest
  assert(hypercoor >= 0);
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }
  int ndimension = processors.size();

  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  int hcoor = hypercoor;
  for(int d=0;d<ndimension;d++){
    int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
    int msk  = (0x1<<bits)-1;
    HyperCoor[d] = hcoor & msk;
    HyperCoor[d] = BinaryToGray(HyperCoor[d]); // space-filling curve ordering
    hcoor = hcoor >> bits;
  }
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }

  Lexicographic::CoorFromIndexReversed(NodeCoor, WorldNode, NodeDims);
  for(int d=0;d<ndimension;d++) NodeCoor[d] = HyperCoor[d];

  Lexicographic::CoorFromIndexReversed(ShmCoor, WorldShmRank, ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor, rank, WorldDims);
  int ierr = MPI_Comm_split(WorldComm, 0, rank, &optimal_comm);
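  // Every rank passes the same colour (0), so the split does not partition
  // the communicator; it only reorders it. Using the freshly computed 'rank'
  // as the key renumbers the world so that shared-memory neighbours are
  // innermost and hypercube neighbours land on nearby nodes.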
  int ndimension = processors.size();

  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }

  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  Lexicographic::CoorFromIndexReversed(NodeCoor, WorldNode, NodeDims);
  Lexicographic::CoorFromIndexReversed(ShmCoor, WorldShmRank, ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor, rank, WorldDims);
  int ierr = MPI_Comm_split(WorldComm, 0, rank, &optimal_comm);
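  // Same single-colour split as the hypercube path: the recomputed 'rank'
  // key makes each node's ShmDims block of ranks contiguous, so the
  // intra-node shared-memory fast path lines up with the cartesian
  // decomposition.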
#ifdef GRID_MPI3_SHMGET
  std::cout << Mheader "SharedMemoryAllocate "<< bytes << " shmget implementation "<<std::endl;
  key_t key = IPC_PRIVATE;
  int flags = IPC_CREAT | SHM_R | SHM_W;

  if ((shmids[r] = shmget(key, size, flags)) == -1) {
    int errsv = errno;
    printf("Errno %d\n", errsv);
    printf("key   %d\n", key);
    printf("size  %ld\n", size);
    printf("flags %d\n", flags);
  perror("Shared memory attach failure");
  shmctl(shmids[r], IPC_RMID, NULL);

  shmctl(shmids[r], IPC_RMID, (struct shmid_ds *)NULL);
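  // IPC_RMID marks the System V segment for destruction as soon as the last
  // attached process detaches or exits; issuing it right after attach means
  // a crashed job cannot leak segments that would otherwise need manual
  // cleanup with ipcrm.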
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)

#ifndef ACCELERATOR_AWARE_MPI
  HostCommBuf = malloc(bytes);

  if ( ShmCommBuf == (void *)NULL ) {
    std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes << " bytes " << std::endl;
  std::cout << Mheader " acceleratorAllocDevice " << bytes
            << " bytes at " << std::hex << ShmCommBuf << " - " << (bytes-1+(uint64_t)ShmCommBuf) << std::dec
            << " for comms buffers " << std::endl;

  std::cout << Mheader "Setting up IPC" << std::endl;
#ifndef GRID_MPI3_SHM_NONE

  void * thisBuf = ShmCommBuf;

#ifdef GRID_SYCL_LEVEL_ZERO_IPC
  typedef struct { int fd; pid_t pid; ze_ipc_mem_handle_t ze; } clone_mem_t;
  auto zeDevice  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
  auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());

  ze_ipc_mem_handle_t ihandle;

  auto err = zeMemGetIpcHandle(zeContext, ShmCommBuf, &ihandle);
  if ( err != ZE_RESULT_SUCCESS ) {
    std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
  memcpy((void *)&handle.fd, (void *)&ihandle, sizeof(int));
  handle.pid = getpid();
  memcpy((void *)&handle.ze, (void *)&ihandle, sizeof(ihandle));

  UnixSockets::SendFileDescriptor(handle.fd, rr);
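  // The raw descriptor number inside handle.fd is only meaningful in this
  // process; broadcasting the struct does not transfer the descriptor
  // itself. The socket send above (SCM_RIGHTS) is what makes the kernel
  // install an equivalent descriptor in each receiving rank.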
  cudaIpcMemHandle_t handle;

  auto err = cudaIpcGetMemHandle(&handle, ShmCommBuf);
  if ( err != cudaSuccess) {
    std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank " << r << " " << cudaGetErrorString(err) << std::endl;
  hipIpcMemHandle_t handle;

  auto err = hipIpcGetMemHandle(&handle, ShmCommBuf);
  if ( err != hipSuccess) {
    std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank " << r << " " << hipGetErrorString(err) << std::endl;
  int ierr=MPI_Bcast(&handle, sizeof(handle), MPI_BYTE, r, WorldShmComm);
#ifdef GRID_SYCL_LEVEL_ZERO_IPC

  myfd = UnixSockets::RecvFileDescriptor();

  int pidfd = syscall(SYS_pidfd_open, handle.pid, 0);

  myfd = syscall(438, pidfd, handle.fd, 0); // 438 == SYS_pidfd_getfd
  fprintf(stderr, "pidfd_getfd returned %d errno was %d\n", myfd, err_t); fflush(stderr);
  perror("pidfd_getfd failed ");
  // Rebuild the Level Zero handle: start from the broadcast copy, then
  // patch in the locally valid descriptor obtained above
  memcpy((void *)&ihandle, (void *)&handle.ze, sizeof(ihandle));
  memcpy((void *)&ihandle, (void *)&myfd, sizeof(int));
  auto err = zeMemOpenIpcHandle(zeContext, zeDevice, ihandle, 0, &thisBuf);
  if ( err != ZE_RESULT_SUCCESS ) {
    std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
    std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
  }
  assert(thisBuf != nullptr);
  auto err = cudaIpcOpenMemHandle(&thisBuf, handle, cudaIpcMemLazyEnablePeerAccess);
  if ( err != cudaSuccess) {
    std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank " << r << " " << cudaGetErrorString(err) << std::endl;
  auto err = hipIpcOpenMemHandle(&thisBuf, handle, hipIpcMemLazyEnablePeerAccess);
  if ( err != hipSuccess) {
    std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank " << r << " " << hipGetErrorString(err) << std::endl;
#ifdef GRID_MPI3_SHMMMAP
  std::cout << Mheader "SharedMemoryAllocate "<< bytes << " MMAP implementation "<< GRID_SHM_PATH << std::endl;
  char shm_name[NAME_MAX];

  int fd = open(shm_name, O_RDWR|O_CREAT, 0666);
  if ( fd == -1 ) {
    printf("open %s failed\n", shm_name);
    perror("open hugetlbfs");
    exit(0);
  }
  int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
  mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
  void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
  if ( ptr == (void *)MAP_FAILED ) {
    printf("mmap %s failed\n", shm_name);
    perror("failed mmap");
    assert(0);
  }
  assert(((uint64_t)ptr&0x3F)==0);
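  // mmap returns page-aligned addresses, so the low six bits are zero
  // whenever the call succeeded; the assert is a cheap guard that the buffer
  // really is at least 64-byte (cache-line) aligned before it is used for
  // comms.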
  std::cout << Mheader " Intra-node IPC setup is complete " << std::endl;
#ifdef GRID_MPI3_SHM_NONE
  std::cout << Mheader "SharedMemoryAllocate "<< bytes << " MMAP anonymous implementation "<<std::endl;
  char shm_name[NAME_MAX];

  int fd = -1; // MAP_ANONYMOUS ignores the descriptor
  int mmap_flag = MAP_SHARED|MAP_ANONYMOUS;
#ifdef MAP_POPULATE
  mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
  void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
  if ( ptr == (void *)MAP_FAILED ) {
    printf("mmap %s failed\n", shm_name);
    perror("failed mmap");
    assert(0);
  }
  assert(((uint64_t)ptr&0x3F)==0);
#ifdef GRID_MPI3_SHMOPEN
  std::cout << Mheader "SharedMemoryAllocate "<< bytes << " SHMOPEN implementation "<<std::endl;
  char shm_name[NAME_MAX];

  struct passwd *pw = getpwuid(getuid());
  snprintf(shm_name, NAME_MAX, "/Grid_%s_mpi3_shm_%d_%d", pw->pw_name, WorldNode, r);

  shm_unlink(shm_name);
  int fd = shm_open(shm_name, O_RDWR|O_CREAT, 0666);
  if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
  int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
  mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
  if (flags) mmap_flag |= MAP_HUGETLB;
#endif
  void * ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

  if ( ptr == (void *)MAP_FAILED ) {
    perror("failed mmap");
  }

  assert(((uint64_t)ptr&0x3F)==0);
  size_t size = bytes;

  char shm_name[NAME_MAX];
  struct passwd *pw = getpwuid(getuid());
  snprintf(shm_name, NAME_MAX, "/Grid_%s_mpi3_shm_%d_%d", pw->pw_name, WorldNode, r);

  int fd = shm_open(shm_name, O_RDWR, 0666);
  if ( fd < 0 ) { perror("failed shm_open"); assert(0); }

  void * ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
  assert(((uint64_t)ptr&0x3F)==0);
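  // Two-phase POSIX shm pattern: each rank first shm_open()s its own segment
  // with O_CREAT (above), and after synchronisation every rank re-opens each
  // peer's segment by name without O_CREAT and maps it, leaving all ranks
  // with read-write views of every node-local buffer.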
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
#ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &ShmComm);
#else
  MPI_Comm_split(comm, rank, 0, &ShmComm);
#endif

  MPI_Allreduce(MPI_IN_PLACE, &wsr, 1, MPI_UINT32_T, MPI_SUM, ShmComm);
#ifndef ACCELERATOR_AWARE_MPI
  HostCommBuf = GlobalSharedMemory::HostCommBuf;
  MPI_Group FullGroup, ShmGroup;
  MPI_Comm_group(comm, &FullGroup);
  MPI_Comm_group(ShmComm, &ShmGroup);

  std::vector<int> ranks(size);
  for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks(FullGroup, size, &ranks[0], ShmGroup, &ShmRanks[0]);
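  // MPI_Group_translate_ranks maps each world rank into the shared-memory
  // group; ranks living on other nodes come back as MPI_UNDEFINED, which is
  // exactly the sentinel tested below to choose between the shared-memory
  // fast path and ordinary MPI point-to-point.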
#ifdef GRID_SHM_FORCE_MPI
  for(int r=0;r<size;r++){
  // Two passes over the node's buffers: each rank stamps a pattern, then
  // every rank verifies what its peers wrote
  uint64_t magic = 0x5A5A5A;

  for(uint64_t r=0;r<ShmSize;r++){

  for(uint64_t r=0;r<ShmSize;r++){

  assert(check[2]==magic);
  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
  if (gpeer == MPI_UNDEFINED){

  if (gpeer == MPI_UNDEFINED){

  uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;

  return (void *) remote;
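  // Translation is pure address arithmetic: the offset (computed from
  // local_p relative to this rank's own buffer) is added to the peer's
  // mapped base. This works because each peer's buffer is mapped
  // contiguously at ShmCommBufs[gpeer] with identical layout on every rank.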
  int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
  if ( !MPI_is_finalised ) {
// Cross-referenced declarations used by this file (from the surrounding
// Grid headers):
//
//   void *acceleratorAllocDevice(size_t bytes);
//   void acceleratorCopyToDevice(void *from, void *to, size_t bytes);
//   void acceleratorMemSet(void *base, int value, size_t bytes);
//   void acceleratorCopyFromDevice(void *from, void *to, size_t bytes);
//   AcceleratorVector<int, MaxDims> Coordinate;
//   Out accelerator_inline binary(Input1 src_1, Input2 src_2, Operation op);
//   GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
//   #define NAMESPACE_BEGIN(A)
//   int BinaryToGray(int binary);
//   int Log2Size(int TwoToPower, int MAXLOG2);
//   accelerator_inline size_type size(void) const;
//   static void SharedMemoryAllocate(uint64_t bytes, int flags);
//   static const int MAXLOG2RANKSPERNODE;
//   static void OptimalCommunicatorHypercube(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims);
//   static Grid_MPI_Comm WorldComm;
//   static uint64_t _ShmAllocBytes;
//   static std::vector<int> WorldShmRanks;
//   static uint64_t ShmAllocBytes(void);
//   static void Init(Grid_MPI_Comm comm);
//   static void GetShmDims(const Coordinate &WorldDims, Coordinate &ShmDims);
//   static void SharedMemoryZero(void *dest, size_t bytes);
//   static void OptimalCommunicatorSharedMemory(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims);
//   static std::vector<void *> WorldShmCommBufs;
//   static int ShmAlloc(void);
//   static Grid_MPI_Comm WorldShmComm;
//   static void OptimalCommunicator(const Coordinate &processors, Grid_MPI_Comm &optimal_comm, Coordinate &ShmDims);
//   void *ShmBufferTranslate(int rank, void *local_p);
//   std::vector<int> ShmRanks;
//   void *ShmBuffer(int rank);
//   void ShmBufferFreeAll(void);
//   void SharedMemoryTest(void);
//   void SetCommunicator(Grid_MPI_Comm comm);
//   std::vector<void *> ShmCommBufs;