29#ifndef _GRID_CSHIFT_MPI_H_
30#define _GRID_CSHIFT_MPI_H_
37 typedef typename vobj::vector_type vector_type;
46 shift = (shift+fd)%fd;
60 }
else if ( splice_dim ) {
80 if ( sshift[0] == sshift[1] ) {
98 if ( sshift[0] == sshift[1] ) {
109 typedef typename vobj::vector_type vector_type;
120 assert(simd_layout==1);
128#ifndef ACCELERATOR_AWARE_MPI
133 int cb= (cbmask==0x2)?
Odd :
Even;
146 for(
int x=0;x<rd;x++){
148 int sx = (x+sshift)%rd;
149 int comm_proc = ((x+sshift)/rd)%pd;
159 int words = buffer_size;
160 if (cbmask != 0x3) words=words>>1;
162 int bytes = words *
sizeof(vobj);
174 grid->
ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
180#ifdef ACCELERATOR_AWARE_MPI
183 (
void *)&recv_buf[0],
191 (
void *)&hrecv_buf[0],
211 std::cout <<
GridLogPerformance <<
" Cshift scatter "<<tscatter/1e3<<
" ms"<<std::endl;
213 std::cout <<
GridLogPerformance <<
" Cshift BW "<<(2.0*xbytes)/tcomms<<
" MB/s "<<2*xbytes<<
" Bytes "<<std::endl;
220 const int Nsimd = grid->
Nsimd();
221 typedef typename vobj::vector_type vector_type;
222 typedef typename vobj::scalar_object scalar_object;
237 assert(simd_layout==2);
255 static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.
resize(Nsimd);
256 static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
257 scalar_object * recv_buf_extract_mpi;
258 scalar_object * send_buf_extract_mpi;
260 for(
int s=0;s<Nsimd;s++){
261 send_buf_extract[s].resize(buffer_size);
262 recv_buf_extract[s].resize(buffer_size);
264#ifndef ACCELERATOR_AWARE_MPI
269 int bytes = buffer_size*
sizeof(scalar_object);
277 int cb = (cbmask==0x2)?
Odd :
Even;
281 for(
int x=0;x<rd;x++){
284 for(
int i=0;i<Nsimd;i++){
285 pointers[i] = &send_buf_extract[i][0];
287 int sx = (x+sshift)%rd;
292 for(
int i=0;i<Nsimd;i++){
294 int inner_bit = (Nsimd>>(permute_type+1));
295 int ic= (i&inner_bit)? 1:0;
297 int my_coor = rd*ic + x;
298 int nbr_coor = my_coor+sshift;
299 int nbr_proc = ((nbr_coor)/ld) % pd;
301 int nbr_ic = (nbr_coor%ld)/rd;
302 int nbr_ox = (nbr_coor%rd);
303 int nbr_lane = (i&(~inner_bit));
308 if (nbr_ic) nbr_lane|=inner_bit;
310 assert (sx == nbr_ox);
313 grid->
ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
318 send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
319 recv_buf_extract_mpi = &recv_buf_extract[i][0];
320#ifdef ACCELERATOR_AWARE_MPI
323 (
void *)recv_buf_extract_mpi,
331 (
void *)&hrecv_buf[0],
341 rpointers[i] = &recv_buf_extract[i][0];
343 rpointers[i] = &send_buf_extract[nbr_lane][0];
353 std::cout <<
GridLogPerformance <<
" Cshift (s) gather "<<tgather/1e3<<
" ms"<<std::endl;
354 std::cout <<
GridLogPerformance <<
" Cshift (s) scatter "<<tscatter/1e3<<
" ms"<<std::endl;
356 std::cout <<
GridLogPerformance <<
" Cshift BW "<<(2.0*xbytes)/tcomms<<
" MB/s "<<2*xbytes<<
" Bytes "<<std::endl;
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
std::vector< T, alignedAllocator< T > > hostVector
void Copy_plane(Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int dimension, int lplane, int rplane, int cbmask)
void Gather_plane_extract(const Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
std::vector< int > Cshift_vector
void CalculateCshiftVector(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int cbmask)
void Scatter_plane_simple(Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask)
void Scatter_plane_merge(Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
void MapCshiftCopy(std::vector< vobj > &Cshift_obj, deviceVector< vobj > &Cshift_obj_device)
void Cshift_local(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
deviceVector< int > Cshift_vector_device
void Gather_plane_simple(const Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask, int off=0)
Lattice< vobj > Cshift(const Lattice< vobj > &rhs, int dimension, int shift)
void Cshift_comms_simd(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
void Cshift_comms(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN")
#define NAMESPACE_BEGIN(A)
accelerator_inline void resize(size_type sz)
void SendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, int recv_from_rank, int bytes)
void ShiftedRanks(int dim, int shift, int &source, int &dest)
static bool StepLog(const char *name)
int PermuteType(int dimension)
virtual int CheckerBoardDestination(int source_cb, int shift, int dim)=0
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int cb)=0
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const