34template<
typename vobj>
39template<
typename vobj>
43template<
typename Gimpl>
45 typename Gimpl::GaugeLinkField
Cshift(
const typename Gimpl::GaugeLinkField &in,
int dir,
int shift)
const override{
return Gimpl::CshiftLink(in,dir,shift); }
63 const int Nsimd=vobj::Nsimd();
64 typedef typename vobj::scalar_object sobj;
66 typedef typename vobj::vector_type vector_type;
79 int isites = 1;
for(
int d=0;d<
Nd;d++)
if( d!=dim) isites*=simd[d];
83 int rNsimd = 1;
for(
int d=0;d<
Nd;d++) rNsimd*=rsimd[d];
84 int rNsimda= Nsimd/simd[dim];
85 assert(rNsimda==rNsimd);
86 int face_ovol=block*nblock;
95 auto buf_p = & buf[0];
104 for(
int blane=0;blane<Nsimd;blane++) {
106 int olane=blane%rNsimd;
107 int obit =blane/rNsimd;
112 int ssp = ss*simd[dim]+obit;
115 int osite= b+n*stride + ox*block;
122 Lexicographic::CoorFromIndex(icoor,olane,rsimd);
124 Lexicographic::IndexFromCoor(icoor,lane,simd);
131 const int words=
sizeof(vobj)/
sizeof(vector_type);
132 vector_type * from = (vector_type *)&buf_p[ss+offset];
133 vector_type * to = (vector_type *)&lat_v[osite];
135 for(
int w=0;w<words;w++){
136 stmp =
getlane(from[w], blane);
149 const int Nsimd=vobj::Nsimd();
150 typedef typename vobj::scalar_object sobj;
152 typedef typename vobj::vector_type vector_type;
167 int isites = 1;
for(
int d=0;d<
Nd;d++)
if( d!=dim) isites*=simd[d];
171 int rNsimd = 1;
for(
int d=0;d<
Nd;d++) rNsimd*=rsimd[d];
173 int face_ovol=block*nblock;
182 auto buf_p = & buf[0];
190 for(
int blane=0;blane<Nsimd;blane++) {
192 int olane=blane%rNsimd;
193 int obit =blane/rNsimd;
198 int ssp = ss*simd[dim]+obit;
201 int osite= b+n*stride + ox*block;
208 Lexicographic::CoorFromIndex(icoor,olane,rsimd);
210 Lexicographic::IndexFromCoor(icoor,lane,simd);
217 const int words=
sizeof(vobj)/
sizeof(vector_type);
218 vector_type * to = (vector_type *)&buf_p[ss+offset];
219 vector_type * from = (vector_type *)&lat_v[osite];
221 for(
int w=0;w<words;w++){
249 for(
int d=0;d<
dims;d++){
256 for(
int d=0;d<
grids.size();d++){
257 if ( processors[d] > 1 ) {
272 for(
int d=0;d<
dims;d++){
274 if ( processors[d] > 1 ) {
277 for(
int d=0;d<
dims;d++){
278 global[d] =
plocal[d]*processors[d];
283 grids.push_back(old_grid);
297 for(
int d=0;d<
dims;d++){
298 if( processors[d]==1 ) fll[d]=0;
307 int dims = old_grid->
Nd();
309 for(
int d=0;d<
dims;d++){
310 tmp =
Expand(d,tmp,cshift);
318 int dims = old_grid->
Nd();
320 for(
int d=0;d<
dims;d++){
339 double tins=0, tshift=0;
342 if ( processors[dim] == 1 ) islocal = 1;
365 for(
int x=0;x<
local[dim];x++){
372 shifted = cshift.Cshift(in,dim,
depth);
376 for(
int x=0;x<
depth;x++){
383 shifted = cshift.Cshift(in,dim,-
depth);
387 for(
int x=0;x<
depth;x++){
393 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand timings: cshift:" << tshift/1000 <<
"ms, insert-slice:" << tins/1000 <<
"ms" << std::endl;
412 double tins=0, tshift=0;
415 if ( processors[dim] == 1 ) islocal = 1;
428 int dimension,
int depth)
const
430 typedef typename vobj::vector_type vector_type;
432 typedef typename vobj::scalar_object sobj;
447 int ld = lds[dimension];
449 const int Nsimd = vobj::Nsimd();
451 assert(
depth<=lds[dimension]);
453 assert(ld+2*
depth==nld);
458 for(
int d=0;d<lds.
size();d++){
459 if ( d!= dimension) buffer_size=buffer_size*lds[d];
461 buffer_size = buffer_size / Nsimd;
462 int rNsimd = Nsimd / simd[dimension];
467 send_buf.resize(buffer_size*2*
depth);
468 recv_buf.resize(buffer_size*2*
depth);
469#ifndef ACCELERATOR_AWARE_MPI
472 hsend_buf.resize(buffer_size*2*
depth);
473 hrecv_buf.resize(buffer_size*2*
depth);
476 std::vector<MpiCommsRequest_t> fwd_req;
477 std::vector<MpiCommsRequest_t> bwd_req;
479 int words = buffer_size;
480 int bytes = words *
sizeof(vobj);
488 grid->
ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
496 for (
int d=0;d <
depth ; d ++ ) {
497 int tag = d*1024 + dimension*2+0;
500 GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
504#ifdef ACCELERATOR_AWARE_MPI
506 (
void *)&send_buf[d*buffer_size], xmit_to_rank,
507 (
void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
511 (
void *)&hsend_buf[d*buffer_size], xmit_to_rank,
512 (
void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
516 for (
int d=0;d <
depth ; d ++ ) {
517 int tag = d*1024 + dimension*2+1;
524#ifdef ACCELERATOR_AWARE_MPI
526 (
void *)&send_buf[(d+
depth)*buffer_size], recv_from_rank,
527 (
void *)&recv_buf[(d+
depth)*buffer_size], xmit_to_rank, bytes,tag);
531 (
void *)&hsend_buf[(d+
depth)*buffer_size], recv_from_rank,
532 (
void *)&hrecv_buf[(d+
depth)*buffer_size], xmit_to_rank, bytes,tag);
540 int Nd = new_grid->
Nd();
544 toLL[dimension]=
depth;
556#ifndef ACCELERATOR_AWARE_MPI
557 for (
int d=0;d <
depth ; d ++ ) {
564 for (
int d=0;d <
depth ; d ++ ) {
571#ifndef ACCELERATOR_AWARE_MPI
572 for (
int d=0;d <
depth ; d ++ ) {
579 for (
int d=0;d <
depth ; d ++ ) {
580 ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
585 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: gather :" << t_gather/1000 <<
"ms"<<std::endl;
586 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: scatter:" << t_scatter/1000 <<
"ms"<<std::endl;
587 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: copy :" << t_copy/1000 <<
"ms"<<std::endl;
588 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: comms :" << t_comms/1000 <<
"ms"<<std::endl;
589 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: total :" << t_tot/1000 <<
"ms"<<std::endl;
590 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: gather :" <<
depth*4.0*bytes/t_gather <<
"MB/s"<<std::endl;
591 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: scatter:" <<
depth*4.0*bytes/t_scatter<<
"MB/s"<<std::endl;
592 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: comms :" << (
RealD)4.0*bytes/t_comms <<
"MB/s"<<std::endl;
593 std::cout <<
GridLogPerformance <<
"PaddedCell::Expand new timings: face bytes :" <<
depth*bytes/1e6 <<
"MB"<<std::endl;
accelerator_inline int acceleratorSIMTlane(int Nsimd)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
std::vector< T, alignedAllocator< T > > hostVector
AcceleratorVector< int, MaxDims > Coordinate
accelerator_inline S getlane(const Grid_simd< S, V > &in, int lane)
accelerator_inline void putlane(Grid_simd< S, V > &vec, const S &_S, int lane)
void localCopyRegion(const Lattice< vobj > &From, Lattice< vobj > &To, Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
void InsertSliceLocal(const Lattice< vobj > &lowDim, Lattice< vobj > &higherDim, int slice_lo, int slice_hi, int orthog)
#define autoView(l_v, l, mode)
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN")
#define NAMESPACE_BEGIN(A)
void ScatterSlice(const deviceVector< vobj > &buf, Lattice< vobj > &lat, int x, int dim, int offset=0)
void GatherSlice(deviceVector< vobj > &buf, const Lattice< vobj > &lat, int x, int dim, int offset=0)
accelerator_inline size_type size(void) const
void CommsComplete(std::vector< MpiCommsRequest_t > &list)
void SendToRecvFromBegin(std::vector< MpiCommsRequest_t > &list, void *xmit, int dest, void *recv, int from, int bytes, int dir)
void ShiftedRanks(int dim, int shift, int &source, int &dest)
const Coordinate & LocalDimensions(void)
GridBase * Grid(void) const
void Face_exchange(const Lattice< vobj > &from, Lattice< vobj > &to, int dimension, int depth) const
Lattice< vobj > Extract(const Lattice< vobj > &in) const
std::vector< GridCartesian * > grids
Lattice< vobj > ExchangePeriodic(const Lattice< vobj > &in) const
Lattice< vobj > Exchange(const Lattice< vobj > &in, const CshiftImplBase< vobj > &cshift=CshiftImplDefault< vobj >()) const
GridCartesian * unpadded_grid
PaddedCell(int _depth, GridCartesian *_grid)
Lattice< vobj > Expand(int dim, const Lattice< vobj > &in, const CshiftImplBase< vobj > &cshift=CshiftImplDefault< vobj >()) const
Lattice< vobj > ExpandPeriodic(int dim, const Lattice< vobj > &in) const
virtual ~CshiftImplBase()
virtual Lattice< vobj > Cshift(const Lattice< vobj > &in, int dir, int shift) const =0
Lattice< vobj > Cshift(const Lattice< vobj > &in, int dir, int shift) const override
Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override