31#define STENCIL_MAX (16)
70 int off,std::vector<std::pair<int,int> > & table);
79void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
99template<
class vobj,
class cobj,
class Parameters>
140 StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
144 ptype = this->_permute_type[point];
145 return & this->_entries_p[point+this->_npoints*osite];
152 if (
perm)
ptype = this->_permute_type[point];
169 Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
173template<
class vobj,
class cobj,
class Parameters>
200template<
class vobj,
class cobj,
class Parameters>
226 static constexpr int Nsimd = vobj::Nsimd();
236 static constexpr int Nsimd = vobj::Nsimd();
350 int pd =
_grid->_processors[dimension];
351 int fd =
_grid->_fdimensions[dimension];
352 int ld =
_grid->_ldimensions[dimension];
353 int rd =
_grid->_rdimensions[dimension];
354 int simd_layout =
_grid->_simd_layout[dimension];
355 int comm_dim =
_grid->_processors[dimension] >1 ;
360 if ( ! comm_dim )
return 1;
361 if ( displacement == 0 )
return 1;
381 if ( ! comm_dim )
return 1;
384 if (displacement>0) nbr_proc = 1;
385 else nbr_proc = pd-1;
394 if ( shm==NULL )
return 0;
410 if ( packet.do_recv &&
_grid->IsOffNode(packet.from_rank) ) {
413 uint64_t words = packet.rbytes/
sizeof(word);
414 const int nsimd =
sizeof(
typename cobj::vector_type)/
sizeof(word);
415 const uint64_t outer = words/nsimd;
417 if(
sizeof(word)==8) {
422 double *dbuf =(
double *) packet.recv_buf;
423 float *fbuf =(
float *) packet.compressed_recv_buf;
427 dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane];
430 }
else if (
sizeof(word)==4){
435 uint32_t *fbuf =(uint32_t *) packet.recv_buf;
436 uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
440 fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16;
444 assert(0 &&
"unknown floating point precision");
450 packet.xbytes_compressed = packet.xbytes;
451 packet.compressed_send_buf = packet.send_buf;
453 packet.rbytes_compressed = packet.rbytes;
454 packet.compressed_recv_buf = packet.recv_buf;
461 uint64_t words = packet.xbytes/
sizeof(word);
462 const int nsimd =
sizeof(
typename cobj::vector_type)/
sizeof(word);
463 const uint64_t outer = words/nsimd;
465 if (packet.do_recv &&
_grid->IsOffNode(packet.from_rank) ) {
467 packet.rbytes_compressed = packet.rbytes/2;
476 if (packet.do_send &&
_grid->IsOffNode(packet.to_rank) ) {
478 packet.xbytes_compressed = packet.xbytes/2;
482 if(
sizeof(word)==8) {
484 double *dbuf =(
double *) packet.send_buf;
485 float *fbuf =(
float *) packet.compressed_send_buf;
489 fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane];
492 }
else if (
sizeof(word)==4){
494 uint32_t *fbuf =(uint32_t *) packet.send_buf;
495 uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
499 hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16;
503 assert(0 &&
"unknown floating point precision");
520 _grid->StencilBarrier();
526 for(
int i=0;i<
Packets.size();i++){
532 _grid->StencilBarrier();
536 for(
int i=0;i<
Packets.size();i++){
540 Packets[i].compressed_send_buf,
542 Packets[i].compressed_recv_buf,
553 for(
int i=0;i<
Packets.size();i++){
568 for(
int i=0;i<
Packets.size();i++){
590 for(
int i=0;i<
Packets.size();i++){
605 std::vector<std::vector<CommsRequest_t> > reqs;
625 int fd =
_grid->_fdimensions[dimension];
626 int rd =
_grid->_rdimensions[dimension];
629 int shift = (displacement+fd)%fd;
634 int simd_layout =
_grid->_simd_layout[dimension];
635 int comm_dim =
_grid->_processors[dimension] >1 ;
636 int splice_dim =
_grid->_simd_layout[dimension]>1 && (comm_dim);
638 int is_same_node = 1;
644 if ( sshift[0] == sshift[1] ) {
646 auto tmp =
GatherSimd(source,dimension,shift,0x3,compress,face_idx,point);
647 is_same_node = is_same_node && tmp;
649 auto tmp =
Gather(source,dimension,shift,0x3,compress,face_idx,point);
650 is_same_node = is_same_node && tmp;
656 auto tmp1 =
GatherSimd(source,dimension,shift,0x1,compress,face_idx,point);
657 auto tmp2 =
GatherSimd(source,dimension,shift,0x2,compress,face_idx,point);
658 is_same_node = is_same_node && tmp1 && tmp2;
660 auto tmp1 =
Gather(source,dimension,shift,0x1,compress,face_idx,point);
661 auto tmp2 =
Gather(source,dimension,shift,0x2,compress,face_idx,point);
662 is_same_node = is_same_node && tmp1 && tmp2;
669 template<
class compressor>
676 _grid->StencilBarrier();
684 for(
int point = 0 ; point < this->
_npoints; point++) {
685 compress.Point(point);
690 _grid->StencilBarrier();
714 CopyReceiveBuffer obj;
741 obj.direction = direction;
742 obj.OrthogPlane = OrthogPlane;
743 obj.DestProc = DestProc;
744 obj.recv_buf = recv_buf;
789 d.dims =
_grid->_fdimensions;
792 d.buffer_size = buffer_size;
798 m.dims =
_grid->_fdimensions;
800 m.mpointer = merge_p;
801 m.vpointers= rpointers;
802 m.buffer_size = buffer_size;
805 template<
class decompressor>
void CommsMerge(decompressor decompress) {
814 template<
class decompressor>
815 void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
817 for(
int i=0;i<mm.size();i++){
818 decompressor::MergeFace(decompress,mm[i]);
820 for(
int i=0;i<dd.size();i++){
821 decompressor::DecompressFace(decompress,dd[i]);
830 if( this->_entries[i]._is_local ) {
831 this->_entries[i]._byte_offset = this->_entries[i]._offset*
sizeof(vobj);
833 this->_entries[i]._byte_offset = this->_entries[i]._offset*
sizeof(cobj);
844 for(
int point=0;point<this->
_npoints;point++){
847 int32_t surface_list_size=0;
848 for(
int site = 0 ;site< vol4;site++){
850 for(
int point=0;point<this->_npoints;point++){
856 for(
int s=0;s<Ls;s++){
863 std::vector<int> surface_list_host(surface_list_size);
865 for(
int site = 0 ;site< vol4;site++){
867 for(
int point=0;point<this->_npoints;point++){
873 for(
int s=0;s<Ls;s++){
875 surface_list_host[ss]= idx;
886 for(
int ii=0;ii<this->
_npoints;ii++){
889 int gd =
_grid->_gdimensions[dimension];
890 int fd =
_grid->_fdimensions[dimension];
891 int pd =
_grid->_processors [dimension];
892 int pc =
_grid->_processor_coor[dimension];
898 int comm_dim =
_grid->_processors[dimension] >1 ;
899 int block = dirichlet_block[dimension];
904 if ( block && comm_dim ) {
905 assert(
abs(displacement) < ld );
907 if( displacement > 0 ) {
913 if ( ( (ld*(pc+1) ) % block ) == 0 ) this->
_comms_recv[ii] = 0;
914 if ( ( (ld*pc ) % block ) == 0 ) this->
_comms_send[ii] = 0;
921 if ( ( (ld*(pc+1) ) % block ) == 0 ) this->
_comms_send[ii] = 0;
922 if ( ( (ld*pc ) % block ) == 0 ) this->
_comms_recv[ii] = 0;
934 const std::vector<int> &directions,
935 const std::vector<int> &distances,
936 Parameters p=Parameters(),
937 bool preserve_shm=
false)
956 if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->
Nd(),0);
960 for(
int d=0;d<p.dirichlet.size();d++){
977 for(
int ii=0;ii<npoints;ii++){
982 int dimension = directions[i];
983 int displacement = distances[i];
984 int shift = displacement;
986 int gd =
_grid->_gdimensions[dimension];
987 int fd =
_grid->_fdimensions[dimension];
988 int pd =
_grid->_processors [dimension];
990 int rd =
_grid->_rdimensions[dimension];
991 int pc =
_grid->_processor_coor[dimension];
996 int simd_layout =
_grid->_simd_layout[dimension];
997 int comm_dim =
_grid->_processors[dimension] >1 ;
998 int splice_dim =
_grid->_simd_layout[dimension]>1 && (comm_dim);
999 int rotate_dim =
_grid->_simd_layout[dimension]>2;
1001 assert ( (rotate_dim && comm_dim) ==
false) ;
1013 if ( sshift[0] == sshift[1] ) {
1014 Local(point,dimension,shift,0x3);
1016 Local(point,dimension,shift,0x1);
1017 Local(point,dimension,shift,0x2);
1024 if ( sshift[0] == sshift[1] ) {
1025 Comms(point,dimension,shift,0x3);
1027 Comms(point,dimension,shift,0x1);
1028 Comms(point,dimension,shift,0x2);
1036 const int Nsimd = grid->
Nsimd();
1040 _grid->ShmBufferFreeAll();
1048 for(
int l=0;l<maxl;l++){
1056 void Local (
int point,
int dimension,
int shiftpm,
int cbmask)
1058 int fd =
_grid->_fdimensions[dimension];
1059 int rd =
_grid->_rdimensions[dimension];
1060 int ld =
_grid->_ldimensions[dimension];
1061 int gd =
_grid->_gdimensions[dimension];
1062 int ly =
_grid->_simd_layout[dimension];
1065 int shift = (shiftpm+fd)%fd;
1068 int permute_dim =
_grid->PermuteDim(dimension);
1070 for(
int x=0;x<rd;x++){
1073 int bo = x *
_grid->_ostride[dimension];
1075 int cb= (cbmask==0x2)?
Odd :
Even;
1078 int sx = (x+sshift)%rd;
1081 if ( (shiftpm==-1) && (sx>x) ) {
1084 if ( (shiftpm== 1) && (sx<x) ) {
1088 int permute_slice=0;
1090 int wrap = sshift/rd; wrap=wrap % ly;
1091 int num = sshift%rd;
1092 if ( x< rd-num ) permute_slice=wrap;
1093 else permute_slice = (wrap+1)%ly;
1096 CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
1101 void Comms (
int point,
int dimension,
int shiftpm,
int cbmask)
1104 const int Nsimd = grid->
Nsimd();
1108 int fd =
_grid->_fdimensions[dimension];
1109 int ld =
_grid->_ldimensions[dimension];
1110 int rd =
_grid->_rdimensions[dimension];
1111 int pd =
_grid->_processors[dimension];
1112 int simd_layout =
_grid->_simd_layout[dimension];
1113 int comm_dim =
_grid->_processors[dimension] >1 ;
1115 assert(comm_dim==1);
1116 int shift = (shiftpm + fd) %fd;
1121 int buffer_size =
_grid->_slice_nblock[dimension]*
_grid->_slice_block[dimension];
1127 int cb= (cbmask==0x2)?
Odd :
Even;
1130 for(
int x=0;x<rd;x++){
1135 int sx = (x+sshift)%rd;
1138 if ( simd_layout > 1 ) {
1141 for(
int i=0;i<Nsimd;i++){
1143 int inner_bit = (Nsimd>>(permute_type+1));
1144 int ic= (i&inner_bit)? 1:0;
1145 int my_coor = rd*ic + x;
1146 int nbr_coor = my_coor+sshift;
1147 int nbr_proc = ((nbr_coor)/ld) % pd;
1155 int comm_proc = ((x+sshift)/rd)%pd;
1156 offnode = (comm_proc!= 0);
1161 if ( (shiftpm==-1) && (sx>x) && (grid->
_processor_coor[dimension]==0) ) {
1172 CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
1182 CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
1189 int words = buffer_size;
1190 if (cbmask != 0x3) words=words>>1;
1198 int rd =
_grid->_rdimensions[dimension];
1200 if ( !
_grid->CheckerBoarded(dimension) ) {
1203 int ro = rplane*
_grid->_ostride[dimension];
1204 int lo = lplane*
_grid->_ostride[dimension];
1207 for(
int n=0;n<
_grid->_slice_nblock[dimension];n++){
1208 for(
int b=0;b<
_grid->_slice_block[dimension];b++){
1209 int idx=point+(lo+o+b)*this->
_npoints;
1210 this->_entries[idx]._offset =ro+o+b;
1211 this->_entries[idx]._permute=
permute;
1212 this->_entries[idx]._is_local=1;
1213 this->_entries[idx]._around_the_world=wrap;
1215 o +=
_grid->_slice_stride[dimension];
1220 int ro = rplane*
_grid->_ostride[dimension];
1221 int lo = lplane*
_grid->_ostride[dimension];
1224 for(
int n=0;n<
_grid->_slice_nblock[dimension];n++){
1225 for(
int b=0;b<
_grid->_slice_block[dimension];b++){
1227 int ocb=1<<
_grid->CheckerBoardFromOindex(o+b);
1230 int idx = point+(lo+o+b)*this->
_npoints;
1231 this->_entries[idx]._offset =ro+o+b;
1232 this->_entries[idx]._is_local=1;
1233 this->_entries[idx]._permute=
permute;
1234 this->_entries[idx]._around_the_world=wrap;
1238 o +=
_grid->_slice_stride[dimension];
1244 void ScatterPlane (
int point,
int dimension,
int plane,
int cbmask,
int offset,
int wrap)
1246 int rd =
_grid->_rdimensions[dimension];
1248 if ( !
_grid->CheckerBoarded(dimension) ) {
1250 int so = plane*
_grid->_ostride[dimension];
1255 for(
int n=0;n<
_grid->_slice_nblock[dimension];n++){
1256 for(
int b=0;b<
_grid->_slice_block[dimension];b++){
1257 int idx=point+(so+o+b)*this->
_npoints;
1258 this->_entries[idx]._offset =offset+(bo++);
1259 this->_entries[idx]._is_local=0;
1260 this->_entries[idx]._permute=0;
1261 this->_entries[idx]._around_the_world=wrap;
1263 o +=
_grid->_slice_stride[dimension];
1268 int so = plane*
_grid->_ostride[dimension];
1272 for(
int n=0;n<
_grid->_slice_nblock[dimension];n++){
1273 for(
int b=0;b<
_grid->_slice_block[dimension];b++){
1275 int ocb=1<<
_grid->CheckerBoardFromOindex(o+b);
1276 if ( ocb & cbmask ) {
1277 int idx = point+(so+o+b)*this->
_npoints;
1278 this->_entries[idx]._offset =offset+(bo++);
1279 this->_entries[idx]._is_local=0;
1280 this->_entries[idx]._permute =0;
1281 this->_entries[idx]._around_the_world=wrap;
1284 o +=
_grid->_slice_stride[dimension];
1289 template<
class compressor>
1290 int Gather(
const Lattice<vobj> &rhs,
int dimension,
int shift,
int cbmask,compressor & compress,
int &face_idx,
int point)
1302 int fd =
_grid->_fdimensions[dimension];
1303 int rd =
_grid->_rdimensions[dimension];
1304 int pd =
_grid->_processors[dimension];
1305 int simd_layout =
_grid->_simd_layout[dimension];
1306 int comm_dim =
_grid->_processors[dimension] >1 ;
1307 assert(simd_layout==1);
1308 assert(comm_dim==1);
1312 int buffer_size =
_grid->_slice_nblock[dimension]*
_grid->_slice_block[dimension];
1314 int cb= (cbmask==0x2)?
Odd :
Even;
1315 int sshift=
_grid->CheckerBoardShiftForCB(rhs.
Checkerboard(),dimension,shift,cb);
1317 for(
int x=0;x<rd;x++){
1319 int sx = (x+sshift)%rd;
1320 int comm_proc = ((x+sshift)/rd)%pd;
1324 int words = buffer_size;
1325 if (cbmask != 0x3) words=words>>1;
1327 int bytes = words * compress.CommDatumSize();
1331 if ( comms_send ) xbytes = bytes;
1335 if ( comms_recv ) rbytes = bytes;
1346 _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
1348 assert (xmit_to_rank !=
_grid->ThisRank());
1349 assert (recv_from_rank !=
_grid->ThisRank());
1353 std::vector<std::pair<int,int> > face_table_host ;
1356 face_table[face_idx].resize(face_table_host.size());
1359 face_table[face_idx].size()*
sizeof(face_table_host[0]));
1364 if ( compress.DecompressionStep()&&comms_recv) {
1375 send_buf = (cobj *)
_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
1376 if ( (send_buf==NULL) ) {
1382 void *test_ptr =
_grid->ShmBufferTranslate(recv_from_rank,recv_buf);
1383 if ( test_ptr != NULL ) shm_recv = 1;
1394 assert(send_buf!=NULL);
1399 compressor::Gather_plane_simple(
face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
1401 int duplicate =
CheckForDuplicate(dimension,sx,comm_proc,(
void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
1408 int do_send = (comms_send) && (!shm_send );
1409 int do_recv = (comms_send) && (!shm_recv );
1411 (
void *)&recv_buf[comm_off],
1412 xmit_to_rank, do_send,
1413 recv_from_rank, do_recv,
1417 if ( (compress.DecompressionStep() && comms_recv) ) {
1419 &recv_buf[comm_off],
1430 template<
class compressor>
1433 const int Nsimd =
_grid->Nsimd();
1442 int fd =
_grid->_fdimensions[dimension];
1443 int rd =
_grid->_rdimensions[dimension];
1444 int ld =
_grid->_ldimensions[dimension];
1445 int pd =
_grid->_processors[dimension];
1446 int simd_layout =
_grid->_simd_layout[dimension];
1447 int comm_dim =
_grid->_processors[dimension] >1 ;
1448 assert(comm_dim==1);
1450 assert(simd_layout==maxl);
1455 int permute_type=
_grid->PermuteType(dimension);
1460 int buffer_size =
_grid->_slice_nblock[dimension]*
_grid->_slice_block[dimension];
1463 assert(cbmask==0x3);
1465 int reduced_buffer_size = buffer_size;
1466 if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1;
1468 int datum_bytes = compress.CommDatumSize();
1469 int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
1475 assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
1477 std::vector<cobj *> rpointers(maxl);
1478 std::vector<cobj *> spointers(maxl);
1484 int cb = (cbmask==0x2)?
Odd :
Even;
1485 int sshift=
_grid->CheckerBoardShiftForCB(rhs.
Checkerboard(),dimension,shift,cb);
1488 for(
int x=0;x<rd;x++){
1490 int any_offnode = ( ((x+sshift)%fd) >= rd );
1492 if ( any_offnode ) {
1495 for(
int i=0;i<maxl;i++){
1499 int sx = (x+sshift)%rd;
1503 std::vector<std::pair<int,int> > face_table_host ;
1506 face_table[face_idx].resize(face_table_host.size());
1509 face_table[face_idx].size()*
sizeof(face_table_host[0]));
1514 if ( comms_send ) xbytes = bytes;
1518 if ( comms_recv ) rbytes = bytes;
1525 if ( comms_send || comms_recv ) {
1529 compressor::Gather_plane_exchange(
face_table[face_idx],rhs,
1530 spointers,dimension,sx,cbmask,
1531 compress,permute_type,partial );
1537 for(
int i=0;i<maxl;i++){
1539 int my_coor = rd*i + x;
1540 int nbr_coor = my_coor+sshift;
1542 int nbr_proc = ((nbr_coor)/ld) % pd;
1543 int nbr_lcoor= (nbr_coor%ld);
1544 int nbr_ic = (nbr_lcoor)/rd;
1545 int nbr_ox = (nbr_lcoor%rd);
1547 int nbr_plane = nbr_ic;
1548 assert (sx == nbr_ox);
1559 _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
1561 #warning STENCIL SHM FAST PATH SELECTED
1565 cobj *shm = (cobj *)
_grid->ShmBufferTranslate(recv_from_rank,sp);
1578 void *test_ptr = (
void *)
_grid->ShmBufferTranslate(xmit_to_rank,sp);
1579 if ( test_ptr != NULL ) shm_send = 1;
1589 int duplicate =
CheckForDuplicate(dimension,sx,nbr_proc,(
void *)rp,i,xbytes,rbytes,cbmask);
1591 if ( (bytes != rbytes) && (rbytes!=0) ){
1595 int do_send = (comms_send) && (!shm_send );
1597 xmit_to_rank,do_send,
1598 recv_from_rank,do_send,
#define accelerator_forNB(iterator, num, nsimd,...)
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
void acceleratorCopySynchronise(void)
void acceleratorFenceComputeStream(void)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
void acceleratorMemSet(void *base, int value, size_t bytes)
#define accelerator_barrier(dummy)
std::vector< T, devAllocator< T > > deviceVector
AcceleratorVector< int, MaxDims > Coordinate
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
accelerator_inline Grid_simd< S, V > abs(const Grid_simd< S, V > &r)
#define NAMESPACE_BEGIN(A)
void DslashResetCounts(void)
void DslashLogDirichlet(void)
void Gather_plane_table_compute(GridBase *grid, int dimension, int plane, int cbmask, int off, std::vector< std::pair< int, int > > &table)
void DslashLogPartial(void)
void DslashGetCounts(uint64_t &dirichlet, uint64_t &partial, uint64_t &full)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Coordinate _processor_coor
void ShiftedRanks(int dim, int shift, int &source, int &dest)
StencilVector _comms_send
StencilVector _permute_type
StencilVector _comm_buf_size
StencilEntry * _entries_p
accelerator_inline void iCoorFromIindex(Coordinate &coor, int lane) const
accelerator_inline StencilEntry * GetEntry(int &ptype, int point, int osite) const
int GetNodeLocal(int osite, int point) const
accelerator_inline uint64_t GetInfo(int &ptype, int &local, int &perm, int point, int ent, uint64_t base) const
StencilVector _comms_recv
accelerator_inline cobj * CommBuf(void) const
StencilEntry * _entries_host_p
AcceleratorVector< int, STENCIL_MAX > StencilVector
accelerator_inline uint64_t GetPFInfo(int ent, uint64_t base) const
StencilVector _directions
CartesianStencilView(const CartesianStencilView &refer_to_me)=default
void ViewOpen(ViewMode _mode)
CartesianStencilView(const CartesianStencilAccelerator< vobj, cobj, Parameters > &refer_to_me, ViewMode _mode)
const CartesianStencilView< SiteSpinor, SiteSpinor, ImplParams > View_type
void SetSloppyComms(int sloppy)
std::vector< Merge > Mergers
void CommsMergeSHM(decompressor decompress)
void ScatterPlane(int point, int dimension, int plane, int cbmask, int offset, int wrap)
void Comms(int point, int dimension, int shiftpm, int cbmask)
deviceVector< int > surface_list
void CommunicateComplete(std::vector< std::vector< CommsRequest_t > > &reqs)
void HaloExchange(const Lattice< vobj > &source, compressor &compress)
int GatherSimd(const Lattice< vobj > &rhs, int dimension, int shift, int cbmask, compressor &compress, int &face_idx, int point)
std::vector< CommsRequest_t > MpiReqs
void DecompressPacket(Packet &packet)
void AddMerge(cobj *merge_p, std::vector< cobj * > &rpointers, Integer buffer_size, Integer type, std::vector< Merge > &mv)
std::vector< deviceVector< std::pair< int, int > > > face_table
int Gather(const Lattice< vobj > &rhs, int dimension, int shift, int cbmask, compressor &compress, int &face_idx, int point)
void * DeviceBufferMalloc(size_t bytes)
void HaloGather(const Lattice< vobj > &source, compressor &compress)
void CopyPlane(int point, int dimension, int lplane, int rplane, int cbmask, int permute, int wrap)
void CommsMerge(decompressor decompress)
void AddDecompress(cobj *k_p, cobj *m_p, Integer buffer_size, std::vector< Decompress > &dv)
std::vector< Merge > MergersSHM
std::vector< SiteSpinor * > u_simd_send_buf
View_type::StencilVector StencilVector
void Local(int point, int dimension, int shiftpm, int cbmask)
std::vector< CachedTransfer > CachedTransfers
void BuildSurfaceList(int Ls, int vol4)
void CommsMerge(decompressor decompress, std::vector< Merge > &mm, std::vector< Decompress > &dd)
void CommunicateBegin(std::vector< std::vector< CommsRequest_t > > &reqs)
GridBase * Grid(void) const
View_type View(ViewMode mode) const
void AddCopy(void *from, void *to, Integer bytes)
std::vector< Decompress > DecompressionsSHM
SiteSpinor::scalar_object scalar_object
std::vector< CopyReceiveBuffer > CopyReceiveBuffers
void PrecomputeByteOffsets(void)
void AddPacket(void *xmit, void *rcv, Integer to, Integer do_send, Integer from, Integer do_recv, Integer xbytes, Integer rbytes)
std::vector< Decompress > Decompressions
CartesianStencil(GridBase *grid, int npoints, int checkerboard, const std::vector< int > &directions, const std::vector< int > &distances, Parameters p=Parameters(), bool preserve_shm=false)
SiteSpinor::vector_type vector_type
int HaloGatherDir(const Lattice< vobj > &source, compressor &compress, int point, int &face_idx)
void DirichletBlock(const Coordinate &dirichlet_block)
Introduce a block structure and switch off comms on boundaries.
Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf, Integer lane, Integer xbytes, Integer rbytes, Integer cb)
std::vector< Packet > Packets
void DeviceBufferFreeAll(void)
std::vector< StencilEntry > _entries
std::vector< SiteSpinor * > u_simd_recv_buf
deviceVector< StencilEntry > _entries_device
void CompressPacket(Packet &packet)
static bool StepLog(const char *name)
static void recvLog(void *, uint64_t bytes, int rank)
static void xmitLog(void *, uint64_t bytes)
int PermuteType(int dimension)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
void * ShmBufferTranslate(int rank, void *local_p)
static deviceVector< unsigned char > DeviceCommBuf
GridTypeMapper< scalar_type >::Realified real_scalar_type
static constexpr int Nsimd
static constexpr int Nsimd
std::vector< cobj * > vpointers
Integer rbytes_compressed
void * compressed_recv_buf
void * compressed_send_buf
Integer xbytes_compressed
uint8_t _around_the_world