Grid 0.7.0
Stencil.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/Stencil.h
6
7 Copyright (C) 2015
8
9 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#ifndef GRID_STENCIL_H
29#define GRID_STENCIL_H
30
31#define STENCIL_MAX (16)
32
33#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
35
37// Must not lose sight that goal is to be able to construct really efficient
38// gather to a point stencil code. CSHIFT is not the best way, so need
39// additional stencil support.
40//
41// Stencil based code will exchange haloes and use a table lookup for neighbours.
42// This will be done with generality to allow easier efficient implementations.
43// Overlap of comms and compute is enabled by tabulating off-node connected,
44//
45// Generic services
46// 0) Prebuild neighbour tables
47// 1) Compute sizes of all haloes/comms buffers; allocate them.
48// 2) Gather all faces, and communicate.
49// 3) Loop over result sites, giving nbr index/offnode info for each
50//
52
54
55// These can move into a params header and be given MacroMagic serialisation
57 Coordinate dirichlet; // Blocksize of dirichlet BCs
58 // int partialDirichlet;
60 dirichlet.resize(0);
61 // partialDirichlet=0;
62 };
63};
64
66// Gather for when there *is* need to SIMD split with compression
68
69void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
70 int off,std::vector<std::pair<int,int> > & table);
71
73{
74public:
75 static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
76};
77
78void DslashResetCounts(void);
79void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
80void DslashLogFull(void);
82void DslashLogDirichlet(void);
83
85#ifdef GRID_CUDA
86 uint64_t _byte_offset; // 8 bytes
87 uint32_t _offset; // 4 bytes
88#else
89 uint64_t _byte_offset; // 8 bytes
90 uint64_t _offset; // 8 bytes (8 ever required?)
91#endif
92 uint8_t _is_local; // 1 bytes
93 uint8_t _permute; // 1 bytes
94 uint8_t _around_the_world; // 1 bytes
95 uint8_t _pad; // 1 bytes
96};
97// Could pack to 8 + 4 + 4 = 128 bit and use
98
99template<class vobj,class cobj,class Parameters>
101 public:
103
104 // Stencil runs along coordinate axes only; NO diagonal fill in.
106 // Basic Grid and stencil info
109 int _npoints; // Move to template param?
114 // If true, this is FULLY communicated per face
115 // Otherwise will either be full or partial dirichlet
118 StencilVector _comms_recv; // this is FULLY communicated per face
120 // If true, this is partially communicated per face
122 // StencilVector _comms_partial_send;
123 // StencilVector _comms_partial_recv;
124 //
129 Parameters parameters;
135
 // Device-callable accessor: pointer to the unified receive buffer (u_recv_buf_p).
 136 accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
137
138 // Not a device function
139 inline int GetNodeLocal(int osite,int point) const {
140 StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
141 return SE._is_local;
142 }
143 accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
144 ptype = this->_permute_type[point];
145 return & this->_entries_p[point+this->_npoints*osite];
146 }
147
148 accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
149 uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
150 local = this->_entries_p[ent]._is_local;
151 perm = this->_entries_p[ent]._permute;
152 if (perm) ptype = this->_permute_type[point];
153 if (local) {
154 return base + this->_entries_p[ent]._byte_offset;
155 } else {
156 return cbase + this->_entries_p[ent]._byte_offset;
157 }
158 }
159
160 accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) const {
161 uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
162 int local = this->_entries_p[ent]._is_local;
163 if (local) return base + this->_entries_p[ent]._byte_offset;
164 else return cbase + this->_entries_p[ent]._byte_offset;
165 }
166
168 {
169 Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
170 }
171};
172
// Lightweight view over the accelerator-resident stencil tables.
// Copy construction is defaulted; the mode-taking constructor records the
// requested access mode via ViewOpen.
173template<class vobj,class cobj,class Parameters>
174class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
175{
176public:
177 int *closed;
178 // StencilEntry *cpu_ptr;
179 public:
180 // default copy constructor
181 CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
182
// NOTE(review): this listing elides original line 183 — the signature of the
// constructor that takes the accelerator base plus a ViewMode. Confirm
// against the full source before editing.
184 : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
185 {
186 this->ViewOpen(_mode);
187 }
// Store the access mode on the accelerator base.
188 void ViewOpen(ViewMode _mode)
189 {
190 this->mode = _mode;
191 }
192
// Closing is a no-op here (empty body); nothing is released.
193 void ViewClose(void) { }
194
195};
196
198// The Stencil Class itself
200template<class vobj,class cobj,class Parameters>
201class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
202public:
203
204 typedef typename cobj::vector_type vector_type;
205 typedef typename cobj::scalar_object scalar_object;
209 // Helper structs
225 struct Merge {
226 static constexpr int Nsimd = vobj::Nsimd();
227 cobj * mpointer;
228 // std::vector<scalar_object *> rpointers;
229 std::vector<cobj *> vpointers;
232 // Integer partial; // partial dirichlet BCs
234 };
235 struct Decompress {
236 static constexpr int Nsimd = vobj::Nsimd();
237 cobj * kernel_p;
238 cobj * mpi_p;
240 // Integer partial; // partial dirichlet BCs
242 };
244 void * from_p;
245 void * to_p;
247 };
258
259protected:
261
263 // Sloppy comms will make a second buffer upon comms
// Bump-pointer allocation of `bytes` from the preallocated device heap,
// used for the temporary compressed comms buffers.
// NOTE(review): this listing elides original lines 273/275, which appear to
// be the overflow test guarding the error message below (and likely an
// assert) — confirm against the full source.
 268 void *DeviceBufferMalloc(size_t bytes)
 269 {
 270 void *ptr = (void *)device_heap_top;
 271 device_heap_top += bytes;
 272 device_heap_bytes+= bytes;
 274 std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
 276 }
 277 return ptr;
 278 }
280 {
282 // Resize up if necessary, never down
285 }
289 }
290
291public:
292 GridBase *Grid(void) const { return _grid; }
293
295 // Control reduced precision comms
298 void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
299
301 // Needed to conveniently communicate gparity parameters into GPU memory
302 // without adding parameters. Perhaps a template parameter to StenciView is
303 // required to pass general parameters.
304 // Generalise as required later if needed
306
308 View_type accessor(*( (View_type *) this),mode);
309 return accessor;
310 }
311
313 // int partialDirichlet;
315 std::vector<deviceVector<std::pair<int,int> > > face_table ;
317
318 std::vector<StencilEntry> _entries; // Resident in host memory
319 deviceVector<StencilEntry> _entries_device; // Resident in device memory
320 std::vector<Packet> Packets;
321 std::vector<Merge> Mergers;
322 std::vector<Merge> MergersSHM;
323 std::vector<Decompress> Decompressions;
324 std::vector<Decompress> DecompressionsSHM;
325 std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
326 std::vector<CachedTransfer> CachedTransfers;
327 std::vector<CommsRequest_t> MpiReqs;
328
330 // Unified Comms buffers for all directions
332 // Vectors that live on the symmetric heap in case of SHMEM
333 // These are used; either SHM objects or refs to the above symmetric heap vectors
334 // depending on comms target
335 std::vector<cobj *> u_simd_send_buf;
336 std::vector<cobj *> u_simd_recv_buf;
337
340
342 // Stencil query
344#if 1
345 inline int SameNode(int point) {
346
347 int dimension = this->_directions[point];
348 int displacement = this->_distances[point];
349
350 int pd = _grid->_processors[dimension];
351 int fd = _grid->_fdimensions[dimension];
352 int ld = _grid->_ldimensions[dimension];
353 int rd = _grid->_rdimensions[dimension];
354 int simd_layout = _grid->_simd_layout[dimension];
355 int comm_dim = _grid->_processors[dimension] >1 ;
356
357 // int recv_from_rank;
358 // int xmit_to_rank;
359
360 if ( ! comm_dim ) return 1;
361 if ( displacement == 0 ) return 1;
362 return 0;
363 }
364#else
365 // fancy calculation for shm code
366 inline int SameNode(int point) {
367
368 int dimension = this->_directions[point];
369 int displacement = this->_distances[point];
370
371 int pd = _grid->_processors[dimension];
372 int fd = _grid->_fdimensions[dimension];
373 int ld = _grid->_ldimensions[dimension];
374 int rd = _grid->_rdimensions[dimension];
375 int simd_layout = _grid->_simd_layout[dimension];
376 int comm_dim = _grid->_processors[dimension] >1 ;
377
378 int recv_from_rank;
379 int xmit_to_rank;
380
381 if ( ! comm_dim ) return 1;
382
383 int nbr_proc;
384 if (displacement>0) nbr_proc = 1;
385 else nbr_proc = pd-1;
386
387 // FIXME this logic needs to be sorted for three link term
388 // assert( (displacement==1) || (displacement==-1));
389 // Present hack only works for >= 4^4 subvol per node
390 _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
391
392 void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
393
394 if ( shm==NULL ) return 0;
395 return 1;
396 }
397#endif
399 // Comms packet queue for asynch thread
400 // Use OpenMP Tasks for cleaner ???
401 // must be called *inside* parallel region
404 // Non blocking send and receive. Necessarily parallel.
// Inverse of CompressPacket: when sloppy (reduced-precision) comms are on,
// expand the received reduced-precision buffer back to full precision.
// Only off-node receives are compressed; other traffic is left untouched.
 406 void DecompressPacket(Packet &packet)
 407 {
 408 if ( !SloppyComms ) return;
 409
 410 if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
 411
// Word size of the underlying scalar decides the decompression scheme.
 412 typedef typename getPrecision<cobj>::real_scalar_type word;
 413 uint64_t words = packet.rbytes/sizeof(word);
 414 const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
 415 const uint64_t outer = words/nsimd;
 416
 417 if(sizeof(word)==8) {
 418
 419 // Can either choose to represent as float vs double and prec change
 420 // OR
 421 // truncate the mantissa bfp16 style
 422 double *dbuf =(double *) packet.recv_buf;
 423 float *fbuf =(float *) packet.compressed_recv_buf;
 424
// Non-blocking device loop: widen each fp32 word back to fp64.
 425 accelerator_forNB(ss,outer,nsimd,{
 426 int lane = acceleratorSIMTlane(nsimd);
 427 dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
 428 });
 429
 430 } else if ( sizeof(word)==4){
 431 // Can either choose to represent as half vs float and prec change
 432 // OR
 433 // truncate the mantissa bfp16 style
 434
 435 uint32_t *fbuf =(uint32_t *) packet.recv_buf;
 436 uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
 437
// bfloat16-style expansion: restore the high 16 bits, zero the low mantissa.
 438 accelerator_forNB(ss,outer,nsimd,{
 439 int lane = acceleratorSIMTlane(nsimd);
 440 fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
 441 });
 442
 443 } else {
 444 assert(0 && "unknown floating point precision");
 445 }
 446 }
 447 }
// Prepare a packet for (optional) reduced-precision transmission.
// Defaults the "compressed" fields to the uncompressed buffers/sizes; when
// sloppy comms are enabled and the peer is off-node, halves the byte counts,
// carves temporary buffers from the device heap, and converts the send data
// (fp64->fp32, or fp32->bfloat16-style truncation). Receives are only given
// a compressed landing buffer here; expansion happens in DecompressPacket.
 448 void CompressPacket(Packet &packet)
 449 {
// Default: pass-through (no compression).
 450 packet.xbytes_compressed = packet.xbytes;
 451 packet.compressed_send_buf = packet.send_buf;
 452
 453 packet.rbytes_compressed = packet.rbytes;
 454 packet.compressed_recv_buf = packet.recv_buf;
 455
 456 if ( !SloppyComms ) {
 457 return;
 458 }
 459
 460 typedef typename getPrecision<cobj>::real_scalar_type word;
 461 uint64_t words = packet.xbytes/sizeof(word);
 462 const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
 463 const uint64_t outer = words/nsimd;
 464
// Off-node receive: provide a half-size buffer for the compressed data.
 465 if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
 466
 467 packet.rbytes_compressed = packet.rbytes/2;
 468 packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
 469 // std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
 470
 471 }
 472 //else {
 473 // std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
 474 // }
 475
// Off-node send: allocate half-size buffer and convert the payload down.
 476 if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
 477
 478 packet.xbytes_compressed = packet.xbytes/2;
 479 packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
 480 // std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
 481
 482 if(sizeof(word)==8) {
 483
 484 double *dbuf =(double *) packet.send_buf;
 485 float *fbuf =(float *) packet.compressed_send_buf;
 486
 487 accelerator_forNB(ss,outer,nsimd,{
 488 int lane = acceleratorSIMTlane(nsimd);
 489 fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
 490 });
 491
 492 } else if ( sizeof(word)==4){
 493
 494 uint32_t *fbuf =(uint32_t *) packet.send_buf;
 495 uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
 496
// Keep only the top 16 bits of each fp32 word (bfloat16-style, s7e8).
 497 accelerator_forNB(ss,outer,nsimd,{
 498 int lane = acceleratorSIMTlane(nsimd);
 499 hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
 500 });
 501
 502 } else {
 503 assert(0 && "unknown floating point precision");
 504 }
 505
 506 }
 507 // else {
 508 // std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
 509 // }
 510
 511 return;
 512 }
// Kick off the halo exchange: synchronise ranks, compress packets if sloppy
// comms are on, post the prepared sends/receives, start MPI and intranode
// transfers, then log the transmitted buffers for the flight recorder.
// NOTE(review): this listing elides original lines 516/522/524/530 — in
// particular the bodies of the two `if (SloppyComms)` blocks below are
// missing. Confirm against the full source before editing.
 513 void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 514 {
 515 FlightRecorder::StepLog("Communicate begin");
 517 // All GPU kernel tasks must complete
 518 // accelerator_barrier(); All kernels should ALREADY be complete
 519 //Everyone is here, so noone running slow and still using receive buffer
 520 _grid->StencilBarrier();
 521 // But the HaloGather had a barrier too.
 523 if (SloppyComms) {
 525 }
// Fill in the compressed_* fields (and convert send data) per packet.
 526 for(int i=0;i<Packets.size();i++){
 527 this->CompressPacket(Packets[i]);
 528 }
 529 if (SloppyComms) {
 531#ifdef NVLINK_GET
 532 _grid->StencilBarrier();
 533#endif
 534 }
 535
// Stage 1: hand every packet to the transport layer for preparation.
 536 for(int i=0;i<Packets.size();i++){
 537 // std::cout << "Communicate prepare "<<i<<std::endl;
 538 // _grid->Barrier();
 539 _grid->StencilSendToRecvFromPrepare(MpiReqs,
 540 Packets[i].compressed_send_buf,
 541 Packets[i].to_rank,Packets[i].do_send,
 542 Packets[i].compressed_recv_buf,
 543 Packets[i].from_rank,Packets[i].do_recv,
 544 Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
 545 }
 546 // std::cout << "Communicate PollDtoH "<<std::endl;
 547 // _grid->Barrier();
 548 _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
 549 // std::cout << "Communicate CopySynch "<<std::endl;
 550 // _grid->Barrier();
 552 // Starts intranode
 553 for(int i=0;i<Packets.size();i++){
 554 // std::cout << "Communicate Begin "<<i<<std::endl;
 555 // _grid->Barrier();
 556 _grid->StencilSendToRecvFromBegin(MpiReqs,
 557 Packets[i].send_buf,Packets[i].compressed_send_buf,
 558 Packets[i].to_rank,Packets[i].do_send,
 559 Packets[i].recv_buf,Packets[i].compressed_recv_buf,
 560 Packets[i].from_rank,Packets[i].do_recv,
 561 Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
 562 // std::cout << "Communicate Begin started "<<i<<std::endl;
 563 // _grid->Barrier();
 564 }
 565 FlightRecorder::StepLog("Communicate begin has finished");
 566 // Get comms started then run checksums
 567 // Having this PRIOR to the dslash seems to make Sunspot work... (!)
 568 for(int i=0;i<Packets.size();i++){
 569 if ( Packets[i].do_send )
 570 FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
 571 }
 572 }
573
// Wait for the exchange started by CommunicateBegin to finish: drain the
// receive polls, complete MPI, log Dirichlet vs full comms statistics,
// decompress any sloppy-comms receives, and record received buffers.
 574 void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 575 {
 576 // std::cout << "Communicate Complete "<<std::endl;
 577 // _grid->Barrier();
 578 FlightRecorder::StepLog("Start communicate complete");
 579 // std::cout << "Communicate Complete PollIRecv "<<std::endl;
 580 // _grid->Barrier();
 581 _grid->StencilSendToRecvFromPollIRecv(MpiReqs);
 582 // std::cout << "Communicate Complete Complete "<<std::endl;
 583 // _grid->Barrier();
 584 _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
 585 // if ( this->partialDirichlet ) DslashLogPartial();
 586 if ( this->fullDirichlet ) DslashLogDirichlet();
 587 else DslashLogFull();
 588 // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 589 // accelerator_barrier();
// Expand reduced-precision receives in place, then log them.
 590 for(int i=0;i<Packets.size();i++){
 591 this->DecompressPacket(Packets[i]);
 592 if ( Packets[i].do_recv )
 593 FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
 594 }
 595 FlightRecorder::StepLog("Finish communicate complete");
 596 }
597
598 // Blocking send and receive. Either sequential or parallel.
600 void Communicate(void)
601 {
603 // Concurrent and non-threaded asynch calls to MPI
605 std::vector<std::vector<CommsRequest_t> > reqs;
606 this->CommunicateBegin(reqs);
607 this->CommunicateComplete(reqs);
608 }
609
// Full halo exchange driver: reset work lists, gather all faces, perform the
// blocking communication, then merge SHM and MPI results into CommBuf.
// NOTE(review): this listing elides original line 617 — confirm whether a
// trailing statement (e.g. a barrier) belongs before the closing brace.
 610 template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
 611 {
 612 Prepare();
 613 HaloGather(source,compress);
 614 Communicate();
 615 CommsMergeSHM(compress);
 616 CommsMerge(compress);
 618 }
619
620 template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
621 {
622 int dimension = this->_directions[point];
623 int displacement = this->_distances[point];
624
625 int fd = _grid->_fdimensions[dimension];
626 int rd = _grid->_rdimensions[dimension];
627
628 // Map to always positive shift modulo global full dimension.
629 int shift = (displacement+fd)%fd;
630
631 assert (source.Checkerboard()== this->_checkerboard);
632
633 // the permute type
634 int simd_layout = _grid->_simd_layout[dimension];
635 int comm_dim = _grid->_processors[dimension] >1 ;
636 int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
637
638 int is_same_node = 1;
639 // Gather phase
640 int sshift [2];
641 if ( comm_dim ) {
642 sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
643 sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
644 if ( sshift[0] == sshift[1] ) {
645 if (splice_dim) {
646 auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx,point);
647 is_same_node = is_same_node && tmp;
648 } else {
649 auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx,point);
650 is_same_node = is_same_node && tmp;
651 }
652 } else {
653 if(splice_dim){
654 // if checkerboard is unfavourable take two passes
655 // both with block stride loop iteration
656 auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx,point);
657 auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx,point);
658 is_same_node = is_same_node && tmp1 && tmp2;
659 } else {
660 auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx,point);
661 auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx,point);
662 is_same_node = is_same_node && tmp1 && tmp2;
663 }
664 }
665 }
666 return is_same_node;
667 }
668
// Gather every stencil leg's halo faces into the comms buffers.
// NOTE(review): this listing elides original lines 673/675/680/694/695
// (apparently comments/blank lines) — confirm against the full source.
 669 template<class compressor>
 670 void HaloGather(const Lattice<vobj> &source,compressor &compress)
 671 {
 672 // accelerator_barrier();
 674 // I will overwrite my send buffers
 676 _grid->StencilBarrier();// Synch shared memory on a single nodes
 677
 678 assert(source.Grid()==_grid);
 679
 681
 682 // Gather all comms buffers
 683 int face_idx=0;
 684 for(int point = 0 ; point < this->_npoints; point++) {
 685 compress.Point(point);
 686 HaloGatherDir(source,compress,point,face_idx);
 687 }
 688 accelerator_barrier(); // All my local gathers are complete
 689#ifdef NVLINK_GET
 690 _grid->StencilBarrier(); // He can now get mu local gather, I can get his
 691 // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
 692 // Or issue barrier AFTER the DMA is running
 693#endif
 696 }
697
699 // Implementation
701 void Prepare(void)
702 {
703 Decompressions.resize(0);
704 DecompressionsSHM.resize(0);
705 Mergers.resize(0);
706 MergersSHM.resize(0);
707 Packets.resize(0);
708 CopyReceiveBuffers.resize(0);
709 CachedTransfers.resize(0);
710 MpiReqs.resize(0);
711 }
712 void AddCopy(void *from,void * to, Integer bytes)
713 {
714 CopyReceiveBuffer obj;
715 obj.from_p = from;
716 obj.to_p = to;
717 obj.bytes= bytes;
718 CopyReceiveBuffers.push_back(obj);
719 }
721 {
722 // These are device resident MPI buffers.
723 for(int i=0;i<CopyReceiveBuffers.size();i++){
724 cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
725 cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
726 Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
727
728 accelerator_forNB(j, words, cobj::Nsimd(), {
729 coalescedWrite(to[j] ,coalescedRead(from [j]));
730 });
732 // Also fenced in WilsonKernels
733 }
734 }
735
736 Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,
737 Integer xbytes,Integer rbytes,
738 Integer cb)
739 {
740 CachedTransfer obj;
741 obj.direction = direction;
742 obj.OrthogPlane = OrthogPlane;
743 obj.DestProc = DestProc;
744 obj.recv_buf = recv_buf;
745 obj.lane = lane;
746 obj.xbytes = xbytes;
747 obj.rbytes = rbytes;
748 obj.cb = cb;
749
750 for(int i=0;i<CachedTransfers.size();i++){
751 if ( (CachedTransfers[i].direction ==direction)
752 &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
753 &&(CachedTransfers[i].DestProc ==DestProc)
754 &&(CachedTransfers[i].xbytes ==xbytes)
755 &&(CachedTransfers[i].rbytes ==rbytes)
756 &&(CachedTransfers[i].lane ==lane)
757 &&(CachedTransfers[i].cb ==cb)
758 ){
759 // FIXME worry about duplicate with partial compression
760 // Wont happen as DWF has no duplicates, but...
761 AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
762 return 1;
763 }
764 }
765
766 CachedTransfers.push_back(obj);
767 return 0;
768 }
769 void AddPacket(void *xmit,void * rcv,
770 Integer to, Integer do_send,
771 Integer from, Integer do_recv,
772 Integer xbytes,Integer rbytes){
773 Packet p;
774 p.send_buf = xmit;
775 p.recv_buf = rcv;
776 p.to_rank = to;
777 p.from_rank= from;
778 p.do_send = do_send;
779 p.do_recv = do_recv;
780 p.xbytes = xbytes;
781 p.rbytes = rbytes;
782 // if (do_send) std::cout << GridLogMessage << " MPI packet to "<<to<< " of size "<<xbytes<<std::endl;
783 // if (do_recv) std::cout << GridLogMessage << " MPI packet from "<<from<< " of size "<<xbytes<<std::endl;
784 Packets.push_back(p);
785 }
786 void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
787 Decompress d;
788 // d.partial = this->partialDirichlet;
789 d.dims = _grid->_fdimensions;
790 d.kernel_p = k_p;
791 d.mpi_p = m_p;
792 d.buffer_size = buffer_size;
793 dv.push_back(d);
794 }
795 void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
796 Merge m;
797 // m.partial = this->partialDirichlet;
798 m.dims = _grid->_fdimensions;
799 m.type = type;
800 m.mpointer = merge_p;
801 m.vpointers= rpointers;
802 m.buffer_size = buffer_size;
803 mv.push_back(m);
804 }
805 template<class decompressor> void CommsMerge(decompressor decompress) {
806 CommsCopy();
808 }
809 template<class decompressor> void CommsMergeSHM(decompressor decompress) {
810 assert(MergersSHM.size()==0);
811 assert(DecompressionsSHM.size()==0);
812 }
813
814 template<class decompressor>
815 void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
816 {
817 for(int i=0;i<mm.size();i++){
818 decompressor::MergeFace(decompress,mm[i]);
819 }
820 for(int i=0;i<dd.size();i++){
821 decompressor::DecompressFace(decompress,dd[i]);
822 }
823 acceleratorFenceComputeStream(); // dependent kernels
824 }
825
826 // Set up routines
829 for(int i=0;i<_entries.size();i++){
830 if( this->_entries[i]._is_local ) {
831 this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
832 } else {
833 this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
834 }
835 }
836 };
837
838 // Move interior/exterior split into the generic stencil
839 // FIXME Explicit Ls in interface is a pain. Should just use a vol
840 void BuildSurfaceList(int Ls,int vol4){
841
842 // find same node for SHM
843 // Here we know the distance is 1 for WilsonStencil
844 for(int point=0;point<this->_npoints;point++){
845 this->same_node[point] = this->SameNode(point);
846 }
847 int32_t surface_list_size=0;
848 for(int site = 0 ;site< vol4;site++){
849 int local = 1;
850 for(int point=0;point<this->_npoints;point++){
851 if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
852 local = 0;
853 }
854 }
855 if(local == 0) {
856 for(int s=0;s<Ls;s++){
857 surface_list_size++;
858 }
859 }
860 }
861 // std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
862 surface_list.resize(surface_list_size);
863 std::vector<int> surface_list_host(surface_list_size);
864 int32_t ss=0;
865 for(int site = 0 ;site< vol4;site++){
866 int local = 1;
867 for(int point=0;point<this->_npoints;point++){
868 if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
869 local = 0;
870 }
871 }
872 if(local == 0) {
873 for(int s=0;s<Ls;s++){
874 int idx=site*Ls+s;
875 surface_list_host[ss]= idx;
876 ss++;
877 }
878 }
879 }
880 acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
881 // std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
882 }
883
// Configure per-leg send/receive enables for Dirichlet boundary blocks.
// Legs that would cross a Dirichlet block boundary have their comms
// quiesced (send and/or receive disabled) depending on shift direction.
// NOTE(review): this listing elides original lines 894/897 (apparently
// comment/blank lines) — confirm against the full source.
 884 void DirichletBlock(const Coordinate &dirichlet_block)
 885 {
 886 for(int ii=0;ii<this->_npoints;ii++){
 887 int dimension = this->_directions[ii];
 888 int displacement = this->_distances[ii];
 889 int gd = _grid->_gdimensions[dimension];
 890 int fd = _grid->_fdimensions[dimension];
 891 int pd = _grid->_processors [dimension];
 892 int pc = _grid->_processor_coor[dimension];
 893 int ld = fd/pd;
 895 // Figure out dirichlet send and receive
 896 // on this leg of stencil.
 898 int comm_dim = _grid->_processors[dimension] >1 ;
 899 int block = dirichlet_block[dimension];
// Default: comms enabled whenever the dimension is rank-split.
 900 this->_comms_send[ii] = comm_dim;
 901 this->_comms_recv[ii] = comm_dim;
 902 // this->_comms_partial_send[ii] = 0;
 903 // this->_comms_partial_recv[ii] = 0;
 904 if ( block && comm_dim ) {
 905 assert(abs(displacement) < ld );
 906 // Quiesce communication across block boundaries
 907 if( displacement > 0 ) {
 908 // High side, low side
 909 // | <--B--->|
 910 // | | |
 911 // noR
 912 // noS
// A rank whose local slab ends (starts) on a block boundary neither
// receives from (sends to) the neighbour across that boundary.
 913 if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 914 if ( ( (ld*pc ) % block ) == 0 ) this->_comms_send[ii] = 0;
 915 } else {
 916 // High side, low side
 917 // | <--B--->|
 918 // | | |
 919 // noS
 920 // noR
 921 if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 922 if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 923 }
 924 // if ( partialDirichlet ) {
 925 // this->_comms_partial_send[ii] = !this->_comms_send[ii];
 926 // this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
 927 // }
 928 }
 929 }
 930 }
932 int npoints,
933 int checkerboard,
934 const std::vector<int> &directions,
935 const std::vector<int> &distances,
936 Parameters p=Parameters(),
937 bool preserve_shm=false)
938 {
939 SloppyComms = 0;
941 _grid = grid;
942 this->parameters=p;
944 // Initialise the base
946 this->_npoints = npoints;
947 this->_comm_buf_size.resize(npoints),
948 this->_permute_type.resize(npoints),
949 this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
950 this->_directions = StencilVector(directions);
951 this->_distances = StencilVector(distances);
952 this->_comms_send.resize(npoints);
953 this->_comms_recv.resize(npoints);
954 this->same_node.resize(npoints);
955
956 if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
957 // partialDirichlet = p.partialDirichlet;
958 DirichletBlock(p.dirichlet); // comms send/recv set up
960 for(int d=0;d<p.dirichlet.size();d++){
961 if (p.dirichlet[d]) fullDirichlet=1;
962 }
963
965 surface_list.resize(0);
966
967 this->_osites = _grid->oSites();
968
969 _entries.resize(this->_npoints* this->_osites);
970 _entries_device.resize(this->_npoints* this->_osites);
971 this->_entries_host_p = &_entries[0];
972 this->_entries_p = &_entries_device[0];
973
974 // std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
975 // <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
976
977 for(int ii=0;ii<npoints;ii++){
978
979 int i = ii; // reverse direction to get SIMD comms done first
980 int point = i;
981
982 int dimension = directions[i];
983 int displacement = distances[i];
984 int shift = displacement;
985
986 int gd = _grid->_gdimensions[dimension];
987 int fd = _grid->_fdimensions[dimension];
988 int pd = _grid->_processors [dimension];
989 // int ld = gd/pd;
990 int rd = _grid->_rdimensions[dimension];
991 int pc = _grid->_processor_coor[dimension];
992 this->_permute_type[point]=_grid->PermuteType(dimension);
993
994 this->_checkerboard = checkerboard;
995
996 int simd_layout = _grid->_simd_layout[dimension];
997 int comm_dim = _grid->_processors[dimension] >1 ;
998 int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
999 int rotate_dim = _grid->_simd_layout[dimension]>2;
1000
1001 assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
1002
1003 int sshift[2];
1005 // Underlying approach. For each local site build
1006 // up a table containing the npoint "neighbours" and whether they
1007 // live in lattice or a comms buffer.
1009 if ( !comm_dim ) {
1010 sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
1011 sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
1012
1013 if ( sshift[0] == sshift[1] ) {
1014 Local(point,dimension,shift,0x3);
1015 } else {
1016 Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
1017 Local(point,dimension,shift,0x2);// both with block stride loop iteration
1018 }
1019 } else {
1020 // All permute extract done in comms phase prior to Stencil application
1021 // So tables are the same whether comm_dim or splice_dim
1022 sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
1023 sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
1024 if ( sshift[0] == sshift[1] ) {
1025 Comms(point,dimension,shift,0x3);
1026 } else {
1027 Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
1028 Comms(point,dimension,shift,0x2);// both with block stride loop iteration
1029 }
1030 }
1031 }
1032
1034 // Try to allocate for receiving in a shared memory region, fall back to buffer
1036 const int Nsimd = grid->Nsimd();
1037
1038 // Allow for multiple stencils to be communicated simultaneously
1039 if (!preserve_shm)
1040 _grid->ShmBufferFreeAll();
1041
1042 int maxl=2;
1043 u_simd_send_buf.resize(maxl);
1044 u_simd_recv_buf.resize(maxl);
1045 this->u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
1046 this->u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
1047
1048 for(int l=0;l<maxl;l++){
1049 u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
1050 u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
1051 }
1053 acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
1054 }
1055
// Build neighbour-table entries for a shift that stays entirely on-node:
// for each reduced-dim plane x, locate the source plane sx, decide whether
// a SIMD permute is required, and record the plane copy.
 1056 void Local (int point, int dimension,int shiftpm,int cbmask)
 1057 {
 1058 int fd = _grid->_fdimensions[dimension];
 1059 int rd = _grid->_rdimensions[dimension];
 1060 int ld = _grid->_ldimensions[dimension];
 1061 int gd = _grid->_gdimensions[dimension];
 1062 int ly = _grid->_simd_layout[dimension];
 1063
 1064 // Map to always positive shift modulo global full dimension.
 1065 int shift = (shiftpm+fd)%fd;
 1066
 1067 // the permute type
 1068 int permute_dim =_grid->PermuteDim(dimension);
 1069
 1070 for(int x=0;x<rd;x++){
 1071
 1072 // int o = 0;
 1073 int bo = x * _grid->_ostride[dimension];
 1074
 1075 int cb= (cbmask==0x2)? Odd : Even;
 1076
// Source plane for this destination plane under the checkerboarded shift.
 1077 int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb);
 1078 int sx = (x+sshift)%rd;
 1079
// Flag entries whose source wrapped around the lattice boundary.
 1080 int wraparound=0;
 1081 if ( (shiftpm==-1) && (sx>x) ) {
 1082 wraparound = 1;
 1083 }
 1084 if ( (shiftpm== 1) && (sx<x) ) {
 1085 wraparound = 1;
 1086 }
 1087
// SIMD-split dimensions need a lane permute when the source slice crosses
// a SIMD-lane boundary; the permute index cycles modulo the layout `ly`.
 1088 int permute_slice=0;
 1089 if(permute_dim){
 1090 int wrap = sshift/rd; wrap=wrap % ly; // but it is local anyway
 1091 int num = sshift%rd;
 1092 if ( x< rd-num ) permute_slice=wrap;
 1093 else permute_slice = (wrap+1)%ly;
 1094 }
 1095
 1096 CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 1097
 1098 }
 1099 }
1100
  // Build the stencil table for a shift in a communicated dimension:
  // on-node planes are tabulated with CopyPlane, off-node planes with
  // ScatterPlane (entries point into the comms receive buffer).  Also
  // accumulates _unified_buffer_size, the total comms-buffer footprint.
  //   point   : stencil point index being tabulated
  //   shiftpm : signed shift; mapped to a positive shift below
  //   cbmask  : 0x3 both checkerboards, 0x1/0x2 single-checkerboard pass
  void Comms     (int point,int dimension,int shiftpm,int cbmask)
  {
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();

    //    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
    int comms_recv = this->_comms_recv[point];
    int fd = _grid->_fdimensions[dimension];
    int ld = _grid->_ldimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
    int pd = _grid->_processors[dimension];
    int simd_layout = _grid->_simd_layout[dimension];
    int comm_dim    = _grid->_processors[dimension] >1 ;

    assert(comm_dim==1);
    int shift = (shiftpm + fd) %fd;
    assert(shift>=0);
    assert(shift<fd);

    // done in reduced dims, so SIMD factored
    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];

    this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and

    // send to one or more remote nodes.

    int cb= (cbmask==0x2)? Odd : Even;
    int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb);

    for(int x=0;x<rd;x++){

      int permute_type=grid->PermuteType(dimension);
      int permute_slice;

      int sx = (x+sshift)%rd;

      // Decide whether any SIMD lane of this plane lives on another rank
      int offnode = 0;
      if ( simd_layout > 1 ) {

        // Check both SIMD halves: either half may cross the rank boundary
        permute_slice=1;
        for(int i=0;i<Nsimd;i++){

          int inner_bit = (Nsimd>>(permute_type+1));
          int ic= (i&inner_bit)? 1:0;
          int my_coor  = rd*ic + x;
          int nbr_coor = my_coor+sshift;
          int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors

          if ( nbr_proc ) {
            offnode =1;
          }
        }

      } else {
        int comm_proc = ((x+sshift)/rd)%pd;
        offnode = (comm_proc!= 0);
        permute_slice=0;
      }

      // Tag neighbour fetches that wrapped the global lattice boundary:
      // only the rank at the relevant edge of the processor grid wraps
      int wraparound=0;
      if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
        wraparound = 1;
      }
      if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
        wraparound = 1;
      }

      // Wrap locally dirichlet support case OR node local
      if ( offnode==0 ) {

        permute_slice=0;
        CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);

      } else {

        if ( comms_recv ) {

          ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase

        } else {

          // Receive switched off for this point (e.g. Dirichlet block):
          // tabulate the local plane instead of a comms-buffer entry
          CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);

        }

      }

      // Account for the comms-buffer words this off-node plane consumes;
      // the single-checkerboard case only fills half a plane
      if ( offnode ) {
        int words = buffer_size;
        if (cbmask != 0x3) words=words>>1;
        _unified_buffer_size += words;
      }
    }
  }
1195 // Routine builds up integer table for each site in _offsets, _is_local, _permute
1196 void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
1197 {
1198 int rd = _grid->_rdimensions[dimension];
1199
1200 if ( !_grid->CheckerBoarded(dimension) ) {
1201
1202 int o = 0; // relative offset to base within plane
1203 int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
1204 int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
1205
1206 // Simple block stride gather of SIMD objects
1207 for(int n=0;n<_grid->_slice_nblock[dimension];n++){
1208 for(int b=0;b<_grid->_slice_block[dimension];b++){
1209 int idx=point+(lo+o+b)*this->_npoints;
1210 this->_entries[idx]._offset =ro+o+b;
1211 this->_entries[idx]._permute=permute;
1212 this->_entries[idx]._is_local=1;
1213 this->_entries[idx]._around_the_world=wrap;
1214 }
1215 o +=_grid->_slice_stride[dimension];
1216 }
1217
1218 } else {
1219
1220 int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
1221 int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
1222 int o = 0; // relative offset to base within plane
1223
1224 for(int n=0;n<_grid->_slice_nblock[dimension];n++){
1225 for(int b=0;b<_grid->_slice_block[dimension];b++){
1226
1227 int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
1228
1229 if ( ocb&cbmask ) {
1230 int idx = point+(lo+o+b)*this->_npoints;
1231 this->_entries[idx]._offset =ro+o+b;
1232 this->_entries[idx]._is_local=1;
1233 this->_entries[idx]._permute=permute;
1234 this->_entries[idx]._around_the_world=wrap;
1235 }
1236
1237 }
1238 o +=_grid->_slice_stride[dimension];
1239 }
1240
1241 }
1242 }
1243 // Routine builds up integer table for each site in _offsets, _is_local, _permute
1244 void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
1245 {
1246 int rd = _grid->_rdimensions[dimension];
1247
1248 if ( !_grid->CheckerBoarded(dimension) ) {
1249
1250 int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
1251 int o = 0; // relative offset to base within plane
1252 int bo = 0; // offset in buffer
1253
1254 // Simple block stride gather of SIMD objects
1255 for(int n=0;n<_grid->_slice_nblock[dimension];n++){
1256 for(int b=0;b<_grid->_slice_block[dimension];b++){
1257 int idx=point+(so+o+b)*this->_npoints;
1258 this->_entries[idx]._offset =offset+(bo++);
1259 this->_entries[idx]._is_local=0;
1260 this->_entries[idx]._permute=0;
1261 this->_entries[idx]._around_the_world=wrap;
1262 }
1263 o +=_grid->_slice_stride[dimension];
1264 }
1265
1266 } else {
1267
1268 int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
1269 int o = 0; // relative offset to base within plane
1270 int bo = 0; // offset in buffer
1271
1272 for(int n=0;n<_grid->_slice_nblock[dimension];n++){
1273 for(int b=0;b<_grid->_slice_block[dimension];b++){
1274
1275 int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
1276 if ( ocb & cbmask ) {
1277 int idx = point+(so+o+b)*this->_npoints;
1278 this->_entries[idx]._offset =offset+(bo++);
1279 this->_entries[idx]._is_local=0;
1280 this->_entries[idx]._permute =0;
1281 this->_entries[idx]._around_the_world=wrap;
1282 }
1283 }
1284 o +=_grid->_slice_stride[dimension];
1285 }
1286 }
1287 }
1288
  // Gather a face for a non-SIMD-split, communicated dimension: for each
  // off-node plane, fill the send buffer via the cached face table and
  // queue the point-to-point packet (and any deferred decompression).
  //   rhs      : source field; must live on _grid
  //   shift    : positive shift, in [0,fd)
  //   cbmask   : 0x3 both checkerboards, else single-checkerboard pass
  //   compress : compressor; supplies CommDatumSize / DecompressionStep
  //   face_idx : running face counter indexing the cached face tables
  //   point    : stencil point; selects send/recv enables (Dirichlet blocking)
  // Returns 0 unconditionally.
  template<class compressor>
  int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
  {
    typedef typename cobj::vector_type vector_type;

    int comms_send = this->_comms_send[point];
    int comms_recv = this->_comms_recv[point];
    //    int comms_partial_send = this->_comms_partial_send[point] ;
    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

    assert(rhs.Grid()==_grid);
    //    conformable(_grid,rhs.Grid());

    int fd          = _grid->_fdimensions[dimension];
    int rd          = _grid->_rdimensions[dimension];
    int pd          = _grid->_processors[dimension];
    int simd_layout = _grid->_simd_layout[dimension];
    int comm_dim    = _grid->_processors[dimension] >1 ;
    assert(simd_layout==1); // SIMD-split dimensions go via GatherSimd instead
    assert(comm_dim==1);
    assert(shift>=0);
    assert(shift<fd);

    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];

    int cb= (cbmask==0x2)? Odd : Even;
    int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

    for(int x=0;x<rd;x++){

      int sx        = (x+sshift)%rd;
      int comm_proc = ((x+sshift)/rd)%pd;

      if (comm_proc) {

        // Plane is off-node: work out wire sizes (zero when comms disabled)
        int words = buffer_size;
        if (cbmask != 0x3) words=words>>1;

        int bytes = words * compress.CommDatumSize();
        int xbytes;
        int rbytes;

        if ( comms_send ) xbytes = bytes; // Full send
        //	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
        else xbytes = 0; // full dirichlet

        if ( comms_recv ) rbytes = bytes;
        //	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
        else rbytes = 0;

        int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
        int comm_off = u_comm_offset;

        int recv_from_rank;
        int xmit_to_rank;
        cobj *recv_buf;
        cobj *send_buf;
        _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

        assert (xmit_to_rank   != _grid->ThisRank());
        assert (recv_from_rank != _grid->ThisRank());

        // First pass only: build the gather table for this face on the host
        // and cache it on the device for reuse on later exchanges
        if ( !face_table_computed ) {
          face_table.resize(face_idx+1);
          std::vector<std::pair<int,int> > face_table_host ;
          Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
          //	  std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<<std::endl;
          face_table[face_idx].resize(face_table_host.size());
          acceleratorCopyToDevice(&face_table_host[0],
                                  &face_table[face_idx][0],
                                  face_table[face_idx].size()*sizeof(face_table_host[0]));
        }


        // Receive into the SIMD staging buffer when a decompression step
        // follows, otherwise straight into the unified receive buffer
        //	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
        if ( compress.DecompressionStep()&&comms_recv) {
          recv_buf=u_simd_recv_buf[0];
        } else {
          recv_buf=this->u_recv_buf_p;
        }

        // potential SHM fast path for intranode
        int shm_send=0;
        int shm_recv=0;
#ifdef SHM_FAST_PATH
        // Put directly in place if we can
        send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
        if ( (send_buf==NULL) ) {
          shm_send=0;
          send_buf = this->u_send_buf_p;
        } else {
          shm_send=1;
        }
        void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
        if ( test_ptr != NULL ) shm_recv = 1;
        // static int printed;
        // if (!printed){
        //   std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
        //   printed = 1;
        // }
#else
        // Gather locally
        send_buf = this->u_send_buf_p; // Gather locally, must send
        assert(send_buf!=NULL);
#endif

        // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
        //	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
        compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);

        int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
        if ( !duplicate ) { // Force comms for now

          // Build a list of things to do after we synchronise GPUs
          // Start comms now???
          // NOTE(review): do_recv is gated on comms_send rather than
          // comms_recv — confirm this asymmetry is intentional (e.g. a
          // symmetric Dirichlet block) and not a typo for comms_recv.
          int do_send = (comms_send) && (!shm_send );
          int do_recv = (comms_send) && (!shm_recv );
          AddPacket((void *)&send_buf[comm_off],
                    (void *)&recv_buf[comm_off],
                    xmit_to_rank, do_send,
                    recv_from_rank, do_recv,
                    xbytes,rbytes);
        }

        // Queue decompression of the staged receive into the unified buffer
        if ( (compress.DecompressionStep() && comms_recv) ) {
          AddDecompress(&this->u_recv_buf_p[comm_off],
                        &recv_buf[comm_off],
                        words,Decompressions);
        }

        u_comm_offset+=words;
        face_idx++;
      }
    }
    return 0;
  }
1429
  // Gather a face in a SIMD-split dimension. The plane is gathered with an
  // extract producing one pointer per SIMD half (maxl=2), each half is
  // exchanged with the neighbour rank where off-node, and a merge is
  // queued to rebuild full SIMD vectors in the unified receive buffer.
  //   rhs      : source field
  //   shift    : positive shift, in [0,fd)
  //   cbmask   : must be 0x3 (asserted below)
  //   compress : compressor; supplies CommDatumSize / Gather_plane_exchange
  //   face_idx : running face counter indexing the cached face tables
  //   point    : stencil point; selects send/recv enables
  // Returns 0 unconditionally.
  template<class compressor>
  int  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx,int point)
  {
    const int Nsimd = _grid->Nsimd();

    const int maxl =2;// max layout in a direction

    int comms_send = this->_comms_send[point];
    int comms_recv = this->_comms_recv[point];
    //    int comms_partial_send = this->_comms_partial_send[point] ;
    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
    int ld = _grid->_ldimensions[dimension];
    int pd = _grid->_processors[dimension];
    int simd_layout = _grid->_simd_layout[dimension];
    int comm_dim    = _grid->_processors[dimension] >1 ;
    assert(comm_dim==1);
    // This will not work with a rotate dim
    assert(simd_layout==maxl);
    assert(shift>=0);
    assert(shift<fd);


    int permute_type=_grid->PermuteType(dimension);

    // Simd direction uses an extract/merge pair
    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
    //    int words = sizeof(cobj)/sizeof(vector_type);

    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
    // This assert will trap it if ever hit. Not hit normally so far
    int reduced_buffer_size = buffer_size;
    if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1;

    int datum_bytes = compress.CommDatumSize();
    int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;

    // how many bytes on wire : partial dirichlet or dirichlet may set to < bytes
    int xbytes;
    int rbytes;

    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);

    std::vector<cobj *> rpointers(maxl);
    std::vector<cobj *> spointers(maxl);

    // Work out what to send where

    int cb    = (cbmask==0x2)? Odd : Even;
    int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

    // loop over outer coord planes orthog to dim
    for(int x=0;x<rd;x++){

      // Plane needs comms iff the shifted coordinate leaves the reduced
      // local extent (some SIMD lane then lives on another rank)
      int any_offnode = ( ((x+sshift)%fd) >= rd );

      if ( any_offnode ) {

        int comm_off = u_comm_offset;
        for(int i=0;i<maxl;i++){
          spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
        }

        int sx   = (x+sshift)%rd;

        // First pass only: build and device-cache the gather table
        if ( !face_table_computed ) {
          face_table.resize(face_idx+1);
          std::vector<std::pair<int,int> > face_table_host ;

          Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
          face_table[face_idx].resize(face_table_host.size());
          acceleratorCopyToDevice(&face_table_host[0],
                                  &face_table[face_idx][0],
                                  face_table[face_idx].size()*sizeof(face_table_host[0]));

        }


        // Wire sizes; zero when comms are disabled for this point
        if ( comms_send ) xbytes = bytes;
        //	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
        else xbytes = 0;

        if ( comms_recv ) rbytes = bytes;
        //	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
        else rbytes = 0;

        // Gathers SIMD lanes for send and merge
        // Different faces can be full comms or partial comms with multiple ranks per node
        //	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
        if ( comms_send || comms_recv ) {

          //	  int partial = partialDirichlet;
          int partial = 0;
          compressor::Gather_plane_exchange(face_table[face_idx],rhs,
                                            spointers,dimension,sx,cbmask,
                                            compress,permute_type,partial );
        }
        face_idx++;

        //spointers[0] -- low simd coor
        //spointers[1] -- high simd coor
        for(int i=0;i<maxl;i++){

          int my_coor  = rd*i + x;       // self explanatory
          int nbr_coor = my_coor+sshift; // self explanatory

          int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
          int nbr_lcoor= (nbr_coor%ld);  // local plane coor on neighbour node
          int nbr_ic   = (nbr_lcoor)/rd; // inner coord of peer simd lane "i"
          int nbr_ox   = (nbr_lcoor%rd); // outer coord of peer "x"

          int nbr_plane = nbr_ic;
          assert (sx == nbr_ox);

          auto rp = &u_simd_recv_buf[i        ][comm_off];
          auto sp = &u_simd_send_buf[nbr_plane][comm_off];

          if(nbr_proc){

            int recv_from_rank;
            int xmit_to_rank;
            int shm_send=0;

            _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
#ifdef SHM_FAST_PATH
#warning STENCIL SHM FAST PATH SELECTED
            int shm_recv=0;
            // shm == receive pointer if offnode
            // shm == Translate[send pointer] if on node -- my view of his send pointer
            cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
            if (shm==NULL) {
              shm = rp;
              // we found a packet that comes from MPI and contributes to this shift.
              // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
              // Kernel will add the exterior_terms except if is_same_node.
              // leg of stencil
              shm_recv=0;
            } else {
              shm_recv=1;
            }
            rpointers[i] = shm;
            // Test send side
            void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
            if ( test_ptr != NULL ) shm_send = 1;
            // static int printed;
            // if (!printed){
            //   std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
            //   printed = 1;
            // }
#else
            rpointers[i] = rp;
#endif

            int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
            if ( !duplicate ) {
              if ( (bytes != rbytes) && (rbytes!=0) ){
                acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
              }
              //	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
              int do_send = (comms_send) && (!shm_send );
              // NOTE(review): the receive-enable argument below is do_send,
              // i.e. gated on comms_send/shm_send rather than a comms_recv
              // based flag — confirm this symmetry is intentional.
              AddPacket((void *)sp,(void *)rp,
                        xmit_to_rank,do_send,
                        recv_from_rank,do_send,
                        xbytes,rbytes);
            }

          } else {

            // On-node neighbour lane: merge directly from our own send half
            rpointers[i] = sp;

          }
        }
        // rpointer may be doing a remote read in the gather over SHM
        //	if ( comms_recv|comms_partial_recv ) {
        if ( comms_recv ) {
          AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
        }

        u_comm_offset +=buffer_size;

      }
    }
    return 0;
  }
1620
1621};
1623
1624#endif
#define accelerator_forNB(iterator, num, nsimd,...)
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
void acceleratorCopySynchronise(void)
void acceleratorFenceComputeStream(void)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
void acceleratorMemSet(void *base, int value, size_t bytes)
#define accelerator_barrier(dummy)
std::vector< T, devAllocator< T > > deviceVector
static const int Even
static const int Odd
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
#define perm(a, b, n, w)
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
accelerator_inline Grid_simd< S, V > abs(const Grid_simd< S, V > &r)
ViewMode
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
void DslashResetCounts(void)
Definition Stencil.cc:36
void DslashLogDirichlet(void)
Definition Stencil.cc:50
void Gather_plane_table_compute(GridBase *grid, int dimension, int plane, int cbmask, int off, std::vector< std::pair< int, int > > &table)
Definition Stencil.cc:54
void DslashLogFull(void)
Definition Stencil.cc:48
void DslashLogPartial(void)
void DslashGetCounts(uint64_t &dirichlet, uint64_t &partial, uint64_t &full)
Definition Stencil.cc:42
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
uint64_t base
void ShiftedRanks(int dim, int shift, int &source, int &dest)
StencilVector _comms_send
Definition Stencil.h:117
StencilVector _permute_type
Definition Stencil.h:126
StencilVector _comm_buf_size
Definition Stencil.h:125
StencilEntry * _entries_p
Definition Stencil.h:131
accelerator_inline void iCoorFromIindex(Coordinate &coor, int lane) const
Definition Stencil.h:167
accelerator_inline StencilEntry * GetEntry(int &ptype, int point, int osite) const
Definition Stencil.h:143
int GetNodeLocal(int osite, int point) const
Definition Stencil.h:139
accelerator_inline uint64_t GetInfo(int &ptype, int &local, int &perm, int point, int ent, uint64_t base) const
Definition Stencil.h:148
StencilVector _comms_recv
Definition Stencil.h:118
accelerator_inline cobj * CommBuf(void) const
Definition Stencil.h:136
StencilVector same_node
Definition Stencil.h:127
StencilEntry * _entries_host_p
Definition Stencil.h:132
AcceleratorVector< int, STENCIL_MAX > StencilVector
Definition Stencil.h:102
accelerator_inline uint64_t GetPFInfo(int ent, uint64_t base) const
Definition Stencil.h:160
StencilVector _directions
Definition Stencil.h:111
StencilVector _distances
Definition Stencil.h:112
CartesianStencilView(const CartesianStencilView &refer_to_me)=default
void ViewOpen(ViewMode _mode)
Definition Stencil.h:188
CartesianStencilView(const CartesianStencilAccelerator< vobj, cobj, Parameters > &refer_to_me, ViewMode _mode)
Definition Stencil.h:183
void ViewClose(void)
Definition Stencil.h:193
const CartesianStencilView< SiteSpinor, SiteSpinor, ImplParams > View_type
Definition Stencil.h:206
void SetSloppyComms(int sloppy)
Definition Stencil.h:298
void CommsMergeSHM(decompressor decompress)
Definition Stencil.h:809
void ScatterPlane(int point, int dimension, int plane, int cbmask, int offset, int wrap)
Definition Stencil.h:1244
void Prepare(void)
Definition Stencil.h:701
void Comms(int point, int dimension, int shiftpm, int cbmask)
Definition Stencil.h:1101
void CommunicateComplete(std::vector< std::vector< CommsRequest_t > > &reqs)
Definition Stencil.h:574
void HaloExchange(const Lattice< vobj > &source, compressor &compress)
Definition Stencil.h:610
int GatherSimd(const Lattice< vobj > &rhs, int dimension, int shift, int cbmask, compressor &compress, int &face_idx, int point)
Definition Stencil.h:1431
void DecompressPacket(Packet &packet)
Definition Stencil.h:406
void AddMerge(cobj *merge_p, std::vector< cobj * > &rpointers, Integer buffer_size, Integer type, std::vector< Merge > &mv)
Definition Stencil.h:795
std::vector< deviceVector< std::pair< int, int > > > face_table
Definition Stencil.h:315
int Gather(const Lattice< vobj > &rhs, int dimension, int shift, int cbmask, compressor &compress, int &face_idx, int point)
Definition Stencil.h:1290
void * DeviceBufferMalloc(size_t bytes)
Definition Stencil.h:268
void HaloGather(const Lattice< vobj > &source, compressor &compress)
Definition Stencil.h:670
void CopyPlane(int point, int dimension, int lplane, int rplane, int cbmask, int permute, int wrap)
Definition Stencil.h:1196
void CommsMerge(decompressor decompress)
Definition Stencil.h:805
void AddDecompress(cobj *k_p, cobj *m_p, Integer buffer_size, std::vector< Decompress > &dv)
Definition Stencil.h:786
void Local(int point, int dimension, int shiftpm, int cbmask)
Definition Stencil.h:1056
std::vector< CachedTransfer > CachedTransfers
Definition Stencil.h:326
void BuildSurfaceList(int Ls, int vol4)
Definition Stencil.h:840
void CommsMerge(decompressor decompress, std::vector< Merge > &mm, std::vector< Decompress > &dd)
Definition Stencil.h:815
void CommunicateBegin(std::vector< std::vector< CommsRequest_t > > &reqs)
Definition Stencil.h:513
GridBase * Grid(void) const
Definition Stencil.h:292
View_type View(ViewMode mode) const
Definition Stencil.h:307
void AddCopy(void *from, void *to, Integer bytes)
Definition Stencil.h:712
std::vector< CopyReceiveBuffer > CopyReceiveBuffers
Definition Stencil.h:325
int SameNode(int point)
Definition Stencil.h:345
void PrecomputeByteOffsets(void)
Definition Stencil.h:828
void AddPacket(void *xmit, void *rcv, Integer to, Integer do_send, Integer from, Integer do_recv, Integer xbytes, Integer rbytes)
Definition Stencil.h:769
void Communicate(void)
Definition Stencil.h:600
CartesianStencil(GridBase *grid, int npoints, int checkerboard, const std::vector< int > &directions, const std::vector< int > &distances, Parameters p=Parameters(), bool preserve_shm=false)
Definition Stencil.h:931
int HaloGatherDir(const Lattice< vobj > &source, compressor &compress, int point, int &face_idx)
Definition Stencil.h:620
void DirichletBlock(const Coordinate &dirichlet_block)
Introduce a block structure and switch off comms on boundaries.
Definition Stencil.h:884
Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf, Integer lane, Integer xbytes, Integer rbytes, Integer cb)
Definition Stencil.h:736
void CommsCopy()
Definition Stencil.h:720
void DeviceBufferFreeAll(void)
Definition Stencil.h:279
void CompressPacket(Packet &packet)
Definition Stencil.h:448
static bool StepLog(const char *name)
static void recvLog(void *, uint64_t bytes, int rank)
static void xmitLog(void *, uint64_t bytes)
Coordinate _fdimensions
int PermuteType(int dimension)
Coordinate _rdimensions
int Nd(void) const
Coordinate _simd_layout
Coordinate _ostride
Coordinate _ldimensions
int Nsimd(void) const
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
void * ShmBufferTranslate(int rank, void *local_p)
static deviceVector< unsigned char > DeviceCommBuf
Definition Stencil.h:75
GridTypeMapper< scalar_type >::Realified real_scalar_type
static constexpr int Nsimd
Definition Stencil.h:236
static constexpr int Nsimd
Definition Stencil.h:226
std::vector< cobj * > vpointers
Definition Stencil.h:229
Coordinate dirichlet
Definition Stencil.h:57
uint8_t _permute
Definition Stencil.h:93
uint8_t _around_the_world
Definition Stencil.h:94
uint64_t _byte_offset
Definition Stencil.h:89
uint8_t _is_local
Definition Stencil.h:92
uint64_t _offset
Definition Stencil.h:90
uint8_t _pad
Definition Stencil.h:95