Grid 0.7.0
Cshift_mpi.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/cshift/Cshift_mpi.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: paboyle <paboyle@ph.ed.ac.uk>
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License along
23 with this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25
26 See the full license in the file "LICENSE" in the top level distribution directory
27*************************************************************************************/
28/* END LEGAL */
29#ifndef _GRID_CSHIFT_MPI_H_
30#define _GRID_CSHIFT_MPI_H_
31
32
34const int Cshift_verbose=0;
35template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
36{
37 typedef typename vobj::vector_type vector_type;
38 typedef typename vobj::scalar_type scalar_type;
39
40 Lattice<vobj> ret(rhs.Grid());
41
42 int fd = rhs.Grid()->_fdimensions[dimension];
43 int rd = rhs.Grid()->_rdimensions[dimension];
44
45 // Map to always positive shift modulo global full dimension.
46 shift = (shift+fd)%fd;
47
48 ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
49
50 // the permute type
51 int simd_layout = rhs.Grid()->_simd_layout[dimension];
52 int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
53 int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
54
55 RealD t1,t0;
56 t0=usecond();
57 if ( !comm_dim ) {
58 // std::cout << "CSHIFT: Cshift_local" <<std::endl;
59 Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
60 } else if ( splice_dim ) {
61 // std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
62 Cshift_comms_simd(ret,rhs,dimension,shift);
63 } else {
64 // std::cout << "CSHIFT: Cshift_comms" <<std::endl;
65 Cshift_comms(ret,rhs,dimension,shift);
66 }
67 t1=usecond();
68 if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
69 return ret;
70}
71
72template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
73{
74 int sshift[2];
75
76 sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
77 sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
78
79 // std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
80 if ( sshift[0] == sshift[1] ) {
81 // std::cout << "Single pass Cshift_comms" <<std::endl;
82 Cshift_comms(ret,rhs,dimension,shift,0x3);
83 } else {
84 // std::cout << "Two pass Cshift_comms" <<std::endl;
85 Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
86 Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
87 }
88}
89
90template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
91{
92 int sshift[2];
93
94 sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
95 sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
96
97 // std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
98 if ( sshift[0] == sshift[1] ) {
99 // std::cout << "Single pass Cshift_comms" <<std::endl;
100 Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
101 } else {
102 // std::cout << "Two pass Cshift_comms" <<std::endl;
103 Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
104 Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
105 }
106}
107template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
108{
109 typedef typename vobj::vector_type vector_type;
110 typedef typename vobj::scalar_type scalar_type;
111
112 GridBase *grid=rhs.Grid();
113 Lattice<vobj> temp(rhs.Grid());
114
115 int fd = rhs.Grid()->_fdimensions[dimension];
116 int rd = rhs.Grid()->_rdimensions[dimension];
117 int pd = rhs.Grid()->_processors[dimension];
118 int simd_layout = rhs.Grid()->_simd_layout[dimension];
119 int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
120 assert(simd_layout==1);
121 assert(comm_dim==1);
122 assert(shift>=0);
123 assert(shift<fd);
124
125 int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
126 static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
127 static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
128#ifndef ACCELERATOR_AWARE_MPI
129 static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
130 static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
131#endif
132
133 int cb= (cbmask==0x2)? Odd : Even;
134 int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
135
136 // Calculate Cshift_vector - it's the same for all slices
137 CalculateCshiftVector<vobj>(ret, rhs, dimension, cbmask);
138 // Copy it to the device
140
141 RealD tcopy=0.0;
142 RealD tgather=0.0;
143 RealD tscatter=0.0;
144 RealD tcomms=0.0;
145 uint64_t xbytes=0;
146 for(int x=0;x<rd;x++){
147
148 int sx = (x+sshift)%rd;
149 int comm_proc = ((x+sshift)/rd)%pd;
150
151 if (comm_proc==0) {
152 FlightRecorder::StepLog("Cshift_Copy_plane");
153 tcopy-=usecond();
154 Copy_plane(ret,rhs,dimension,x,sx,cbmask);
155 tcopy+=usecond();
156 FlightRecorder::StepLog("Cshift_Copy_plane_complete");
157 } else {
158
159 int words = buffer_size;
160 if (cbmask != 0x3) words=words>>1;
161
162 int bytes = words * sizeof(vobj);
163
164 FlightRecorder::StepLog("Cshift_Gather_plane");
165 tgather-=usecond();
166 Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
167 tgather+=usecond();
168 FlightRecorder::StepLog("Cshift_Gather_plane_complete");
169
170 // int rank = grid->_processor;
171 int recv_from_rank;
172 int xmit_to_rank;
173
174 grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
175
176 tcomms-=usecond();
177 grid->Barrier();
178
179 FlightRecorder::StepLog("Cshift_SendRecv");
180#ifdef ACCELERATOR_AWARE_MPI
181 grid->SendToRecvFrom((void *)&send_buf[0],
182 xmit_to_rank,
183 (void *)&recv_buf[0],
184 recv_from_rank,
185 bytes);
186#else
187 // bouncy bouncy
188 acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
189 grid->SendToRecvFrom((void *)&hsend_buf[0],
190 xmit_to_rank,
191 (void *)&hrecv_buf[0],
192 recv_from_rank,
193 bytes);
194 acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
195#endif
196 FlightRecorder::StepLog("Cshift_SendRecv_complete");
197
198 xbytes+=bytes;
199 grid->Barrier();
200 tcomms+=usecond();
201 FlightRecorder::StepLog("Cshift_barrier_complete");
202
203 tscatter-=usecond();
204 Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
205 tscatter+=usecond();
206 }
207 }
208 if (Cshift_verbose){
209 std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
210 std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
211 std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
212 std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
213 std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
214 }
215}
216
217template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
218{
219 GridBase *grid=rhs.Grid();
220 const int Nsimd = grid->Nsimd();
221 typedef typename vobj::vector_type vector_type;
222 typedef typename vobj::scalar_object scalar_object;
223 typedef typename vobj::scalar_type scalar_type;
224
225 int fd = grid->_fdimensions[dimension];
226 int rd = grid->_rdimensions[dimension];
227 int ld = grid->_ldimensions[dimension];
228 int pd = grid->_processors[dimension];
229 int simd_layout = grid->_simd_layout[dimension];
230 int comm_dim = grid->_processors[dimension] >1 ;
231
232 // std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
233 // << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
234 // << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
235
236 assert(comm_dim==1);
237 assert(simd_layout==2);
238 assert(shift>=0);
239 assert(shift<fd);
240
241 RealD tcopy=0.0;
242 RealD tgather=0.0;
243 RealD tscatter=0.0;
244 RealD tcomms=0.0;
245 uint64_t xbytes=0;
246
247 int permute_type=grid->PermuteType(dimension);
248
250 // Simd direction uses an extract/merge pair
252 int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
253 // int words = sizeof(vobj)/sizeof(vector_type);
254
255 static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
256 static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
257 scalar_object * recv_buf_extract_mpi;
258 scalar_object * send_buf_extract_mpi;
259
260 for(int s=0;s<Nsimd;s++){
261 send_buf_extract[s].resize(buffer_size);
262 recv_buf_extract[s].resize(buffer_size);
263 }
264#ifndef ACCELERATOR_AWARE_MPI
265 hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
266 hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
267#endif
268
269 int bytes = buffer_size*sizeof(scalar_object);
270
271 ExtractPointerArray<scalar_object> pointers(Nsimd); //
272 ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
273
275 // Work out what to send where
277 int cb = (cbmask==0x2)? Odd : Even;
278 int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
279
280 // loop over outer coord planes orthog to dim
281 for(int x=0;x<rd;x++){
282
283 // FIXME call local permute copy if none are offnode.
284 for(int i=0;i<Nsimd;i++){
285 pointers[i] = &send_buf_extract[i][0];
286 }
287 int sx = (x+sshift)%rd;
288 tgather-=usecond();
289 Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
290 tgather+=usecond();
291
292 for(int i=0;i<Nsimd;i++){
293
294 int inner_bit = (Nsimd>>(permute_type+1));
295 int ic= (i&inner_bit)? 1:0;
296
297 int my_coor = rd*ic + x;
298 int nbr_coor = my_coor+sshift;
299 int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
300
301 int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
302 int nbr_ox = (nbr_coor%rd); // outer coord of peer
303 int nbr_lane = (i&(~inner_bit));
304
305 int recv_from_rank;
306 int xmit_to_rank;
307
308 if (nbr_ic) nbr_lane|=inner_bit;
309
310 assert (sx == nbr_ox);
311
312 if(nbr_proc){
313 grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
314
315 tcomms-=usecond();
316 grid->Barrier();
317
318 send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
319 recv_buf_extract_mpi = &recv_buf_extract[i][0];
320#ifdef ACCELERATOR_AWARE_MPI
321 grid->SendToRecvFrom((void *)send_buf_extract_mpi,
322 xmit_to_rank,
323 (void *)recv_buf_extract_mpi,
324 recv_from_rank,
325 bytes);
326#else
327 // bouncy bouncy
328 acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
329 grid->SendToRecvFrom((void *)&hsend_buf[0],
330 xmit_to_rank,
331 (void *)&hrecv_buf[0],
332 recv_from_rank,
333 bytes);
334 acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
335#endif
336
337 xbytes+=bytes;
338 grid->Barrier();
339 tcomms+=usecond();
340
341 rpointers[i] = &recv_buf_extract[i][0];
342 } else {
343 rpointers[i] = &send_buf_extract[nbr_lane][0];
344 }
345
346 }
347 tscatter-=usecond();
348 Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
349 tscatter+=usecond();
350 }
351 if(Cshift_verbose){
352 std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
353 std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
354 std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
355 std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
356 std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
357 }
358}
359
361
362#endif
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
std::vector< T, alignedAllocator< T > > hostVector
static const int Even
static const int Odd
void Copy_plane(Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int dimension, int lplane, int rplane, int cbmask)
void Gather_plane_extract(const Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
std::vector< int > Cshift_vector
void CalculateCshiftVector(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int cbmask)
void Scatter_plane_simple(Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask)
void Scatter_plane_merge(Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
void MapCshiftCopy(std::vector< vobj > &Cshift_obj, deviceVector< vobj > &Cshift_obj_device)
void Cshift_local(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
deviceVector< int > Cshift_vector_device
void Gather_plane_simple(const Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask, int off=0)
const int Cshift_verbose
Definition Cshift_mpi.h:34
Lattice< vobj > Cshift(const Lattice< vobj > &rhs, int dimension, int shift)
Definition Cshift_mpi.h:35
void Cshift_comms_simd(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
Definition Cshift_mpi.h:90
void Cshift_comms(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
Definition Cshift_mpi.h:72
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN")
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
double RealD
Definition Simd.h:61
AcceleratorVector< __T *, GRID_MAX_SIMD > ExtractPointerArray
double usecond(void)
Definition Timer.h:50
accelerator_inline void resize(size_type sz)
Definition Coordinate.h:54
void SendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, int recv_from_rank, int bytes)
void ShiftedRanks(int dim, int shift, int &source, int &dest)
static bool StepLog(const char *name)
Coordinate _fdimensions
int PermuteType(int dimension)
Coordinate _slice_nblock
Coordinate _slice_block
Coordinate _rdimensions
Coordinate _simd_layout
Coordinate _ldimensions
virtual int CheckerBoardDestination(int source_cb, int shift, int dim)=0
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int cb)=0
int Nsimd(void) const
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const