Grid 0.7.0
Cshift_common.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/cshift/Cshift_common.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26 *************************************************************************************/
27 /* END LEGAL */
28#pragma once
29
31
32extern std::vector<std::pair<int,int> > Cshift_table;
34extern std::vector<int> Cshift_vector;
36
37// Copy Cshift map object (table or vector) to device
38template<class vobj>
39inline void MapCshiftCopy(std::vector<vobj> &Cshift_obj, deviceVector<vobj> &Cshift_obj_device)
40{
41 // GPU version only
42 uint64_t sz=Cshift_obj.size();
43 if (Cshift_obj_device.size()!=sz ) {
44 Cshift_obj_device.resize(sz);
45 }
46 acceleratorCopyToDevice((void *)&Cshift_obj[0],
47 (void *)&Cshift_obj_device[0],
48 sizeof(Cshift_obj[0])*sz);
49
50}
51
52// Copy Cshift map object (table or vector) to device and return pointer to device copy
53template<class vobj>
54inline vobj *MapCshift(std::vector<vobj> &Cshift_obj, deviceVector<vobj> &Cshift_obj_device)
55{
56 MapCshiftCopy<vobj>(Cshift_obj, Cshift_obj_device);
57
58 return &Cshift_obj_device[0];
59}
60
61// Calculate Cshift_vector
62template<class vobj>
63void CalculateCshiftVector(Lattice<vobj> &ret, const Lattice<vobj> &rhs, int dimension, int cbmask)
64{
65 GridBase *grid = rhs.Grid();
66
67 if ( !grid->CheckerBoarded(dimension) ) {
68 cbmask=0x3;
69 }
70
71 int e1=grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
72 int e2=grid->_slice_block[dimension];
73 int stride = grid->_slice_stride[dimension];
74
75 if (Cshift_vector.size() < e1*e2) Cshift_vector.resize(e1*e2); // Let it grow to biggest
76
77 int ent = 0;
78 if(cbmask == 0x3 ){
79 for(int n=0;n<e1;n++){
80 for(int b=0;b<e2;b++){
81 int o =n*stride+b;
82 Cshift_vector[ent++] = o;
83 }
84 }
85 } else {
86 for(int n=0;n<e1;n++){
87 for(int b=0;b<e2;b++){
88 int o =n*stride+b;
89 int ocb=1<<ret.Grid()->CheckerBoardFromOindex(o);
90 if ( ocb&cbmask ) {
91 Cshift_vector[ent++] = o;
92 }
93 }
94 }
95 }
96
97 if (ent < Cshift_vector.size()) Cshift_vector.resize(ent); // trim vector to actual size (relevant for checkerboarded dimensions)
98}
99
100
102// Gather for when there is no need to SIMD split
104template<class vobj> void
105Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
106{
107 int rd = rhs.Grid()->_rdimensions[dimension];
108
109 if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
110 cbmask = 0x3;
111 }
112
113 int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
114 int e1=rhs.Grid()->_slice_nblock[dimension];
115 int e2=rhs.Grid()->_slice_block[dimension];
116 int ent = 0;
117
118 if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
119
120 int stride=rhs.Grid()->_slice_stride[dimension];
121
122 if ( cbmask == 0x3 ) {
123 for(int n=0;n<e1;n++){
124 for(int b=0;b<e2;b++){
125 int o = n*stride;
126 int bo = n*e2;
127 Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
128 }
129 }
130 } else {
131 int bo=0;
132 for(int n=0;n<e1;n++){
133 for(int b=0;b<e2;b++){
134 int o = n*stride;
135 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
136 if ( ocb &cbmask ) {
137 Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
138 }
139 }
140 }
141 }
142 {
143 auto buffer_p = & buffer[0];
145 autoView(rhs_v , rhs, AcceleratorRead);
146 accelerator_for(i,ent,vobj::Nsimd(),{
147 coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
148 });
149 }
150}
151
153// Gather for when there *is* need to SIMD split
155template<class vobj> void
158 int dimension,int plane,int cbmask)
159{
160 int rd = rhs.Grid()->_rdimensions[dimension];
161
162 if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
163 cbmask = 0x3;
164 }
165
166 int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
167
168 int e1=rhs.Grid()->_slice_nblock[dimension];
169 int e2=rhs.Grid()->_slice_block[dimension];
170 int n1=rhs.Grid()->_slice_stride[dimension];
171
172 if ( cbmask ==0x3){
173 autoView(rhs_v , rhs, AcceleratorRead);
174 accelerator_for(nn,e1*e2,1,{
175 int n = nn%e1;
176 int b = nn/e1;
177 int o = n*n1;
178 int offset = b+n*e2;
179
180 vobj temp =rhs_v[so+o+b];
181 extract<vobj>(temp,pointers,offset);
182 });
183 } else {
184 Coordinate rdim=rhs.Grid()->_rdimensions;
186 std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
187 autoView(rhs_v , rhs, AcceleratorRead);
188 accelerator_for(nn,e1*e2,1,{
189 int n = nn%e1;
190 int b = nn/e1;
191
192 Coordinate coor;
193
194 int o=n*n1;
195 int oindex = o+b;
196
197 int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
198
199 int ocb=1<<cb;
200 int offset = b+n*e2;
201
202 if ( ocb & cbmask ) {
203 vobj temp =rhs_v[so+o+b];
204 extract<vobj>(temp,pointers,offset);
205 }
206 });
207 }
208}
209
211// Scatter for when there is no need to SIMD split
213template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
214{
215 int rd = rhs.Grid()->_rdimensions[dimension];
216
217 if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
218 cbmask=0x3;
219 }
220
221 int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
222
223 int e1=rhs.Grid()->_slice_nblock[dimension];
224 int e2=rhs.Grid()->_slice_block[dimension];
225 int stride=rhs.Grid()->_slice_stride[dimension];
226
227 if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
228
229 int ent =0;
230
231 if ( cbmask ==0x3 ) {
232
233 for(int n=0;n<e1;n++){
234 for(int b=0;b<e2;b++){
235 int o =n*rhs.Grid()->_slice_stride[dimension];
236 int bo =n*rhs.Grid()->_slice_block[dimension];
237 Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
238 }
239 }
240
241 } else {
242 int bo=0;
243 for(int n=0;n<e1;n++){
244 for(int b=0;b<e2;b++){
245 int o =n*rhs.Grid()->_slice_stride[dimension];
246 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
247 if ( ocb & cbmask ) {
248 Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
249 }
250 }
251 }
252 }
253
254 {
255 auto buffer_p = & buffer[0];
257 autoView( rhs_v, rhs, AcceleratorWrite);
258 accelerator_for(i,ent,vobj::Nsimd(),{
259 coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
260 });
261 }
262}
263
265// Scatter for when there *is* need to SIMD split
267template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
268{
269 int rd = rhs.Grid()->_rdimensions[dimension];
270
271 if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
272 cbmask=0x3;
273 }
274
275 int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
276
277 int e1=rhs.Grid()->_slice_nblock[dimension];
278 int e2=rhs.Grid()->_slice_block[dimension];
279
280 if(cbmask ==0x3 ) {
281 int _slice_stride = rhs.Grid()->_slice_stride[dimension];
282 int _slice_block = rhs.Grid()->_slice_block[dimension];
283 autoView( rhs_v , rhs, AcceleratorWrite);
284 accelerator_for(nn,e1*e2,1,{
285 int n = nn%e1;
286 int b = nn/e1;
287 int o = n*_slice_stride;
288 int offset = b+n*_slice_block;
289 merge(rhs_v[so+o+b],pointers,offset);
290 });
291 } else {
292
293 // Case of SIMD split AND checker dim cannot currently be hit, except in
294 // Test_cshift_red_black code.
295 std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
296 std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
297 assert(0); // This will fail if hit on GPU
298 autoView( rhs_v, rhs, CpuWrite);
299 for(int n=0;n<e1;n++){
300 for(int b=0;b<e2;b++){
301 int o = n*rhs.Grid()->_slice_stride[dimension];
302 int offset = b+n*rhs.Grid()->_slice_block[dimension];
303 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
304 if ( ocb&cbmask ) {
305 merge(rhs_v[so+o+b],pointers,offset);
306 }
307 }
308 }
309 }
310}
311
313// local to node block strided copies
315
316template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
317{
318
319
320 int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
321 int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
322
323 auto table = &Cshift_vector_device[0];
324
325 autoView(rhs_v , rhs, AcceleratorRead);
326 autoView(lhs_v , lhs, AcceleratorWrite);
327 accelerator_for(i,Cshift_vector.size(),vobj::Nsimd(),{
328 coalescedWrite(lhs_v[table[i]+lo],coalescedRead(rhs_v[table[i]+ro]));
329 });
330}
331
332template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
333{
334
335 int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
336 int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
337
338 auto table = &Cshift_vector_device[0];
339 autoView( rhs_v, rhs, AcceleratorRead);
340 autoView( lhs_v, lhs, AcceleratorWrite);
341 accelerator_for(i,Cshift_vector.size(),1,{
342 permute(lhs_v[table[i]+lo],rhs_v[table[i]+ro],permute_type);
343 });
344}
345
347// Local to node Cshift
349template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
350{
351 int sshift[2];
352
353 sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
354 sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
355
356 if ( sshift[0] == sshift[1] ) {
357 Cshift_local(ret,rhs,dimension,shift,0x3);
358 } else {
359 Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
360 Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
361 }
362}
363
364template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
365{
366 GridBase *grid = rhs.Grid();
367 int fd = grid->_fdimensions[dimension];
368 int rd = grid->_rdimensions[dimension];
369 int ld = grid->_ldimensions[dimension];
370 int gd = grid->_gdimensions[dimension];
371 int ly = grid->_simd_layout[dimension];
372
373 // Map to always positive shift modulo global full dimension.
374 shift = (shift+fd)%fd;
375
376 int cb= (cbmask==0x2)? Odd : Even;
377 int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
378
379 // the permute type
380 ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
381 int permute_dim =grid->PermuteDim(dimension);
382 int permute_type=grid->PermuteType(dimension);
383 int permute_type_dist;
384
385 // wrap is whether sshift > rd.
386 // num is sshift mod rd.
387 //
388 // shift 7
389 //
390 // XoXo YcYc
391 // oXoX cYcY
392 // XoXo YcYc
393 // oXoX cYcY
394 //
395 // sshift --
396 //
397 // XX YY ; 3
398 // XX YY ; 0
399 // XX YY ; 3
400 // XX YY ; 0
401 //
402 int wrap = sshift/rd; wrap=wrap % ly;
403 int num = sshift%rd;
404
405 // Calculate Cshift_vector - it's the same for all slices
406 CalculateCshiftVector<vobj>(ret, rhs, dimension, cbmask);
407 // Copy it to the device
409
410 for(int x=0;x<rd;x++){
411
412 int sx = (x+sshift)%rd;
413
414 int permute_slice=0;
415 if(permute_dim){
416 if ( x< rd-num ) permute_slice=wrap;
417 else permute_slice = (wrap+1)%ly;
418
419 if ( (ly>2) && (permute_slice) ) {
420 assert(permute_type & RotateBit);
421 permute_type_dist = permute_type|permute_slice;
422 } else {
423 permute_type_dist = permute_type;
424 }
425 }
426
427 if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
428 else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
429
430 }
431}
433
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
static const int Even
static const int Odd
accelerator_inline int RedBlackCheckerBoardFromOindex(int oindex, const Coordinate &rdim, const Coordinate &chk_dim_msk)
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
void Copy_plane(Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int dimension, int lplane, int rplane, int cbmask)
void Copy_plane_permute(Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int dimension, int lplane, int rplane, int cbmask, int permute_type)
deviceVector< std::pair< int, int > > Cshift_table_device
void Gather_plane_extract(const Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
std::vector< int > Cshift_vector
void CalculateCshiftVector(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int cbmask)
vobj * MapCshift(std::vector< vobj > &Cshift_obj, deviceVector< vobj > &Cshift_obj_device)
void Scatter_plane_simple(Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask)
void Scatter_plane_merge(Lattice< vobj > &rhs, ExtractPointerArray< typename vobj::scalar_object > pointers, int dimension, int plane, int cbmask)
std::vector< std::pair< int, int > > Cshift_table
void MapCshiftCopy(std::vector< vobj > &Cshift_obj, deviceVector< vobj > &Cshift_obj_device)
void Cshift_local(Lattice< vobj > &ret, const Lattice< vobj > &rhs, int dimension, int shift)
deviceVector< int > Cshift_vector_device
void Gather_plane_simple(const Lattice< vobj > &rhs, deviceVector< vobj > &buffer, int dimension, int plane, int cbmask, int off=0)
#define autoView(l_v, l, mode)
@ AcceleratorRead
@ AcceleratorWrite
@ CpuWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
#define RotateBit
Definition Simd.h:54
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
AcceleratorVector< __T *, GRID_MAX_SIMD > ExtractPointerArray
accelerator void extract(const vobj &vec, ExtractBuffer< sobj > &extracted)
accelerator void merge(vobj &vec, ExtractBuffer< sobj > &extracted)
int PermuteDim(int dimension)
Coordinate _slice_stride
Coordinate _fdimensions
virtual int CheckerBoardFromOindex(int Oindex)=0
int PermuteType(int dimension)
Coordinate _slice_nblock
Coordinate _slice_block
Coordinate _checker_dim_mask
virtual int CheckerBoarded(int dim)=0
Coordinate _rdimensions
Coordinate _simd_layout
Coordinate _ostride
Coordinate _ldimensions
virtual int CheckerBoardDestination(int source_cb, int shift, int dim)=0
Coordinate _gdimensions
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int cb)=0
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const