Grid 0.7.0
MultiRHSBlockProject.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: MultiRHSDeflation.h
6
7 Copyright (C) 2023
8
9Author: Peter Boyle <pboyle@bnl.gov>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
31
32
33/*
34 MultiRHS block projection
35
36 Import basis -> nblock x nbasis x (block x internal)
37 Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
38
39 => coarse_(nrhs x nbasis )^block = via batched GEMM
40
41//template<class vobj,class CComplex,int nbasis,class VLattice>
42//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
43// const VLattice &fineData,
44// const VLattice &Basis)
45*/
46
47template<class Field>
49{
50public:
51
52 typedef typename Field::scalar_type scalar;
53 typedef typename Field::scalar_object scalar_object;
54 typedef Field Fermion;
55
56 int nbasis;
59 uint64_t block_vol;
60 uint64_t fine_vol;
61 uint64_t coarse_vol;
62 uint64_t words;
63
64 // Row major layout "C" order:
65 // BLAS_V[coarse_vol][nbasis][block_vol][words]
66 // BLAS_F[coarse_vol][nrhs][block_vol][words]
67 // BLAS_C[coarse_vol][nrhs][nbasis]
68 /*
69 * in Fortran column major notation (cuBlas order)
70 *
71 * Vxb = [v1(x)][..][vn(x)] ... x coarse vol
72 *
73 * Fxr = [r1(x)][..][rm(x)] ... x coarse vol
74 *
75 * Block project:
76 * C_br = V^dag F x coarse vol
77 *
78 * Block promote:
79 * F_xr = Vxb Cbr x coarse_vol
80 */
81 deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
82 deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
83 deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
84
86 {
87 scalar ss(0.0);
88 std::vector<scalar> tmp(blas.size());
89 acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
90 for(int64_t s=0;s<blas.size();s++){
91 ss=ss+tmp[s]*adj(tmp[s]);
92 }
93 coarse_grid->GlobalSum(ss);
94 return real(ss);
95 }
96
99
100 void Deallocate(void)
101 {
102 nbasis=0;
103 coarse_grid=nullptr;
104 fine_grid=nullptr;
105 fine_vol=0;
106 block_vol=0;
107 coarse_vol=0;
108 words=0;
109 BLAS_V.resize(0);
110 BLAS_F.resize(0);
111 BLAS_C.resize(0);
112 }
113 void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
114 {
115 nbasis=_nbasis;
116
117 fine_grid=_fgrid;
118 coarse_grid=_cgrid;
119
120 fine_vol = fine_grid->lSites();
121 coarse_vol = coarse_grid->lSites();
123
124 words = sizeof(scalar_object)/sizeof(scalar);
125
126 BLAS_V.resize (fine_vol * words * nbasis );
127 }
128 void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
129 {
130 int nvec = vecs.size();
131 typedef typename Field::vector_object vobj;
132 // std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
133
134 assert(vecs[0].Grid()==fine_grid);
135
136 subdivides(coarse_grid,fine_grid); // require they map
137
138 int _ndimension = coarse_grid->_ndimension;
139 assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
140
141 Coordinate block_r (_ndimension);
142 for(int d=0 ; d<_ndimension;d++){
143 block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
144 }
145
146 uint64_t sz = blas.size();
147
148 acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
149
150 Coordinate fine_rdimensions = fine_grid->_rdimensions;
151 Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
152 int64_t bv= block_vol;
153 for(int v=0;v<vecs.size();v++){
154
155 // std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
156 autoView( fineData , vecs[v], AcceleratorRead);
157
158 auto blasData_p = &blas[0];
159 auto fineData_p = &fineData[0];
160
161 int64_t osites = fine_grid->oSites();
162
163 // loop over fine sites
164 const int Nsimd = vobj::Nsimd();
165 // std::cout << "sz "<<sz<<std::endl;
166 // std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
167 assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
168 uint64_t lwords= words; // local variable for copy in to GPU
169 accelerator_for(sf,osites,Nsimd,{
170#ifdef GRID_SIMT
171 {
172 int lane=acceleratorSIMTlane(Nsimd); // buffer lane
173#else
174 for(int lane=0;lane<Nsimd;lane++) {
175#endif
176 // One thread per fine site
177 Coordinate coor_f(_ndimension);
178 Coordinate coor_b(_ndimension);
179 Coordinate coor_c(_ndimension);
180
181 // Fine site to fine coor
182 Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
183
184 for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
185 for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
186
187 int sc;// coarse site
188 int sb;// block site
189 Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
190 Lexicographic::IndexFromCoor(coor_b,sb,block_r);
191
192 scalar_object data = extractLane(lane,fineData[sf]);
193
194 // BLAS layout address calculation
195 // words * block_vol * nbasis x coarse_vol
196 // coarse oSite x block vole x lanes
197 int64_t site = (lane*osites + sc*bv)*nvec
198 + v*bv
199 + sb;
200
201 // assert(site*lwords<sz);
202
203 scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
204
205 *ptr = data;
206#ifdef GRID_SIMT
207 }
208#else
209 }
210#endif
211 });
212 // std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
213 // std::cout << " BlockProjector imported vector"<<v<<std::endl;
214 }
215 }
216 void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
217 {
218 typedef typename Field::vector_object vobj;
219
220 int nvec = vecs.size();
221
222 assert(vecs[0].Grid()==fine_grid);
223
224 subdivides(coarse_grid,fine_grid); // require they map
225
226 int _ndimension = coarse_grid->_ndimension;
227 assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
228
229 Coordinate block_r (_ndimension);
230 for(int d=0 ; d<_ndimension;d++){
231 block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
232 }
233 Coordinate fine_rdimensions = fine_grid->_rdimensions;
234 Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
235
236 // std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
237
238 int64_t bv= block_vol;
239 for(int v=0;v<vecs.size();v++){
240
241 autoView( fineData , vecs[v], AcceleratorWrite);
242
243 auto blasData_p = &blas[0];
244 auto fineData_p = &fineData[0];
245
246 int64_t osites = fine_grid->oSites();
247 uint64_t lwords = words;
248 // std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
249 // std::cout << " lwords is "<<lwords << std::endl;
250 // std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
251 // loop over fine sites
252 accelerator_for(sf,osites,vobj::Nsimd(),{
253
254#ifdef GRID_SIMT
255 {
256 int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
257#else
258 for(int lane=0;lane<vobj::Nsimd();lane++) {
259#endif
260 // One thread per fine site
261 Coordinate coor_f(_ndimension);
262 Coordinate coor_b(_ndimension);
263 Coordinate coor_c(_ndimension);
264
265 Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
266
267 for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
268 for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
269
270 int sc;
271 int sb;
272 Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
273 Lexicographic::IndexFromCoor(coor_b,sb,block_r);
274
275 // BLAS layout address calculation
276 // words * block_vol * nbasis x coarse_vol
277 int64_t site = (lane*osites + sc*bv)*nvec
278 + v*bv
279 + sb;
280
281 scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
282
283 scalar_object data = *ptr;
284
285 insertLane(lane,fineData[sf],data);
286#ifdef GRID_SIMT
287 }
288#else
289 }
290#endif
291 });
292 }
293 }
294 template<class vobj>
296 {
297 int nvec = vecs.size();
298 typedef typename vobj::scalar_object coarse_scalar_object;
299
300 // std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
301
302 assert(vecs[0].Grid()==coarse_grid);
303
304 int _ndimension = coarse_grid->_ndimension;
305
306 uint64_t sz = blas.size();
307
308 Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
309
310 for(int v=0;v<vecs.size();v++){
311
312 // std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
313 autoView( coarseData , vecs[v], AcceleratorRead);
314
315 auto blasData_p = &blas[0];
316 auto coarseData_p = &coarseData[0];
317
318 int64_t osites = coarse_grid->oSites();
319
320 // loop over fine sites
321 const int Nsimd = vobj::Nsimd();
322 uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
323 assert(cwords==nbasis);
324
325 accelerator_for(sc,osites,Nsimd,{
326#ifdef GRID_SIMT
327 {
328 int lane=acceleratorSIMTlane(Nsimd); // buffer lane
329#else
330 for(int lane=0;lane<Nsimd;lane++) {
331#endif
332 // C_br per site
333 int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
334
335 coarse_scalar_object data = extractLane(lane,coarseData[sc]);
336
337 coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
338
339 *ptr = data;
340#ifdef GRID_SIMT
341 }
342#else
343 }
344#endif
345 });
346 // std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
347 }
348 }
349 template<class vobj>
351 {
352 int nvec = vecs.size();
353 typedef typename vobj::scalar_object coarse_scalar_object;
354 // std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
355
356 assert(vecs[0].Grid()==coarse_grid);
357
358 int _ndimension = coarse_grid->_ndimension;
359
360 uint64_t sz = blas.size();
361
362 Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
363
364 // std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
365 for(int v=0;v<vecs.size();v++){
366
367 // std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
368 autoView( coarseData , vecs[v], AcceleratorWrite);
369
370 auto blasData_p = &blas[0];
371 auto coarseData_p = &coarseData[0];
372
373 int64_t osites = coarse_grid->oSites();
374
375 // loop over fine sites
376 const int Nsimd = vobj::Nsimd();
377 uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
378 assert(cwords==nbasis);
379
380 accelerator_for(sc,osites,Nsimd,{
381 // Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
382#ifdef GRID_SIMT
383 {
384 int lane=acceleratorSIMTlane(Nsimd); // buffer lane
385#else
386 for(int lane=0;lane<Nsimd;lane++) {
387#endif
388 int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
389 coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
390 coarse_scalar_object data = *ptr;
391 insertLane(lane,coarseData[sc],data);
392#ifdef GRID_SIMT
393 }
394#else
395 }
396#endif
397 });
398 }
399 }
400 void ImportBasis(std::vector < Field > &vecs)
401 {
402 // std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
404 }
405
406 template<class cobj>
407 void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
408 {
409 int nrhs=fine.size();
410 int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
411 // std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
412 assert(nbasis==_nbasis);
413
414 BLAS_F.resize (fine_vol * words * nrhs );
415 BLAS_C.resize (coarse_vol * nbasis * nrhs );
416
418 // Copy in the multi-rhs sources to same data layout
420 // std::cout << "BlockProject import fine"<<std::endl;
422
426
427 // std::cout << "BlockProject pointers"<<std::endl;
428 for(int c=0;c<coarse_vol;c++){
429 // BLAS_V[coarse_vol][nbasis][block_vol][words]
430 // BLAS_F[coarse_vol][nrhs][block_vol][words]
431 // BLAS_C[coarse_vol][nrhs][nbasis]
432 scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
433 scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
434 scalar * Ch = & BLAS_C[c*nrhs*nbasis];
435
436 acceleratorPut(Vd[c],Vh);
437 acceleratorPut(Fd[c],Fh);
438 acceleratorPut(Cd[c],Ch);
439 }
440
441 GridBLAS BLAS;
442
443 // std::cout << "BlockProject BLAS"<<std::endl;
444 int64_t vw = block_vol * words;
446 // C_br = V^dag R
449 nbasis,nrhs,vw,
450 scalar(1.0),
451 Vd,
452 Fd,
453 scalar(0.0), // wipe out C
454 Cd);
455 BLAS.synchronise();
456 // std::cout << "BlockProject done"<<std::endl;
458 // std::cout << "BlockProject done"<<std::endl;
459
460 }
461
462 template<class cobj>
463 void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
464 {
465 int nrhs=fine.size();
466 int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
467 assert(nbasis==_nbasis);
468
469 BLAS_F.resize (fine_vol * words * nrhs );
470 BLAS_C.resize (coarse_vol * nbasis * nrhs );
471
473
474 GridBLAS BLAS;
475
479
480 for(int c=0;c<coarse_vol;c++){
481 // BLAS_V[coarse_vol][nbasis][block_vol][words]
482 // BLAS_F[coarse_vol][nrhs][block_vol][words]
483 // BLAS_C[coarse_vol][nrhs][nbasis]
484 scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
485 scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
486 scalar * Ch = & BLAS_C[c*nrhs*nbasis];
487 acceleratorPut(Vd[c],Vh);
488 acceleratorPut(Fd[c],Fh);
489 acceleratorPut(Cd[c],Ch);
490 }
491
493 // Block promote:
494 // F_xr = Vxb Cbr (x coarse_vol)
496
497 int64_t vw = block_vol * words;
499 vw,nrhs,nbasis,
500 scalar(1.0),
501 Vd,
502 Cd,
503 scalar(0.0), // wipe out C
504 Fd);
505 BLAS.synchronise();
506 // std::cout << " blas call done"<<std::endl;
507
509 // std::cout << " exported "<<std::endl;
510 }
511};
512
void acceleratorPut(T &dev, const T &host)
accelerator_inline int acceleratorSIMTlane(int Nsimd)
void acceleratorMemSet(void *base, int value, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
@ GridBLAS_OP_N
Definition BatchedBlas.h:67
@ GridBLAS_OP_C
Definition BatchedBlas.h:67
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > adj(const Lattice< vobj > &lhs)
void subdivides(GridBase *coarse, GridBase *fine)
#define autoView(l_v, l, mode)
@ AcceleratorRead
@ AcceleratorWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
double RealD
Definition Simd.h:61
accelerator_inline void insertLane(int lane, vobj &__restrict__ vec, const typename vobj::scalar_object &__restrict__ extracted)
accelerator_inline vobj::scalar_object extractLane(int lane, const vobj &__restrict__ vec)
void synchronise(void)
void gemmBatched(int m, int n, int k, ComplexD alpha, deviceVector< ComplexD * > &Amk, deviceVector< ComplexD * > &Bkn, ComplexD beta, deviceVector< ComplexD * > &Cmn)
deviceVector< scalar > BLAS_C
void ImportCoarseGridVectors(std::vector< Lattice< vobj > > &vecs, deviceVector< scalar > &blas)
void Allocate(int _nbasis, GridBase *_fgrid, GridBase *_cgrid)
Field::scalar_type scalar
deviceVector< scalar > BLAS_F
void blockProject(std::vector< Field > &fine, std::vector< Lattice< cobj > > &coarse)
void ExportFineGridVectors(std::vector< Field > &vecs, deviceVector< scalar > &blas)
void ExportCoarseGridVectors(std::vector< Lattice< vobj > > &vecs, deviceVector< scalar > &blas)
void ImportFineGridVectors(std::vector< Field > &vecs, deviceVector< scalar > &blas)
void ImportBasis(std::vector< Field > &vecs)
RealD blasNorm2(deviceVector< scalar > &blas)
Field::scalar_object scalar_object
deviceVector< scalar > BLAS_V
void blockPromote(std::vector< Field > &fine, std::vector< Lattice< cobj > > &coarse)