Grid 0.7.0
GeneralCoarsenedMatrixMultiRHS.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <pboyle@bnl.gov>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
30
32
33
 34// Fine Object == (per site) type of fine field
 35// nbasis == number of deflation vectors
// Multi-right-hand-side coarse-grid operator: stores the coarse link
// matrices per stencil point in flat device buffers so that the operator
// application can be expressed as batched GEMMs over all coarse sites.
 36template<class Fobj,class CComplex,int nbasis>
 37class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
 38public:
 39 typedef typename CComplex::scalar_object SComplex;
 42
 52 typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
 56
 58 // Data members
// NOTE(review): this extraction omits several declarations that the method
// bodies below reference (e.g. BLAS_B, BLAS_C, BLAS_CP, _CoarseGridMulti,
// geom, geom_srhs, Cell, Stencil, Projector, calcMatrix) — consult the
// full source before editing.
 65
// Per stencil point: one scalar (nbasis x nbasis) matrix per unpadded
// coarse site, resident in device memory.
 68 std::vector<deviceVector<calcMatrix> > BLAS_A;
 69
// Per-site device pointer tables consumed by the batched GEMM:
// A pointers (identity site map) and B pointers (neighbour map through the
// padded/ghost volume).
 70 std::vector<deviceVector<ComplexD *> > BLAS_AP;
 71 std::vector<deviceVector<ComplexD *> > BLAS_BP;
 73
75 // Interface
// Grid accessors: both return the multi-RHS coarse grid the operator acts on.
 77 GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
 78 GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
79
 80 // Can be used to do I/O on the operator matrices externally
// Import the p-th coarse link matrix into the device buffer BLAS_A[p].
// NOTE(review): the declared parameter type (CoarseMatrix&) and the uses
// A.size()/A[p] look inconsistent in this extraction — the parameter is
// presumably std::vector<CoarseMatrix>&; confirm against the full source.
 81 void SetMatrix (int p,CoarseMatrix & A)
 82 {
 83 assert(A.size()==geom_srhs.npoint);
 84 GridtoBLAS(A[p],BLAS_A[p]);
 85 }
// Export the p-th coarse link matrix from the device buffer BLAS_A[p]
// back into lattice form (inverse of SetMatrix; same caveat on the
// parameter type as noted there applies — confirm against the full source).
 86 void GetMatrix (int p,CoarseMatrix & A)
 87 {
 88 assert(A.size()==geom_srhs.npoint);
 89 BLAStoGrid(A[p],BLAS_A[p]);
 90 }
// Import all stencil-point matrices from a single-RHS coarse operator _Op:
// strip the padded-cell ghost zone from each _Op._A[p] and flatten it into
// BLAS_A[p]. NOTE(review): the signature line is omitted by this extraction;
// judging from the commented CheckMatrix below, it takes GeneralCoarseOp &_Op
// — confirm against the full source.
 92 {
 93 for(int p=0;p<geom.npoint;p++){
 94 auto Aup = _Op.Cell.Extract(_Op._A[p]);
 95 //Unpadded
 96 GridtoBLAS(Aup,BLAS_A[p]);
 97 }
 98 }
99 /*
100 void CheckMatrix (GeneralCoarseOp &_Op)
101 {
102 std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
103 for(int p=0;p<geom.npoint;p++){
104 //Unpadded
105 auto Aup = _Op.Cell.Extract(_Op._A[p]);
106 auto Ack = Aup;
107 BLAStoGrid(Ack,BLAS_A[p]);
108 std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
109 std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
110 }
111 std::cout <<"************* "<<std::endl;
112 }
113 */
114
// Constructor. NOTE(review): the signature line is omitted by this
// extraction; the file's own symbol index lists it as
//   MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,
//                               GridCartesian *CoarseGridMulti)
// — confirm against the full source.
// Builds the padded-cell stencil, sizes the device (BLAS) buffers, and
// fills the per-site pointer tables consumed by the batched GEMM in M().
 116 _CoarseGridMulti(CoarseGridMulti),
 117 geom_srhs(_geom),
 118 geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
 119 Cell(geom.Depth(),_CoarseGridMulti),
 120 Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
 121 {
 122 int32_t padded_sites = Cell.grids.back()->lSites();
 123 int32_t unpadded_sites = CoarseGridMulti->lSites();
 124
 125 int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
 126 int32_t orhs = nrhs/CComplex::Nsimd();
 127
// Site counts are per single RHS from here on (the RHS index rides in
// grid dimension 0, divided out above).
 128 padded_sites = padded_sites/nrhs;
 129 unpadded_sites = unpadded_sites/nrhs;
 130
 132 // Device data vector storage
 134 BLAS_A.resize(geom.npoint);
 135 for(int p=0;p<geom.npoint;p++){
 136 BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
 137 }
 138
 139 BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
 140 BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
 141 BLAS_AP.resize(geom.npoint);
 142 BLAS_BP.resize(geom.npoint);
 143 for(int p=0;p<geom.npoint;p++){
 144 BLAS_AP[p].resize(unpadded_sites);
 145 BLAS_BP[p].resize(unpadded_sites);
 146 }
 147 BLAS_CP.resize(unpadded_sites);
 148
 150 // Pointers to data
 152
 153 // Site identity mapping for A
 154 for(int p=0;p<geom.npoint;p++){
 155 for(int ss=0;ss<unpadded_sites;ss++){
 156 ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
 157 acceleratorPut(BLAS_AP[p][ss],ptr);
 158 }
 159 }
 160 // Site identity mapping for C
 161 for(int ss=0;ss<unpadded_sites;ss++){
 162 ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
 163 acceleratorPut(BLAS_CP[ss],ptr);
 164 }
 165
 166 // Neighbour table is more complicated
// First pass per padded site: flag any site whose stencil wraps into the
// ghost zone; only interior (non-wrapping) sites get B-pointer entries,
// and they are enumerated densely by j in unpadded order.
 167 int32_t j=0; // Interior point counter (unpadded)
 168 for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
 169 int ghost_zone=0;
 170 for(int32_t point = 0 ; point < geom.npoint; point++){
 171 int i=s*orhs*geom.npoint+point;
 172 if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
 173 ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
 174 }
 175 }
 176
 177 if( ghost_zone==0) {
 178 for(int32_t point = 0 ; point < geom.npoint; point++){
 179 int i=s*orhs*geom.npoint+point;
 180 int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
 181 assert(nbr<BLAS_B.size());
 182 ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
 183 acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
 184 }
 185 j++;
 186 }
 187 }
// Every unpadded site must have been classified as interior exactly once.
 188 assert(j==unpadded_sites);
 189 }
// Flatten a (non-checkerboarded) SIMD-vectorised lattice field into a flat
// device array of scalar site objects, one sobj per local lattice site in
// lexicographic order. The oSite/lane decomposition is undone on the
// accelerator: each site's outer index and SIMD lane are recomputed from
// its coordinate and the lanes are extracted word by word with getlane.
 190 template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
 191 {
 192 typedef typename vobj::scalar_object sobj;
 193 typedef typename vobj::scalar_type scalar_type;
 194 typedef typename vobj::vector_type vector_type;
 195
 196 GridBase *Fg = from.Grid();
 197 assert(!Fg->_isCheckerBoarded);
 198 int nd = Fg->_ndimension;
 199
 200 to.resize(Fg->lSites());
 201
 202 Coordinate LocalLatt = Fg->LocalDimensions();
 203 size_t nsite = 1;
 204 for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
 205
 207 // do the index calc on the GPU
 209 Coordinate f_ostride = Fg->_ostride;
 210 Coordinate f_istride = Fg->_istride;
 211 Coordinate f_rdimensions = Fg->_rdimensions;
 212
 213 autoView(from_v,from,AcceleratorRead);
 214 auto to_v = &to[0];
 215
 216 const int words=sizeof(vobj)/sizeof(vector_type);
 217 accelerator_for(idx,nsite,1,{
 218
 219 Coordinate from_coor, base;
 220 Lexicographic::CoorFromIndex(base,idx,LocalLatt);
 221 for(int i=0;i<nd;i++){
 222 from_coor[i] = base[i];
 223 }
// coordinate -> (outer site, SIMD lane) using the grid's stride tables
 224 int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
 225 int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
 226
// NOTE: these kernel locals intentionally shadow the function parameters
// 'from'/'to' with raw word pointers.
 227 const vector_type* from = (const vector_type *)&from_v[from_oidx];
 228 scalar_type* to = (scalar_type *)&to_v[idx];
 229
 230 scalar_type stmp;
 231 for(int w=0;w<words;w++){
 232 stmp = getlane(from[w], from_lane);
 233 to[w] = stmp;
 234 }
 235 });
 236 }
// Inverse of GridtoBLAS: scatter a flat device array of scalar site objects
// back into a SIMD-vectorised lattice field, reinserting each site's words
// into the correct SIMD lane with putlane.
// NOTE(review): the template/signature line is omitted by this extraction;
// the file's own symbol index lists it as
//   template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,
//       deviceVector<typename vobj::scalar_object> &in)
// — confirm against the full source.
 238 {
 239 typedef typename vobj::scalar_object sobj;
 240 typedef typename vobj::scalar_type scalar_type;
 241 typedef typename vobj::vector_type vector_type;
 242
 243 GridBase *Tg = grid.Grid();
 244 assert(!Tg->_isCheckerBoarded);
 245 int nd = Tg->_ndimension;
 246
// Source buffer must hold exactly one sobj per local site.
 247 assert(in.size()==Tg->lSites());
 248
 249 Coordinate LocalLatt = Tg->LocalDimensions();
 250 size_t nsite = 1;
 251 for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
 252
 254 // do the index calc on the GPU
 256 Coordinate t_ostride = Tg->_ostride;
 257 Coordinate t_istride = Tg->_istride;
 258 Coordinate t_rdimensions = Tg->_rdimensions;
 259
 260 autoView(to_v,grid,AcceleratorWrite);
 261 auto from_v = &in[0];
 262
 263 const int words=sizeof(vobj)/sizeof(vector_type);
 264 accelerator_for(idx,nsite,1,{
 265
 266 Coordinate to_coor, base;
 267 Lexicographic::CoorFromIndex(base,idx,LocalLatt);
 268 for(int i=0;i<nd;i++){
 269 to_coor[i] = base[i];
 270 }
// coordinate -> (outer site, SIMD lane), mirroring GridtoBLAS
 271 int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
 272 int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
 273
 274 vector_type* to = (vector_type *)&to_v[to_oidx];
 275 scalar_type* from = (scalar_type *)&from_v[idx];
 276
 277 scalar_type stmp;
 278 for(int w=0;w<words;w++){
 279 stmp=from[w];
 280 putlane(to[w], stmp, to_lane);
 281 }
 282 });
 283 }
// Build the coarse link matrices A_p by probing the fine operator with
// momentum-phased basis vectors and inverting the resulting phase matrix
// (the "coloured probing" trick described in the comment block below).
// NOTE(review): the signature line is omitted by this extraction; the
// file's own symbol index lists it as
//   void CoarsenOperator(LinearOperatorBase<Lattice<Fobj>> &linop,
//                        Aggregation<Fobj,CComplex,nbasis> &Subspace,
//                        GridBase *CoarseGrid)
// — confirm against the full source.
// The "#if 0" branch is the original unbatched implementation kept for
// reference; the live "#else" branch batches the block projection through
// Projector and unrolls the M^{-1} accumulation, per the timing notes.
 287 {
 288#if 0
 289 std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
 290
 291 GridBase *grid = Subspace.FineGrid;
 292
 294 // Orthogonalise the subblocks over the basis
 296 CoarseScalar InnerProd(CoarseGrid);
 297 blockOrthogonalise(InnerProd,Subspace.subspace);
 298
 299 const int npoint = geom_srhs.npoint;
 300
 301 Coordinate clatt = CoarseGrid->GlobalDimensions();
 302 int Nd = CoarseGrid->Nd();
 303 /*
 304 * Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
 305 * Matrix index i is mapped to this shift via
 306 * geom.shifts[i]
 307 *
 308 * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
 309 * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
 310 * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
 311 * = M_{kl} A_ji^{b.b+l}
 312 *
 313 * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
 314 *
 315 * Where q_k = delta_k . (2*M_PI/global_nb[mu])
 316 *
 317 * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
 318 */
 319 Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
 320 Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
 321 ComplexD ci(0.0,1.0);
 322 for(int k=0;k<npoint;k++){ // Loop over momenta
 323
 324 for(int l=0;l<npoint;l++){ // Loop over nbr relative
 325 ComplexD phase(0.0,0.0);
 326 for(int mu=0;mu<Nd;mu++){
 327 RealD TwoPiL = M_PI * 2.0/ clatt[mu];
 328 phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
 329 }
 330 phase=exp(phase*ci);
 331 Mkl(k,l) = phase;
 332 }
 333 }
 334 invMkl = Mkl.inverse();
 335
 337 // Now compute the matrix elements of linop between the orthonormal
 338 // set of vectors.
 340 FineField phaV(grid); // Phased block basis vector
 341 FineField MphaV(grid);// Matrix applied
 342 std::vector<FineComplexField> phaF(npoint,grid);
 343 std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
 344
 345 CoarseVector coarseInner(CoarseGrid);
 346
 347 typedef typename CComplex::scalar_type SComplex;
 348 FineComplexField one(grid); one=SComplex(1.0);
 349 FineComplexField zz(grid); zz = Zero();
 350 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 352 // Stick a phase on every block
 355 pha[p]=Zero();
 356 for(int mu=0;mu<Nd;mu++){
// NOTE(review): 'coor' is used here but its declaration is omitted by this
// extraction — presumably a CoarseComplexField declared above; see full source.
 357 LatticeCoordinate(coor,mu);
 358 RealD TwoPiL = M_PI * 2.0/ clatt[mu];
 359 pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
 360 }
 361 pha[p] =exp(pha[p]*ci);
 362
 363 blockZAXPY(phaF[p],pha[p],one,zz);
 364 }
 365
 366 // Could save on temporary storage here
 367 std::vector<CoarseMatrix> _A;
 368 _A.resize(geom_srhs.npoint,CoarseGrid);
 369
 370 std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
 372 for(int i=0;i<nbasis;i++){// Loop over basis vectors
 373 std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
 374 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 375
 376 phaV = phaF[p]*Subspace.subspace[i];
 377
 379 // Multiple phased subspace vector by matrix and project to subspace
 380 // Remove local bulk phase to leave relative phases
 382 linop.Op(phaV,MphaV);
 383
 384 // Fixme, could use batched block projector here
 385 blockProject(coarseInner,MphaV,Subspace.subspace);
 386
 387 coarseInner = conjugate(pha[p]) * coarseInner;
 388
 389 ComputeProj[p] = coarseInner;
 390 }
 391
 392 // Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
 393 for(int k=0;k<npoint;k++){
 394
// NOTE(review): 'FT' (a CoarseVector accumulator) is used here but its
// declaration is omitted by this extraction — see full source.
 395 FT = Zero();
 396 for(int l=0;l<npoint;l++){
 397 FT= FT+ invMkl(l,k)*ComputeProj[l];
 398 }
 399
 400 int osites=CoarseGrid->oSites();
 401 autoView( A_v , _A[k], AcceleratorWrite);
 402 autoView( FT_v , FT, AcceleratorRead);
 403 accelerator_for(sss, osites, 1, {
 404 for(int j=0;j<nbasis;j++){
 405 A_v[sss](i,j) = FT_v[sss](j);
 406 }
 407 });
 408 }
 409 }
 410
 411 // Only needed if nonhermitian
 412 // if ( ! hermitian ) {
 413 // std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
 414 // PopulateAdag();
 415 // }
 416 // Need to write something to populate Adag from A
 417
 418 for(int p=0;p<geom_srhs.npoint;p++){
 419 GridtoBLAS(_A[p],BLAS_A[p]);
 420 }
 421 /*
 422Grid : Message : 11698.730546 s : CoarsenOperator eigen 1334 us
 423Grid : Message : 11698.730563 s : CoarsenOperator phase 34729 us
 424Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
 425Grid : Message : 11698.730566 s : CoarsenOperator mat 127890998 us
 426Grid : Message : 11698.730567 s : CoarsenOperator proj 515840840 us
 427Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
 428Takes 600s to compute matrix elements, DOMINATED by the block project.
 429Easy to speed up with the batched block project.
 430Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
 431
 432// Block project below taks to 240s
 433Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
 434Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
 435Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
 436//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
 437//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
 438 */
 439#else
 440 RealD tproj=0.0;
 441 RealD tmat=0.0;
 442 RealD tphase=0.0;
 443 RealD tphaseBZ=0.0;
 444 RealD tinv=0.0;
 445
 446 std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
 447
 448 GridBase *grid = Subspace.FineGrid;
 449
 451 // Orthogonalise the subblocks over the basis
 453 CoarseScalar InnerProd(CoarseGrid);
 454 blockOrthogonalise(InnerProd,Subspace.subspace);
 455
 456
 458 Projector.Allocate(nbasis,grid,CoarseGrid);
 459 Projector.ImportBasis(Subspace.subspace);
 460
 461 const int npoint = geom_srhs.npoint;
 462
 463 Coordinate clatt = CoarseGrid->GlobalDimensions();
 464 int Nd = CoarseGrid->Nd();
 465 /*
 466 * Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
 467 * Matrix index i is mapped to this shift via
 468 * geom.shifts[i]
 469 *
 470 * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
 471 * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
 472 * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
 473 * = M_{kl} A_ji^{b.b+l}
 474 *
 475 * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
 476 *
 477 * Where q_k = delta_k . (2*M_PI/global_nb[mu])
 478 *
 479 * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
 480 */
 481 Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
 482 Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
 483 ComplexD ci(0.0,1.0);
 484 for(int k=0;k<npoint;k++){ // Loop over momenta
 485
 486 for(int l=0;l<npoint;l++){ // Loop over nbr relative
 487 ComplexD phase(0.0,0.0);
 488 for(int mu=0;mu<Nd;mu++){
 489 RealD TwoPiL = M_PI * 2.0/ clatt[mu];
 490 phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
 491 }
 492 phase=exp(phase*ci);
 493 Mkl(k,l) = phase;
 494 }
 495 }
 496 invMkl = Mkl.inverse();
 497
 499 // Now compute the matrix elements of linop between the orthonormal
 500 // set of vectors.
 502 FineField phaV(grid); // Phased block basis vector
 503 FineField MphaV(grid);// Matrix applied
 504 std::vector<FineComplexField> phaF(npoint,grid);
 505 std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
 506
 507 CoarseVector coarseInner(CoarseGrid);
 508
 509 tphase=-usecond();
 510 typedef typename CComplex::scalar_type SComplex;
 511 FineComplexField one(grid); one=SComplex(1.0);
 512 FineComplexField zz(grid); zz = Zero();
 513 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 515 // Stick a phase on every block
 518 pha[p]=Zero();
 519 for(int mu=0;mu<Nd;mu++){
// NOTE(review): 'coor' is used here but its declaration is omitted by this
// extraction — see full source.
 520 LatticeCoordinate(coor,mu);
 521 RealD TwoPiL = M_PI * 2.0/ clatt[mu];
 522 pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
 523 }
 524 pha[p] =exp(pha[p]*ci);
 525
 526 blockZAXPY(phaF[p],pha[p],one,zz);
 527 }
 528 tphase+=usecond();
 529
 530 // Could save on temporary storage here
 531 std::vector<CoarseMatrix> _A;
 532 _A.resize(geom_srhs.npoint,CoarseGrid);
 533
 534 // Count use small chunks than npoint == 81 and save memory
 535 int batch = 9;
 536 std::vector<FineField> _MphaV(batch,grid);
 537 std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
 538
 539 std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
 541 for(int i=0;i<nbasis;i++){// Loop over basis vectors
 542 std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
 543
 544 // std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
 545 // Fixme : do this in batches
 546 for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
 547
 548 for(int b=0;b<MIN(batch,npoint-p);b++){
 549 tphaseBZ-=usecond();
 550 phaV = phaF[p+b]*Subspace.subspace[i];
 551 tphaseBZ+=usecond();
 552
 554 // Multiple phased subspace vector by matrix and project to subspace
 555 // Remove local bulk phase to leave relative phases
 557 // Memory footprint was an issue
 558 tmat-=usecond();
 559 linop.Op(phaV,MphaV);
 560 _MphaV[b] = MphaV;
 561 tmat+=usecond();
 562 }
 563
 564 // std::cout << GridLogMessage << " Calling block project "<<std::endl;
 565 tproj-=usecond();
 566 Projector.blockProject(_MphaV,TmpProj);
 567 tproj+=usecond();
 568
 569 // std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
 570 for(int b=0;b<MIN(batch,npoint-p);b++){
 571 ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
 572 }
 573 }
 574
 575 // Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
 576
 577 // std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
 578 tinv-=usecond();
 579 for(int k=0;k<npoint;k++){
// NOTE(review): 'FT' is used here but its declaration is omitted by this
// extraction — see full source.
 580 FT = Zero();
 581 // 81 kernel calls as many ComputeProj vectors
 582 // Could fuse with a vector of views, but ugly
 583 // Could unroll the expression and run fewer kernels -- much more attractive
 584 // Could also do non blocking.
 585#if 0
 586 for(int l=0;l<npoint;l++){
 587 FT= FT+ invMkl(l,k)*ComputeProj[l];
 588 }
 589#else
 590 const int radix = 9;
 591 int ll;
 592 for(ll=0;ll+radix-1<npoint;ll+=radix){
 593 // When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
 594 FT = FT
 595 + invMkl(ll+0,k)*ComputeProj[ll+0]
 596 + invMkl(ll+1,k)*ComputeProj[ll+1]
 597 + invMkl(ll+2,k)*ComputeProj[ll+2]
 598 + invMkl(ll+3,k)*ComputeProj[ll+3]
 599 + invMkl(ll+4,k)*ComputeProj[ll+4]
 600 + invMkl(ll+5,k)*ComputeProj[ll+5]
 601 + invMkl(ll+6,k)*ComputeProj[ll+6]
 602 + invMkl(ll+7,k)*ComputeProj[ll+7]
 603 + invMkl(ll+8,k)*ComputeProj[ll+8];
 604 }
// Remainder loop for npoint not divisible by radix.
 605 for(int l=ll;l<npoint;l++){
 606 FT= FT+ invMkl(l,k)*ComputeProj[l];
 607 }
 608#endif
 609
 610 // 1 kernel call -- must be cheaper
 611 int osites=CoarseGrid->oSites();
 612 autoView( A_v , _A[k], AcceleratorWrite);
 613 autoView( FT_v , FT, AcceleratorRead);
 614 accelerator_for(sss, osites, 1, {
 615 for(int j=0;j<nbasis;j++){
 616 A_v[sss](i,j) = FT_v[sss](j);
 617 }
 618 });
 619 }
 620 tinv+=usecond();
 621 }
 622
 623 // Only needed if nonhermitian
 624 // if ( ! hermitian ) {
 625 // std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
 626 // PopulateAdag();
 627 // }
 628 // Need to write something to populate Adag from A
 629 // std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
 630 for(int p=0;p<geom_srhs.npoint;p++){
 631 GridtoBLAS(_A[p],BLAS_A[p]);
 632 }
 633 std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
 634 std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
 635 std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
 636 std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
 637 std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
 638#endif
 639 }
// Adjoint application. Delegates to M(), which is correct only when the
// coarse operator is Hermitian — consistent with the commented-out
// "Only needed if nonhermitian ... PopulateAdag" note in CoarsenOperator.
 640 void Mdag(const CoarseVector &in, CoarseVector &out)
 641 {
 642 this->M(in,out);
 643 }
644 void M (const CoarseVector &in, CoarseVector &out)
645 {
646 // std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
648 conformable(in.Grid(),out.Grid());
649 out.Checkerboard() = in.Checkerboard();
650
651 RealD t_tot;
652 RealD t_exch;
653 RealD t_GtoB;
654 RealD t_BtoG;
655 RealD t_mult;
656
657 t_tot=-usecond();
658 CoarseVector tin=in;
659 t_exch=-usecond();
660 CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
661 t_exch+=usecond();
662
663 CoarseVector pout(pin.Grid());
664
665 int npoint = geom.npoint;
666 typedef calcMatrix* Aview;
667 typedef LatticeView<Cvec> Vview;
668
669 const int Nsimd = CComplex::Nsimd();
670
671 int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
672 assert(nrhs>=1);
673
674 RealD flops,bytes;
675 int64_t osites=in.Grid()->oSites(); // unpadded
676 int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
677
678 flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
679 bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
680 + 2.0*osites*sizeof(siteVector)*npoint;
681
682
683 t_GtoB=-usecond();
684 GridtoBLAS(pin,BLAS_B);
685 t_GtoB+=usecond();
686
687 GridBLAS BLAS;
688
689 t_mult=-usecond();
690 for(int p=0;p<geom.npoint;p++){
691 RealD c = 1.0;
692 if (p==0) c = 0.0;
693 ComplexD beta(c);
694
695 BLAS.gemmBatched(nbasis,nrhs,nbasis,
696 ComplexD(1.0),
697 BLAS_AP[p],
698 BLAS_BP[p],
699 ComplexD(c),
700 BLAS_CP);
701 }
702 BLAS.synchronise();
703 t_mult+=usecond();
704
705 t_BtoG=-usecond();
706 BLAStoGrid(out,BLAS_C);
707 t_BtoG+=usecond();
708 t_tot+=usecond();
709 /*
710 std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
711 std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
712 std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
713 std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
714 std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
715 std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
716 */
717 // std::cout << GridLogMessage<<std::endl;
718 // std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
719 // std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
720 // std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
721 // std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
722 // std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
723 };
// Unimplemented points of the SparseMatrixBase interface: these coarse
// operator variants are not provided for the multi-RHS matrix and abort
// via assert(0) if ever called.
 724 virtual void Mdiag (const Field &in, Field &out){ assert(0);};
 725 virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
 726 virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
727};
728
void acceleratorPut(T &dev, const T &host)
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
#define one
Definition BGQQPX.h:108
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
accelerator_inline S getlane(const Grid_simd< S, V > &in, int lane)
accelerator_inline void putlane(Grid_simd< S, V > &vec, const S &_S, int lane)
accelerator_inline Grid_simd< S, V > exp(const Grid_simd< S, V > &r)
void conformable(const Lattice< obj1 > &lhs, const Lattice< obj2 > &rhs)
void LatticeCoordinate(Lattice< iobj > &l, int mu)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
void blockOrthogonalise(Lattice< CComplex > &ip, std::vector< Lattice< vobj > > &Basis)
void blockZAXPY(Lattice< vobj > &fineZ, const Lattice< CComplex > &coarseA, const Lattice< vobj2 > &fineX, const Lattice< vobj > &fineY)
void blockProject(Lattice< iVector< CComplex, nbasis > > &coarseData, const Lattice< vobj > &fineData, const VLattice &Basis)
#define autoView(l_v, l, mode)
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL")
@ AcceleratorRead
@ AcceleratorWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static constexpr int Nd
Definition QCD.h:52
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
double usecond(void)
Definition Timer.h:50
uint64_t base
#define MIN(a, b)
Definition Zolotarev.cc:23
#define M_PI
Definition Zolotarev.cc:41
std::vector< Lattice< Fobj > > subspace
Definition Aggregates.h:58
GridBase * FineGrid
Definition Aggregates.h:57
std::vector< CoarseMatrix > _A
void synchronise(void)
void gemmBatched(int m, int n, int k, ComplexD alpha, deviceVector< ComplexD * > &Amk, deviceVector< ComplexD * > &Bkn, ComplexD beta, deviceVector< ComplexD * > &Cmn)
const Coordinate & GlobalDimensions(void)
Coordinate _istride
int lSites(void) const
bool _isCheckerBoarded
int oSites(void) const
Coordinate _rdimensions
const Coordinate & FullDimensions(void)
Coordinate _ostride
const Coordinate & LocalDimensions(void)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
void Mdag(const CoarseVector &in, CoarseVector &out)
std::vector< deviceVector< ComplexD * > > BLAS_BP
std::vector< deviceVector< calcMatrix > > BLAS_A
void M(const CoarseVector &in, CoarseVector &out)
Lattice< iScalar< CComplex > > CoarseComplexField
void SetMatrix(int p, CoarseMatrix &A)
Lattice< iMatrix< CComplex, nbasis > > CoarseMatrix
virtual void MdirAll(const Field &in, std::vector< Field > &out)
void BLAStoGrid(Lattice< vobj > &grid, deviceVector< typename vobj::scalar_object > &in)
void GridtoBLAS(const Lattice< vobj > &from, deviceVector< typename vobj::scalar_object > &to)
MultiGeneralCoarsenedMatrix< Fobj, CComplex, nbasis > MultiGeneralCoarseOp
std::vector< deviceVector< ComplexD * > > BLAS_AP
virtual void Mdiag(const Field &in, Field &out)
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom, GridCartesian *CoarseGridMulti)
void GetMatrix(int p, CoarseMatrix &A)
GeneralCoarsenedMatrix< Fobj, CComplex, nbasis > GeneralCoarseOp
virtual void Mdir(const Field &in, Field &out, int dir, int disp)
void CoarsenOperator(LinearOperatorBase< Lattice< Fobj > > &linop, Aggregation< Fobj, CComplex, nbasis > &Subspace, GridBase *CoarseGrid)
void Allocate(int _nbasis, GridBase *_fgrid, GridBase *_cgrid)
void blockProject(std::vector< Field > &fine, std::vector< Lattice< cobj > > &coarse)
void ImportBasis(std::vector< Field > &vecs)
Lattice< vobj > Extract(const Lattice< vobj > &in) const
Definition PaddedCell.h:287
Definition Simd.h:194