Lattice_transfer.h
1/*************************************************************************************
2 Grid physics library, www.github.com/paboyle/Grid
3
4 Source file: ./lib/lattice/Lattice_transfer.h
5
6 Copyright (C) 2015
7
8Author: Peter Boyle <paboyle@ph.ed.ac.uk>
9Author: Christoph Lehner <christoph@lhnr.de>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
 30NAMESPACE_BEGIN(Grid);
 31
32inline void subdivides(GridBase *coarse,GridBase *fine)
33{
34 assert(coarse->_ndimension == fine->_ndimension);
35
36 int _ndimension = coarse->_ndimension;
37
38 // local and global volumes subdivide cleanly after SIMDization
39 for(int d=0;d<_ndimension;d++){
40 assert(coarse->_processors[d] == fine->_processors[d]);
41 assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
42 assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
43 }
44}
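////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// subdivides() only asserts; the blocking factors it implicitly checks are the ones the
// block routines below recompute.  A helper like this (hypothetical name) makes the
// relationship explicit, assuming the two grids already conform.
////////////////////////////////////////////////////////////////////////////////////////////
inline Coordinate exampleBlockingFactors(GridBase *coarse,GridBase *fine)
{
  subdivides(coarse,fine);                       // abort early if the grids do not nest
  Coordinate block_r(coarse->_ndimension);
  for(int d=0;d<coarse->_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; // fine sites per block in dim d
  }
  return block_r;
}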
45
46
48// remove and insert a half checkerboard
50template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
51{
52 half.Checkerboard() = cb;
53
54 autoView( half_v, half, CpuWrite);
55 autoView( full_v, full, CpuRead);
56 thread_for(ss, full.Grid()->oSites(),{
57 int cbos;
58 Coordinate coor;
59 full.Grid()->oCoorFromOindex(coor,ss);
60 cbos=half.Grid()->CheckerBoard(coor);
61
62 if (cbos==cb) {
63 int ssh=half.Grid()->oIndex(coor);
64 half_v[ssh] = full_v[ss];
65 }
66 });
67}
68template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
69{
70 int cb = half.Checkerboard();
71 autoView( half_v , half, CpuRead);
72 autoView( full_v , full, CpuWrite);
73 thread_for(ss,full.Grid()->oSites(),{
74
75 Coordinate coor;
76 int cbos;
77
78 full.Grid()->oCoorFromOindex(coor,ss);
79 cbos=half.Grid()->CheckerBoard(coor);
80
81 if (cbos==cb) {
82 int ssh=half.Grid()->oIndex(coor);
83 full_v[ss]=half_v[ssh];
84 }
85 });
86}
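////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Split a full-grid field into its two checkerboard halves and scatter them back; "evn"
// and "odd" are assumed to be defined on the matching red-black grid.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline void exampleCheckerboardRoundTrip(Lattice<vobj> &full,Lattice<vobj> &evn,Lattice<vobj> &odd)
{
  pickCheckerboard(Even,evn,full);   // copy the cb=Even sites into the half-grid field
  pickCheckerboard(Odd ,odd,full);   // copy the cb=Odd sites
  setCheckerboard(full,evn);         // writing both halves back reconstructs full exactly
  setCheckerboard(full,odd);
}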
87
88template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
89{
90 half.Checkerboard() = cb;
91 autoView(half_v, half, AcceleratorWrite);
92 autoView(full_v, full, AcceleratorRead);
93 Coordinate rdim_full = full.Grid()->_rdimensions;
94 Coordinate rdim_half = half.Grid()->_rdimensions;
95 unsigned long ndim_half = half.Grid()->_ndimension;
96 Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
97 Coordinate ostride_half = half.Grid()->_ostride;
98 accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
99
100 Coordinate coor;
101 int cbos;
102 int linear=0;
103
104 Lexicographic::CoorFromIndex(coor,ss,rdim_full);
105 assert(coor.size()==ndim_half);
106
107 for(int d=0;d<ndim_half;d++){
108 if(checker_dim_mask_half[d]) linear += coor[d];
109 }
110 cbos = (linear&0x1);
111
112 if (cbos==cb) {
113 int ssh=0;
114 for(int d=0;d<ndim_half;d++) {
115 if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
116 else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
117 }
118 coalescedWrite(half_v[ssh],full_v(ss));
119 }
120 });
121}
122template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
123{
124 int cb = half.Checkerboard();
125 autoView(half_v , half, AcceleratorRead);
126 autoView(full_v , full, AcceleratorWrite);
127 Coordinate rdim_full = full.Grid()->_rdimensions;
128 Coordinate rdim_half = half.Grid()->_rdimensions;
129 unsigned long ndim_half = half.Grid()->_ndimension;
130 Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
131 Coordinate ostride_half = half.Grid()->_ostride;
132 accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
133
134 Coordinate coor;
135 int cbos;
136 int linear=0;
137
138 Lexicographic::CoorFromIndex(coor,ss,rdim_full);
139 assert(coor.size()==ndim_half);
140
141 for(int d=0;d<ndim_half;d++){
142 if(checker_dim_mask_half[d]) linear += coor[d];
143 }
144 cbos = (linear&0x1);
145
146 if (cbos==cb) {
147 int ssh=0;
148 for(int d=0;d<ndim_half;d++){
149 if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
150 else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
151 }
152 coalescedWrite(full_v[ss],half_v(ssh));
153 }
154
155 });
156}
157
159// Flexible Type Conversion for internal promotion to double as well as graceful
160// treatment of scalar-compatible types
162accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
163 out = in;
164}
165
166accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
167 out = in;
168}
169
170template<typename T>
171accelerator_inline void convertType(T & out, const T & in) {
172  out = in;
173}
174
175// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
176/*template<typename T1, typename T2>
177accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
178// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
179//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
180convertType(T1 & out, const T2 & in) {
181 out = in;
182}*/
183
184#ifdef GRID_SIMT
185accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
186  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
187}
188accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
189  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
190}
191accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
192  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
193}
194#endif
195
196accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
197  precisionChange(out,in);
198}
199
200accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
201  precisionChange(out,in);
202}
203
204template<typename T1,typename T2>
208
209template<typename T1,typename T2>
213
214template<typename T1,typename T2>
218
219template<typename T1,typename T2,int N>
220accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
221  for (int i=0;i<N;i++)
222 for (int j=0;j<N;j++)
223 convertType(out._internal[i][j],in._internal[i][j]);
224}
225
226template<typename T1,typename T2,int N>
227accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
228  for (int i=0;i<N;i++)
229 convertType(out._internal[i],in._internal[i]);
230}
231
232template<typename T1,typename T2>
233void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
234  autoView( out_v , out,AcceleratorWrite);
235 autoView( in_v , in ,AcceleratorRead);
236 accelerator_for(ss,out_v.size(),T1::Nsimd(),{
237 convertType(out_v[ss],in_v(ss));
238 });
239}
240
242// precision-promoted local inner product
244template<class vobj>
245inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
246-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
247{
248 autoView( lhs_v , lhs, AcceleratorRead);
249 autoView( rhs_v , rhs, AcceleratorRead);
250
251 typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
252 Lattice<iScalar<t_inner>> ret(lhs.Grid());
253
254 {
255 autoView(ret_v, ret,AcceleratorWrite);
256 accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
257 convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
258 });
259 }
260 return ret;
261}
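////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// localInnerProductD returns a site-local scalar lattice accumulated in double precision;
// block-summing that field is exactly what blockInnerProductD (below) does.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto exampleLocalInnerProductD(const Lattice<vobj> &x,const Lattice<vobj> &y)
  -> decltype(localInnerProductD<vobj>(x,y))
{
  return localInnerProductD<vobj>(x,y);   // per-site <x(s)|y(s)>, promoted to double
}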
262
264// block routines
266template<class vobj,class CComplex,int nbasis,class VLattice>
267inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
268			  const Lattice<vobj> &fineData,
269 const VLattice &Basis)
270{
271 GridBase * fine = fineData.Grid();
272 GridBase * coarse= coarseData.Grid();
273
274 Lattice<iScalar<CComplex>> ip(coarse);
275 Lattice<vobj> fineDataRed = fineData;
276
277 autoView( coarseData_ , coarseData, AcceleratorWrite);
278 autoView( ip_ , ip, AcceleratorWrite);
279 RealD t_IP=0;
280 RealD t_co=0;
281 RealD t_za=0;
282 for(int v=0;v<nbasis;v++) {
283 t_IP-=usecond();
284 blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
285 t_IP+=usecond();
286 t_co-=usecond();
287 accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
288 convertType(coarseData_[sc](v),ip_[sc]);
289 });
290 t_co+=usecond();
291
292 // improve numerical stability of projection
293 // |fine> = |fine> - <basis|fine> |basis>
294 ip=-ip;
295 t_za-=usecond();
296 blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
297 t_za+=usecond();
298 }
299 // std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
300 // std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
301 // std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
302}
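////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Typical multigrid coarsening step: with a basis that has already been made block
// orthonormal (see blockOrthonormalize below), blockProject extracts the coarse degrees
// of freedom of a fine vector.  Names are hypothetical.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis>
inline void exampleCoarsen(Lattice<iVector<CComplex,nbasis> > &coarseVec,
			   const Lattice<vobj> &fineVec,
			   const std::vector<Lattice<vobj> > &basis)
{
  blockProject(coarseVec,fineVec,basis);  // coarseVec(x_B)(i) = <basis_i|fineVec> on each block x_B
}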
303// This only minimises data motion from CPU to GPU
304// there is a chance of a better implementation that does a v x k loop of inner products to share
305// data at the GPU thread level
306template<class vobj,class CComplex,int nbasis,class VLattice>
307inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
308 const std::vector<Lattice<vobj>> &fineData,
309 const VLattice &Basis)
310{
311 int NBatch = fineData.size();
312 assert(coarseData.size() == NBatch);
313
314 GridBase * fine = fineData[0].Grid();
315 GridBase * coarse= coarseData[0].Grid();
316
317 Lattice<iScalar<CComplex>> ip(coarse);
318 std::vector<Lattice<vobj>> fineDataCopy = fineData;
319
320 autoView(ip_, ip, AcceleratorWrite);
321 for(int v=0;v<nbasis;v++) {
322 for (int k=0; k<NBatch; k++) {
323 autoView( coarseData_ , coarseData[k], AcceleratorWrite);
324 blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
325 accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
326 convertType(coarseData_[sc](v),ip_[sc]);
327 });
328
329 // improve numerical stability of projection
330 // |fine> = |fine> - <basis|fine> |basis>
331 ip=-ip;
332 blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
333 }
334 }
335}
336
337template<class vobj,class vobj2,class CComplex>
338 inline void blockZAXPY(Lattice<vobj> &fineZ,
339 const Lattice<CComplex> &coarseA,
340 const Lattice<vobj2> &fineX,
341 const Lattice<vobj> &fineY)
342{
343 GridBase * fine = fineZ.Grid();
344 GridBase * coarse= coarseA.Grid();
345
346 fineZ.Checkerboard()=fineX.Checkerboard();
347 assert(fineX.Checkerboard()==fineY.Checkerboard());
348 subdivides(coarse,fine); // require they map
349 conformable(fineX,fineY);
350 conformable(fineX,fineZ);
351
352 int _ndimension = coarse->_ndimension;
353
354 Coordinate block_r (_ndimension);
355
356 // FIXME merge with subdivide checking routine as this is redundant
357 for(int d=0 ; d<_ndimension;d++){
358 block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
359 assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
360 }
361
362 autoView( fineZ_ , fineZ, AcceleratorWrite);
363 autoView( fineX_ , fineX, AcceleratorRead);
364 autoView( fineY_ , fineY, AcceleratorRead);
365 autoView( coarseA_, coarseA, AcceleratorRead);
366 Coordinate fine_rdimensions = fine->_rdimensions;
367 Coordinate coarse_rdimensions = coarse->_rdimensions;
368
369 accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
370
371 int sc;
372 Coordinate coor_c(_ndimension);
373 Coordinate coor_f(_ndimension);
374
375 Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
376 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
377 Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
378
379 // z = A x + y
380#ifdef GRID_SIMT
381 typename vobj2::tensor_reduced::scalar_object cA;
382 typename vobj::scalar_object cAx;
383#else
384 typename vobj2::tensor_reduced cA;
385 vobj cAx;
386#endif
387 convertType(cA,TensorRemove(coarseA_(sc)));
388 auto prod = cA*fineX_(sf);
389 convertType(cAx,prod);
390 coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
391
392 });
393
394 return;
395}
396
397template<class vobj,class CComplex>
398 inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
399 const Lattice<vobj> &fineX,
400 const Lattice<vobj> &fineY)
401{
402 typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
403
404 GridBase *coarse(CoarseInner.Grid());
405 GridBase *fine (fineX.Grid());
406
407 Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
408 Lattice<dotp> coarse_inner(coarse);
409
410 // Precision promotion
411 RealD t;
412 t=-usecond();
413 fine_inner = localInnerProductD<vobj>(fineX,fineY);
414 // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
415
416 t=-usecond();
417 blockSum(coarse_inner,fine_inner);
418 // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
419 t=-usecond();
420 {
421 autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
422 autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
423 accelerator_for(ss, coarse->oSites(), 1, {
424 convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
425 });
426 }
427 // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
428
429}
430
431template<class vobj,class CComplex> // deprecate
432inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
433 const Lattice<vobj> &fineX,
434 const Lattice<vobj> &fineY)
435{
436 typedef decltype(innerProduct(vobj(),vobj())) dotp;
437
438 GridBase *coarse(CoarseInner.Grid());
439 GridBase *fine (fineX.Grid());
440
441 Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
442 Lattice<dotp> coarse_inner(coarse);
443
444 // Precision promotion?
445 fine_inner = localInnerProduct(fineX,fineY);
446 blockSum(coarse_inner,fine_inner);
447 {
448 autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
449 autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
450 accelerator_for(ss, coarse->oSites(), 1, {
451 CoarseInner_[ss] = coarse_inner_[ss];
452 });
453 }
454}
455
456template<class vobj,class CComplex>
457inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
458{
459 GridBase *coarse = ip.Grid();
460 Lattice<vobj> zz(fineX.Grid()); zz=Zero(); zz.Checkerboard()=fineX.Checkerboard();
461 blockInnerProduct(ip,fineX,fineX);
462 ip = pow(ip,-0.5);
463 blockZAXPY(fineX,ip,fineX,zz);
464}
465// useful in multigrid project;
466// Generic name : Coarsen?
467template<class vobj>
468inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
469{
470 const int maxsubsec=256;
471 typedef iVector<vobj,maxsubsec> vSubsec;
472
473 GridBase * fine = fineData.Grid();
474 GridBase * coarse= coarseData.Grid();
475
476 subdivides(coarse,fine); // require they map
477
478 int _ndimension = coarse->_ndimension;
479
480 Coordinate block_r (_ndimension);
481
482 for(int d=0 ; d<_ndimension;d++){
483 block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
484 }
485 int blockVol = fine->oSites()/coarse->oSites();
486
487  // Turning this around to thread over sc, with an interior loop
488  // over sf, would thread better
489 autoView( coarseData_ , coarseData, AcceleratorWrite);
490 autoView( fineData_ , fineData, AcceleratorRead);
491
492 auto coarseData_p = &coarseData_[0];
493 auto fineData_p = &fineData_[0];
494
495 Coordinate fine_rdimensions = fine->_rdimensions;
496 Coordinate coarse_rdimensions = coarse->_rdimensions;
497
498 vobj zz = Zero();
499
500 // Somewhat lazy calculation
501 // Find the biggest power of two subsection divisor less than or equal to maxsubsec
502 int subsec=maxsubsec;
503 int subvol;
504 subvol=blockVol/subsec;
505 while(subvol*subsec!=blockVol){
506 subsec = subsec/2;
507 subvol=blockVol/subsec;
508 };
509
510 Lattice<vSubsec> coarseTmp(coarse);
511 autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
512 auto coarseTmp_p= &coarseTmp_[0];
513
514 // Sum within subsecs in a first kernel
515 accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
516
517 int sc=sce/subsec;
518 int e=sce%subsec;
519
520 // One thread per sub block
521 Coordinate coor_c(_ndimension);
522 Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
523
524 auto cd = coalescedRead(zz);
525 for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
526 int sf;
527 Coordinate coor_b(_ndimension);
528 Coordinate coor_f(_ndimension);
529 Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
530 for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
531 Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
532
533 cd=cd+coalescedRead(fineData_p[sf]);
534 }
535
536 coalescedWrite(coarseTmp_[sc](e),cd);
537
538 });
539 // Sum across subsecs in a second kernel
540 accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
541 auto cd = coalescedRead(coarseTmp_p[sc](0));
542 for(int e=1;e<subsec;e++){
543 cd=cd+coalescedRead(coarseTmp_p[sc](e));
544 }
545 coalescedWrite(coarseData_p[sc],cd);
546 });
547
548 return;
549}
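////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// blockSum reduces every coarse cell of a fine field onto the corresponding coarse site;
// dividing by the block volume turns the sum into a block average.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline void exampleBlockSum(Lattice<vobj> &coarseSum,const Lattice<vobj> &fineData)
{
  GridBase *fine   = fineData.Grid();
  GridBase *coarse = coarseSum.Grid();
  blockSum(coarseSum,fineData);                       // each coarse site = sum over its block of fine sites
  int64_t blockVol = fine->gSites()/coarse->gSites();
  (void)blockVol;                                     // dividing coarseSum by blockVol would give the average
}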
550
551
552template<class vobj>
553inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
554{
555 GridBase * fine = unpicked.Grid();
556
557 Lattice<vobj> zz(fine); zz.Checkerboard() = unpicked.Checkerboard();
558 Lattice<iScalar<vInteger> > fcoor(fine);
559
560 zz = Zero();
561
562 picked = unpicked;
563 for(int d=0;d<fine->_ndimension;d++){
564 LatticeCoordinate(fcoor,d);
565 int block= fine->_rdimensions[d] / coarse->_rdimensions[d];
566 int lo = (coor[d])*block;
567 int hi = (coor[d]+1)*block;
568 picked = where( (fcoor<hi) , picked, zz);
569 picked = where( (fcoor>=lo), picked, zz);
570 }
571}
572
573template<class CComplex,class VLattice>
574inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
575{
576 GridBase *coarse = ip.Grid();
577 GridBase *fine = Basis[0].Grid();
578
579 int nbasis = Basis.size() ;
580
581 // checks
582 subdivides(coarse,fine);
583 for(int i=0;i<nbasis;i++){
584 conformable(Basis[i].Grid(),fine);
585 }
586
587 for(int v=0;v<nbasis;v++) {
588 for(int u=0;u<v;u++) {
589 //Inner product & remove component
590 blockInnerProductD(ip,Basis[u],Basis[v]);
591 ip = -ip;
592 blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
593 }
594 blockNormalise(ip,Basis[v]);
595 }
596}
597
598template<class vobj,class CComplex>
599inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
600{
601 blockOrthonormalize(ip,Basis);
602}
603
604#ifdef GRID_ACCELERATED
605// TODO: CPU optimized version here
606template<class vobj,class CComplex,int nbasis>
607inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
608 Lattice<vobj> &fineData,
609 const std::vector<Lattice<vobj> > &Basis)
610{
611 GridBase * fine = fineData.Grid();
612 GridBase * coarse= coarseData.Grid();
613 int _ndimension = coarse->_ndimension;
614
615 // checks
616 assert( nbasis == Basis.size() );
617 subdivides(coarse,fine);
618 for(int i=0;i<nbasis;i++){
619 conformable(Basis[i].Grid(),fine);
620 }
621
622 Coordinate block_r (_ndimension);
623
624 for(int d=0 ; d<_ndimension;d++){
625 block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
626 }
627 autoView( fineData_ , fineData, AcceleratorWrite);
628 autoView( coarseData_ , coarseData, AcceleratorRead);
629
630 typedef LatticeView<vobj> Vview;
631 std::vector<Vview> AcceleratorVecViewContainer_h;
632 for(int v=0;v<nbasis;v++) {
633 AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
634 }
635 static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
636 acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
637 auto Basis_p = &AcceleratorVecViewContainer[0];
638 // Loop with a cache friendly loop ordering
639 Coordinate frdimensions=fine->_rdimensions;
640 Coordinate crdimensions=coarse->_rdimensions;
641 accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
642 int sc;
643 Coordinate coor_c(_ndimension);
644 Coordinate coor_f(_ndimension);
645
646 Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
647 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
648 Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
649
650 auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
651 for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
652 coalescedWrite(fineData_[sf],sum);
653 });
654 for(int v=0;v<nbasis;v++) {
655 AcceleratorVecViewContainer_h[v].ViewClose();
656 }
657 return;
658}
659#else
660// CPU version
661template<class vobj,class CComplex,int nbasis,class VLattice>
662inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
663 Lattice<vobj> &fineData,
664 const VLattice &Basis)
665{
666 GridBase * fine = fineData.Grid();
667 GridBase * coarse= coarseData.Grid();
668 fineData=Zero();
669 for(int i=0;i<nbasis;i++) {
670 Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
671
672 //Lattice<CComplex> cip(coarse);
673 //autoView( cip_ , cip, AcceleratorWrite);
674 //autoView( ip_ , ip, AcceleratorRead);
675 //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
676 // coalescedWrite(cip_[sc], ip_(sc)());
677 // });
678 //blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
679 blockZAXPY(fineData,ip,Basis[i],fineData);
680 }
681}
682#endif
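////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// blockPromote is the (approximate) inverse of blockProject: for a block-orthonormal basis,
// project then promote reconstructs the component of the fine vector inside span(basis);
// the returned norm measures what lives outside the coarse space.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis>
inline RealD exampleProjectPromoteResidual(Lattice<iVector<CComplex,nbasis> > &coarseVec,
					   const Lattice<vobj> &fineVec,
					   const std::vector<Lattice<vobj> > &basis)
{
  Lattice<vobj> reco(fineVec.Grid());
  blockProject(coarseVec,fineVec,basis);  // coarse degrees of freedom
  blockPromote(coarseVec,reco,basis);     // reconstruction within span(basis)
  reco = reco - fineVec;                  // residual outside the coarse space
  return norm2(reco);
}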
683
684template<class vobj,class CComplex,int nbasis,class VLattice>
685inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
686 std::vector<Lattice<vobj>> &fineData,
687 const VLattice &Basis)
688{
689 int NBatch = coarseData.size();
690 assert(fineData.size() == NBatch);
691
692 GridBase * fine = fineData[0].Grid();
693 GridBase * coarse = coarseData[0].Grid();
694 for (int k=0; k<NBatch; k++)
695 fineData[k]=Zero();
696 for (int i=0;i<nbasis;i++) {
697 for (int k=0; k<NBatch; k++) {
698 Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
699 blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
700 }
701 }
702}
703
704// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
705// Simd layouts need not match since we use peek/poke Local
706template<class vobj,class vvobj>
707void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
708{
709 typedef typename vobj::scalar_object sobj;
710 typedef typename vvobj::scalar_object ssobj;
711
712 GridBase *ig = in.Grid();
713 GridBase *og = out.Grid();
714
715 int ni = ig->_ndimension;
716 int no = og->_ndimension;
717
718 assert(ni == no);
719
720 for(int d=0;d<no;d++){
721 assert(ig->_processors[d] == og->_processors[d]);
722 assert(ig->_ldimensions[d] == og->_ldimensions[d]);
723 assert(ig->lSites() == og->lSites());
724 }
725
726 autoView(in_v,in,CpuRead);
727 autoView(out_v,out,CpuWrite);
728 thread_for(idx, ig->lSites(),{
729 sobj s;
730 ssobj ss;
731
732 Coordinate lcoor(ni);
733 ig->LocalIndexToLocalCoor(idx,lcoor);
734 peekLocalSite(s,in_v,lcoor);
735 ss=s;
736 pokeLocalSite(ss,out_v,lcoor);
737 });
738}
739
740template<class vobj>
741void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
742{
743 typedef typename vobj::scalar_object sobj;
744 typedef typename vobj::scalar_type scalar_type;
745 typedef typename vobj::vector_type vector_type;
746
747 const int words=sizeof(vobj)/sizeof(vector_type);
748
750 // checks should guarantee that the operations are local
752
753 GridBase *Fg = From.Grid();
754 GridBase *Tg = To.Grid();
755 assert(!Fg->_isCheckerBoarded);
756 assert(!Tg->_isCheckerBoarded);
757 int Nsimd = Fg->Nsimd();
758 int nF = Fg->_ndimension;
759 int nT = Tg->_ndimension;
760 int nd = nF;
761 assert(nF == nT);
762
763 for(int d=0;d<nd;d++){
764 assert(Fg->_processors[d] == Tg->_processors[d]);
765 }
766
768 // do the index calc on the GPU
770 Coordinate f_ostride = Fg->_ostride;
771 Coordinate f_istride = Fg->_istride;
772 Coordinate f_rdimensions = Fg->_rdimensions;
773 Coordinate t_ostride = Tg->_ostride;
774 Coordinate t_istride = Tg->_istride;
775 Coordinate t_rdimensions = Tg->_rdimensions;
776
777 size_t nsite = 1;
778 for(int i=0;i<nd;i++) nsite *= RegionSize[i];
779
780 typedef typename vobj::vector_type vector_type;
781 typedef typename vobj::scalar_type scalar_type;
782
783 autoView(from_v,From,AcceleratorRead);
784 autoView(to_v,To,AcceleratorWrite);
785
786 accelerator_for(idx,nsite,1,{
787
788 Coordinate from_coor, to_coor, base;
789 Lexicographic::CoorFromIndex(base,idx,RegionSize);
790 for(int i=0;i<nd;i++){
791 from_coor[i] = base[i] + FromLowerLeft[i];
792 to_coor[i] = base[i] + ToLowerLeft[i];
793 }
794 int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
795 int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
796 int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
797 int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
798
799 const vector_type* from = (const vector_type *)&from_v[from_oidx];
800 vector_type* to = (vector_type *)&to_v[to_oidx];
801
802 scalar_type stmp;
803 for(int w=0;w<words;w++){
804 stmp = getlane(from[w], from_lane);
805 putlane(to[w], stmp, to_lane);
806 }
807 });
808}
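////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Copy a rectangular sub-volume between two fields; origins and sizes are node-local
// coordinates, so the copy is purely local (matching processor decomposition is asserted above).
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline void exampleCopyLowerHalf(const Lattice<vobj> &src,Lattice<vobj> &dst)
{
  GridBase *g = src.Grid();
  Coordinate origin(g->_ndimension,0);     // lower-left corner in both source and destination
  Coordinate size = g->_ldimensions;       // whole local volume ...
  size[0] = size[0]/2;                     // ... restricted to the lower half of dimension 0
  localCopyRegion(src,dst,origin,origin,size);
}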
809
810template<class vobj>
811void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog)
812{
813 typedef typename vobj::scalar_object sobj;
814 typedef typename vobj::scalar_type scalar_type;
815 typedef typename vobj::vector_type vector_type;
816
817 const int words=sizeof(vobj)/sizeof(vector_type);
818
820 // checks should guarantee that the operations are local
822 GridBase *Fg = From.Grid();
823 GridBase *Tg = To.Grid();
824 assert(!Fg->_isCheckerBoarded);
825 assert(!Tg->_isCheckerBoarded);
826 int Nsimd = Fg->Nsimd();
827 int nF = Fg->_ndimension;
828 int nT = Tg->_ndimension;
829 assert(nF+1 == nT);
830
832 // do the index calc on the GPU
834 Coordinate f_ostride = Fg->_ostride;
835 Coordinate f_istride = Fg->_istride;
836 Coordinate f_rdimensions = Fg->_rdimensions;
837 Coordinate t_ostride = Tg->_ostride;
838 Coordinate t_istride = Tg->_istride;
839 Coordinate t_rdimensions = Tg->_rdimensions;
840 Coordinate RegionSize = Fg->_ldimensions;
841 size_t nsite = 1;
842 for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
843
844 typedef typename vobj::vector_type vector_type;
845 typedef typename vobj::scalar_type scalar_type;
846
847 autoView(from_v,From,AcceleratorRead);
848 autoView(to_v,To,AcceleratorWrite);
849
850 accelerator_for(idx,nsite,1,{
851
852 Coordinate from_coor(nF), to_coor(nT);
853 Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
854 int j=0;
855 for(int i=0;i<nT;i++){
856 if ( i!=orthog ) {
857 to_coor[i] = from_coor[j];
858 j++;
859 } else {
860 to_coor[i] = slice;
861 }
862 }
863 int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
864 int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
865 int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
866 int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
867
868 const vector_type* from = (const vector_type *)&from_v[from_oidx];
869 vector_type* to = (vector_type *)&to_v[to_oidx];
870
871 scalar_type stmp;
872 for(int w=0;w<words;w++){
873 stmp = getlane(from[w], from_lane);
874 putlane(to[w], stmp, to_lane);
875 }
876 });
877}
878
879template<class vobj>
880void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
881{
882 typedef typename vobj::scalar_object sobj;
883 typedef typename vobj::scalar_type scalar_type;
884 typedef typename vobj::vector_type vector_type;
885
886 const int words=sizeof(vobj)/sizeof(vector_type);
887
889 // checks should guarantee that the operations are local
891 GridBase *Fg = From.Grid();
892 GridBase *Tg = To.Grid();
893 assert(!Fg->_isCheckerBoarded);
894 assert(!Tg->_isCheckerBoarded);
895 int Nsimd = Fg->Nsimd();
896 int nF = Fg->_ndimension;
897 int nT = Tg->_ndimension;
898 assert(nT+1 == nF);
899
901 // do the index calc on the GPU
903 Coordinate f_ostride = Fg->_ostride;
904 Coordinate f_istride = Fg->_istride;
905 Coordinate f_rdimensions = Fg->_rdimensions;
906 Coordinate t_ostride = Tg->_ostride;
907 Coordinate t_istride = Tg->_istride;
908 Coordinate t_rdimensions = Tg->_rdimensions;
909 Coordinate RegionSize = Tg->_ldimensions;
910 size_t nsite = 1;
911 for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
912
913 typedef typename vobj::vector_type vector_type;
914 typedef typename vobj::scalar_type scalar_type;
915
916 autoView(from_v,From,AcceleratorRead);
917 autoView(to_v,To,AcceleratorWrite);
918
919 accelerator_for(idx,nsite,1,{
920
921 Coordinate from_coor(nF), to_coor(nT);
922 Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
923 int j=0;
924 for(int i=0;i<nF;i++){
925 if ( i!=orthog ) {
926 from_coor[i] = to_coor[j];
927 j++;
928 } else {
929 from_coor[i] = slice;
930 }
931 }
932 int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
933 int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
934 int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
935 int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
936
937 const vector_type* from = (const vector_type *)&from_v[from_oidx];
938 vector_type* to = (vector_type *)&to_v[to_oidx];
939
940 scalar_type stmp;
941 for(int w=0;w<words;w++){
942 stmp = getlane(from[w], from_lane);
943 putlane(to[w], stmp, to_lane);
944 }
945 });
946}
947
948template<class vobj>
949void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
950{
951 typedef typename vobj::scalar_object sobj;
952
953 GridBase *lg = lowDim.Grid();
954 GridBase *hg = higherDim.Grid();
955 int nl = lg->_ndimension;
956 int nh = hg->_ndimension;
957
958 assert(nl+1 == nh);
959 assert(orthog<nh);
960 assert(orthog>=0);
961 assert(hg->_processors[orthog]==1);
962
963 int dl; dl = 0;
964 for(int d=0;d<nh;d++){
965 if ( d != orthog) {
966 assert(lg->_processors[dl] == hg->_processors[d]);
967 assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
968 dl++;
969 }
970 }
971
972 // the above should guarantee that the operations are local
973 autoView(lowDimv,lowDim,CpuRead);
974 autoView(higherDimv,higherDim,CpuWrite);
975 thread_for(idx,lg->lSites(),{
976 sobj s;
977 Coordinate lcoor(nl);
978 Coordinate hcoor(nh);
979 lg->LocalIndexToLocalCoor(idx,lcoor);
980 int ddl=0;
981 hcoor[orthog] = slice;
982 for(int d=0;d<nh;d++){
983 if ( d!=orthog ) {
984 hcoor[d]=lcoor[ddl];
985 if ( hg->_checker_dim == d ) {
986 hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
987 lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
988 }
989 ddl++;
990 }
991
992 }
993 peekLocalSite(s,lowDimv,lcoor);
994 pokeLocalSite(s,higherDimv,hcoor);
995 });
996}
997
998template<class vobj>
999void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice, int orthog)
1000{
1001 typedef typename vobj::scalar_object sobj;
1002
1003 GridBase *lg = lowDim.Grid();
1004 GridBase *hg = higherDim.Grid();
1005 int nl = lg->_ndimension;
1006 int nh = hg->_ndimension;
1007
1008 assert(nl+1 == nh);
1009 assert(orthog<nh);
1010 assert(orthog>=0);
1011 assert(hg->_processors[orthog]==1);
1012 lowDim.Checkerboard() = higherDim.Checkerboard();
1013
1014 int dl; dl = 0;
1015 for(int d=0;d<nh;d++){
1016 if ( d != orthog) {
1017 assert(lg->_processors[dl] == hg->_processors[d]);
1018 assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
1019 dl++;
1020 }
1021 }
1022 // the above should guarantee that the operations are local
1023 autoView(lowDimv,lowDim,CpuWrite);
1024 autoView(higherDimv,higherDim,CpuRead);
1025 thread_for(idx,lg->lSites(),{
1026 sobj s;
1027 Coordinate lcoor(nl);
1028 Coordinate hcoor(nh);
1029 lg->LocalIndexToLocalCoor(idx,lcoor);
1030 hcoor[orthog] = slice;
1031 int ddl=0;
1032 for(int d=0;d<nh;d++){
1033 if ( d!=orthog ) {
1034 hcoor[d]=lcoor[ddl];
1035 if ( hg->_checker_dim == d ) {
1036	hcoor[d]=hcoor[d]*2; // factor in the full grid coor for peekLocalSite
1037 lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
1038 }
1039 ddl++;
1040 }
1041 }
1042 peekLocalSite(s,higherDimv,hcoor);
1043 pokeLocalSite(s,lowDimv,lcoor);
1044 });
1045
1046}
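////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Typical 5d <-> 4d traffic: copy a 4d field into one s-slice of a 5d field and back.
// The orthogonal direction must not be MPI decomposed (asserted above); for Grid's 5d
// fermion grids the fifth dimension is index 0.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline void exampleSliceRoundTrip(Lattice<vobj> &low4d,Lattice<vobj> &high5d,int s)
{
  const int orthog = 0;                   // slice along the extra (fifth) dimension
  InsertSlice (low4d,high5d,s,orthog);    // low4d -> slice s of high5d
  ExtractSlice(low4d,high5d,s,orthog);    // ... and back again
}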
1047
1048//Can I implement with local copyregion??
1049template<class vobj>
1050void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
1051{
1052 typedef typename vobj::scalar_object sobj;
1053
1054 GridBase *lg = lowDim.Grid();
1055 GridBase *hg = higherDim.Grid();
1056 int nl = lg->_ndimension;
1057 int nh = hg->_ndimension;
1058
1059 assert(nl == nh);
1060 assert(orthog<nh);
1061 assert(orthog>=0);
1062
1063 for(int d=0;d<nh;d++){
1064 if ( d!=orthog ) {
1065 assert(lg->_processors[d] == hg->_processors[d]);
1066 assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
1067 }
1068 }
1069 Coordinate sz = lg->_ldimensions;
1070 sz[orthog]=1;
1071 Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
1072 Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
1073 localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz);
1074}
1075
1076
1077template<class vobj>
1078void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
1079{
1080 InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog);
1081}
1082
1083
1084template<class vobj>
1085void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
1086{
1087 typedef typename vobj::scalar_object sobj;
1088
1089 GridBase *cg = coarse.Grid();
1090 GridBase *fg = fine.Grid();
1091
1092 int nd = cg->_ndimension;
1093
1094 subdivides(cg,fg);
1095
1096 assert(cg->_ndimension==fg->_ndimension);
1097
1098 Coordinate ratio(cg->_ndimension);
1099
1100 for(int d=0;d<cg->_ndimension;d++){
1101 ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d];
1102 }
1103
1104 Coordinate fcoor(nd);
1105 Coordinate ccoor(nd);
1106 for(int64_t g=0;g<fg->gSites();g++){
1107
1108 fg->GlobalIndexToGlobalCoor(g,fcoor);
1109 for(int d=0;d<nd;d++){
1110 ccoor[d] = fcoor[d]%cg->_gdimensions[d];
1111 }
1112
1113 sobj tmp;
1114 peekSite(tmp,coarse,ccoor);
1115 pokeSite(tmp,fine,fcoor);
1116 }
1117
1118}
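////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Replicate tiles a small lattice periodically across a larger one, e.g. to blow a small
// test configuration up to a large volume: big(x) = small(x mod L_small) dimension by
// dimension.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline void exampleTile(const Lattice<vobj> &small,Lattice<vobj> &big)
{
  Replicate(small,big);   // requires subdivides(small.Grid(),big.Grid()) to hold
}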
1119
1120//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
1121template<typename vobj, typename sobj>
1122typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type
1123unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
1124{
1125
1126 typedef typename vobj::vector_type vtype;
1127
1128 GridBase* in_grid = in.Grid();
1129 out.resize(in_grid->lSites());
1130
1131 int ndim = in_grid->Nd();
1132 int in_nsimd = vtype::Nsimd();
1133
1134 std::vector<Coordinate > in_icoor(in_nsimd);
1135
1136 for(int lane=0; lane < in_nsimd; lane++){
1137 in_icoor[lane].resize(ndim);
1138 in_grid->iCoorFromIindex(in_icoor[lane], lane);
1139 }
1140
1141 //loop over outer index
1142 autoView( in_v , in, CpuRead);
1143 thread_for(in_oidx,in_grid->oSites(),{
1144 //Assemble vector of pointers to output elements
1145 ExtractPointerArray<sobj> out_ptrs(in_nsimd);
1146
1147 Coordinate in_ocoor(ndim);
1148 in_grid->oCoorFromOindex(in_ocoor, in_oidx);
1149
1150 Coordinate lcoor(in_grid->Nd());
1151
1152 for(int lane=0; lane < in_nsimd; lane++){
1153
1154 for(int mu=0;mu<ndim;mu++){
1155 lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
1156 }
1157
1158 int lex;
1159 Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
1160 assert(lex < out.size());
1161 out_ptrs[lane] = &out[lex];
1162 }
1163
1164 //Unpack into those ptrs
1165 const vobj & in_vobj = in_v[in_oidx];
1166 extract(in_vobj, out_ptrs, 0);
1167 });
1168}
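////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Flatten a SIMD-vectorized lattice into scalar site objects in local lexicographic order,
// the layout used by the I/O and split-grid routines further down this file.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline std::vector<typename vobj::scalar_object> exampleToLexVector(const Lattice<vobj> &in)
{
  std::vector<typename vobj::scalar_object> flat;  // resized to in.Grid()->lSites() inside
  unvectorizeToLexOrdArray(flat,in);
  return flat;                                     // flat[lex] is the site with local lexicographic index lex
}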
1169
1170template<typename vobj, typename sobj>
1171typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type
1172unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
1173{
1174
1175 typedef typename vobj::vector_type vtype;
1176
1177 GridBase* in_grid = in._grid;
1178 out.resize(in_grid->lSites());
1179
1180 int ndim = in_grid->Nd();
1181 int in_nsimd = vtype::Nsimd();
1182
1183 std::vector<Coordinate > in_icoor(in_nsimd);
1184
1185 for(int lane=0; lane < in_nsimd; lane++){
1186 in_icoor[lane].resize(ndim);
1187 in_grid->iCoorFromIindex(in_icoor[lane], lane);
1188 }
1189
1190 thread_for(in_oidx, in_grid->oSites(),{
1191 //Assemble vector of pointers to output elements
1192 std::vector<sobj*> out_ptrs(in_nsimd);
1193
1194 Coordinate in_ocoor(ndim);
1195 in_grid->oCoorFromOindex(in_ocoor, in_oidx);
1196
1197 Coordinate lcoor(in_grid->Nd());
1198
1199 for(int lane=0; lane < in_nsimd; lane++){
1200 for(int mu=0;mu<ndim;mu++)
1201 lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
1202
1203 int lex;
1204 Lexicographic::IndexFromCoorReversed(lcoor, lex, in_grid->_ldimensions);
1205 out_ptrs[lane] = &out[lex];
1206 }
1207
1208 //Unpack into those ptrs
1209 const vobj & in_vobj = in._odata[in_oidx];
1210 extract1(in_vobj, out_ptrs, 0);
1211 });
1212}
1213
1214//Copy an array of scalar objects in lexicographic order into a SIMD-vectorized lattice
1215template<typename vobj, typename sobj>
1216typename std::enable_if<isSIMDvectorized<vobj>::value
1217			 && !isSIMDvectorized<sobj>::value, void>::type
1218vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
1219{
1220
1221 typedef typename vobj::vector_type vtype;
1222
1223 GridBase* grid = out.Grid();
1224 assert(in.size()==grid->lSites());
1225
1226 const int ndim = grid->Nd();
1227 constexpr int nsimd = vtype::Nsimd();
1228
1229 std::vector<Coordinate > icoor(nsimd);
1230
1231 for(int lane=0; lane < nsimd; lane++){
1232 icoor[lane].resize(ndim);
1233 grid->iCoorFromIindex(icoor[lane],lane);
1234 }
1235 autoView( out_v , out, CpuWrite);
1236 thread_for(oidx, grid->oSites(),{
1237 //Assemble vector of pointers to output elements
1238 ExtractPointerArray<sobj> ptrs(nsimd);
1239
1240 Coordinate ocoor(ndim);
1241 Coordinate lcoor(ndim);
1242 grid->oCoorFromOindex(ocoor, oidx);
1243
1244 for(int lane=0; lane < nsimd; lane++){
1245
1246 for(int mu=0;mu<ndim;mu++){
1247 lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
1248 }
1249
1250 int lex;
1251 Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
1252 ptrs[lane] = &in[lex];
1253 }
1254
1255 //pack from those ptrs
1256 vobj vecobj;
1257 merge(vecobj, ptrs, 0);
1258 out_v[oidx] = vecobj;
1259 });
1260}
1261
1262template<typename vobj, typename sobj>
1263typename std::enable_if<isSIMDvectorized<vobj>::value
1264			 && !isSIMDvectorized<sobj>::value, void>::type
1265vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
1266{
1267
1268 typedef typename vobj::vector_type vtype;
1269
1270 GridBase* grid = out._grid;
1271 assert(in.size()==grid->lSites());
1272
1273 int ndim = grid->Nd();
1274 int nsimd = vtype::Nsimd();
1275
1276 std::vector<Coordinate > icoor(nsimd);
1277
1278 for(int lane=0; lane < nsimd; lane++){
1279 icoor[lane].resize(ndim);
1280 grid->iCoorFromIindex(icoor[lane],lane);
1281 }
1282
1283 thread_for(oidx, grid->oSites(), {
1284 //Assemble vector of pointers to output elements
1285 std::vector<sobj*> ptrs(nsimd);
1286
1287 Coordinate ocoor(ndim);
1288 grid->oCoorFromOindex(ocoor, oidx);
1289
1290 Coordinate lcoor(grid->Nd());
1291
1292 for(int lane=0; lane < nsimd; lane++){
1293
1294 for(int mu=0;mu<ndim;mu++){
1295 lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
1296 }
1297
1298 int lex;
1299 Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
1300 ptrs[lane] = &in[lex];
1301 }
1302
1303 //pack from those ptrs
1304 vobj vecobj;
1305 merge1(vecobj, ptrs, 0);
1306 out._odata[oidx] = vecobj;
1307 });
1308}
1309
1310//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
1311template<class VobjOut, class VobjIn>
1312void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
1313{
1314 typedef typename VobjOut::vector_type Vout;
1315 typedef typename VobjIn::vector_type Vin;
1316 const int N = sizeof(VobjOut)/sizeof(Vout);
1317 conformable(out.Grid(),in.Grid());
1318 out.Checkerboard() = in.Checkerboard();
1319 int nsimd = out.Grid()->Nsimd();
1320 autoView( out_v , out, AcceleratorWrite);
1321 autoView( in_v , in, AcceleratorRead);
1322 accelerator_for(idx,out.Grid()->oSites(),1,{
1323 Vout *vout = (Vout *)&out_v[idx];
1324 Vin *vin = (Vin *)&in_v[idx];
1325 precisionChange(vout,vin,N);
1326 });
1327}
1328//Convert a Lattice from one precision to another (original, slow implementation)
1329template<class VobjOut, class VobjIn>
1330void precisionChangeOriginal(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
1331{
1332 assert(out.Grid()->Nd() == in.Grid()->Nd());
1333 for(int d=0;d<out.Grid()->Nd();d++){
1334 assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
1335 }
1336 out.Checkerboard() = in.Checkerboard();
1337 GridBase *in_grid=in.Grid();
1338 GridBase *out_grid = out.Grid();
1339
1340 typedef typename VobjOut::scalar_object SobjOut;
1341 typedef typename VobjIn::scalar_object SobjIn;
1342
1343 int ndim = out.Grid()->Nd();
1344 int out_nsimd = out_grid->Nsimd();
1345 int in_nsimd = in_grid->Nsimd();
1346 std::vector<Coordinate > out_icoor(out_nsimd);
1347
1348 for(int lane=0; lane < out_nsimd; lane++){
1349 out_icoor[lane].resize(ndim);
1350 out_grid->iCoorFromIindex(out_icoor[lane], lane);
1351 }
1352
1353 std::vector<SobjOut> in_slex_conv(in_grid->lSites());
1354 unvectorizeToLexOrdArray(in_slex_conv, in);
1355
1356 autoView( out_v , out, CpuWrite);
1357 thread_for(out_oidx,out_grid->oSites(),{
1358 Coordinate out_ocoor(ndim);
1359 out_grid->oCoorFromOindex(out_ocoor, out_oidx);
1360
1361 ExtractPointerArray<SobjOut> ptrs(out_nsimd);
1362
1363 Coordinate lcoor(out_grid->Nd());
1364
1365 for(int lane=0; lane < out_nsimd; lane++){
1366 for(int mu=0;mu<ndim;mu++)
1367 lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
1368
1369 int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
1370 ptrs[lane] = &in_slex_conv[llex];
1371 }
1372 merge(out_v[out_oidx], ptrs, 0);
1373 });
1374}
1375
1376//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
1377class precisionChangeWorkspace{
1378  std::pair<Integer,Integer>* fmap_device; //device pointer
1379  //maintain grids for checking
1380  GridBase* _out_grid;
1381  GridBase* _in_grid;
1382public:
1383 precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
1384 //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
1385 assert(out_grid->Nd() == in_grid->Nd());
1386 for(int d=0;d<out_grid->Nd();d++){
1387 assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
1388 }
1389 int Nsimd_out = out_grid->Nsimd();
1390
1391 std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
1392 for(int lane=0; lane < out_grid->Nsimd(); lane++)
1393 out_grid->iCoorFromIindex(out_icorrs[lane], lane);
1394
1395 std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
1396 thread_for(out_oidx,out_grid->oSites(),{
1397 Coordinate out_ocorr;
1398 out_grid->oCoorFromOindex(out_ocorr, out_oidx);
1399
1400 Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
1401 for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
1402 out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
1403
1404 //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
1405	//Note oIndex and oCoorFromOindex (and same for iIndex) are not inverses for a checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
1406 //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
1407 int in_oidx = 0, in_lane = 0;
1408 for(int d=0;d<in_grid->_ndimension;d++){
1409 in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
1410 in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
1411 }
1412 fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
1413 }
1414 });
1415
1416 //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
1417 size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
1418 fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
1419 acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
1420 }
1421
1422 //Prevent moving or copying
1423  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
1424  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
1425  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
1426  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
1427
1428 std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
1429
1430 void checkGrids(GridBase* out, GridBase* in) const{
1431 conformable(out, _out_grid);
1432 conformable(in, _in_grid);
1433 }
1434
1435  ~precisionChangeWorkspace(){
1436    acceleratorFreeDevice(fmap_device);
1437  }
1438};
1439
1440
1441//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
1442//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
1443template<class VobjOut, class VobjIn>
1444auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
1445 if(out.Grid() == in.Grid()){
1446 precisionChangeFast(out,in);
1447 return 1;
1448 }else{
1449 return 0;
1450 }
1451}
1452template<class VobjOut, class VobjIn>
1453int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
1454 return 0;
1455}
1456
1457
1458//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
1459//which contains the mapping data.
1460template<class VobjOut, class VobjIn>
1461void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
1462  if(_precisionChangeFastWrap(out,in,0)) return;
1463
1464 static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
1465
1466 out.Checkerboard() = in.Checkerboard();
1467 constexpr int Nsimd_out = VobjOut::Nsimd();
1468
1469 workspace.checkGrids(out.Grid(),in.Grid());
1470 std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
1471
1472 //Do the copy/precision change
1473 autoView( out_v , out, AcceleratorWrite);
1474 autoView( in_v , in, AcceleratorRead);
1475
1476 accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
1477 std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
1478 for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
1479 int in_oidx = fmap_osite[out_lane].first;
1480 int in_lane = fmap_osite[out_lane].second;
1481 copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
1482 }
1483 });
1484}
1485
1486//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
1487//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
1488template<class VobjOut, class VobjIn>
1489void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
1490  if(_precisionChangeFastWrap(out,in,0)) return;
1491 precisionChangeWorkspace workspace(out.Grid(), in.Grid());
1492 precisionChange(out, in, workspace);
1493}
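////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// When many fields are converted between the same pair of grids, build the workspace once
// and reuse it; each call then only pays for the copy/convert kernel, not the map build.
////////////////////////////////////////////////////////////////////////////////////////////
template<class VobjOut,class VobjIn>
inline void examplePrecisionChangeMany(std::vector<Lattice<VobjOut> > &out,
				       const std::vector<Lattice<VobjIn> > &in)
{
  assert(out.size()==in.size() && out.size()>0);
  precisionChangeWorkspace workspace(out[0].Grid(),in[0].Grid());  // build the site/lane map once
  for(size_t i=0;i<in.size();i++) precisionChange(out[i],in[i],workspace);
}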
1494
1495
1496
1497
1499// Communicate between grids
1502// SIMPLE CASE:
1504//
1505// Mesh of nodes (2x2) ; subdivide to 1x1 subdivisions
1506//
1507// Lex ord:
1508// N0 va0 vb0 vc0 vd0 N1 va1 vb1 vc1 vd1
1509// N2 va2 vb2 vc2 vd2 N3 va3 vb3 vc3 vd3
1510//
1511// Ratio = full[dim] / split[dim]
1512//
1513// For each dimension do an all to all; get Nvec -> Nvec / ratio
1514// Ldim -> Ldim * ratio
1515// LocalVol -> LocalVol * ratio
1516// full AllToAll(0)
1517// N0 va0 vb0 va1 vb1 N1 vc0 vd0 vc1 vd1
1518// N2 va2 vb2 va3 vb3 N3 vc2 vd2 vc3 vd3
1519//
1520// REARRANGE
1521// N0 va01 vb01 N1 vc01 vd01
1522// N2 va23 vb23 N3 vc23 vd23
1523//
1524// full AllToAll(1) // Not what is wanted. FIXME
1525// N0 va01 va23 N1 vc01 vc23
1526// N2 vb01 vb23 N3 vd01 vd23
1527//
1528// REARRANGE
1529// N0 va0123 N1 vc0123
1530// N2 vb0123 N3 vd0123
1531//
1532// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
1533// NB: Easiest to programme if keep in lex order.
1534/*
1535 * Let chunk = (fvol*nvec)/sP be size of a chunk. ( Divide lexico vol * nvec into fP/sP = M chunks )
1536 *
1537 * 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
1538 *
1539 * node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1).. data chunk x M x sP = fL / sP * M * sP = fL * M growth
1540 * node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
1541 * node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
1542 * node 3 1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
1543 * etc...
1544 */
1545template<class Vobj>
1546void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
1547{
1548 typedef typename Vobj::scalar_object Sobj;
1549
1550 int full_vecs = full.size();
1551
1552 assert(full_vecs>=1);
1553
1554 GridBase * full_grid = full[0].Grid();
1555 GridBase *split_grid = split.Grid();
1556
1557 int ndim = full_grid->_ndimension;
1558 int full_nproc = full_grid->_Nprocessors;
1559 int split_nproc =split_grid->_Nprocessors;
1560
1562 // Checkerboard management
1564 int cb = full[0].Checkerboard();
1565 split.Checkerboard() = cb;
1566
1568 // Checks
1570 assert(full_grid->_ndimension==split_grid->_ndimension);
1571 for(int n=0;n<full_vecs;n++){
1572 assert(full[n].Checkerboard() == cb);
1573 for(int d=0;d<ndim;d++){
1574 assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
1575 assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
1576 }
1577 }
1578
1579 int nvector =full_nproc/split_nproc;
1580 assert(nvector*split_nproc==full_nproc);
1581 assert(nvector == full_vecs);
1582
1583 Coordinate ratio(ndim);
1584 for(int d=0;d<ndim;d++){
1585 ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
1586 }
1587
1588 uint64_t lsites = full_grid->lSites();
1589 uint64_t sz = lsites * nvector;
1590 std::vector<Sobj> tmpdata(sz);
1591 std::vector<Sobj> alldata(sz);
1592 std::vector<Sobj> scalardata(lsites);
1593
1594 for(int v=0;v<nvector;v++){
1595 unvectorizeToLexOrdArray(scalardata,full[v]);
1596 thread_for(site,lsites,{
1597 alldata[v*lsites+site] = scalardata[site];
1598 });
1599 }
1600
1601 int nvec = nvector; // Counts down to 1 as we collapse dims
1602 Coordinate ldims = full_grid->_ldimensions;
1603
1604 for(int d=ndim-1;d>=0;d--){
1605
1606 if ( ratio[d] != 1 ) {
1607
1608 full_grid ->AllToAll(d,alldata,tmpdata);
1609 if ( split_grid->_processors[d] > 1 ) {
1610 alldata=tmpdata;
1611 split_grid->AllToAll(d,alldata,tmpdata);
1612 }
1613
1614 auto rdims = ldims;
1615 auto M = ratio[d];
1616 auto rsites= lsites*M;// increases rsites by M
1617 nvec /= M; // Reduce nvec by subdivision factor
1618 rdims[d] *= M; // increase local dim by same factor
1619
1620 int sP = split_grid->_processors[d];
1621 int fP = full_grid->_processors[d];
1622
1623 int fvol = lsites;
1624
1625 int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
1626
1627 // Loop over reordered data post A2A
1628 thread_for(c, chunk, {
1629 Coordinate coor(ndim);
1630 for(int m=0;m<M;m++){
1631 for(int s=0;s<sP;s++){
1632
1633 // addressing; use lexico
1634 int lex_r;
1635 uint64_t lex_c = c+chunk*m+chunk*M*s;
1636 uint64_t lex_fvol_vec = c+chunk*s;
1637 uint64_t lex_fvol = lex_fvol_vec%fvol;
1638 uint64_t lex_vec = lex_fvol_vec/fvol;
1639
1640 // which node sets an adder to the coordinate
1641 Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
1642 coor[d] += m*ldims[d];
1643 Lexicographic::IndexFromCoor(coor, lex_r, rdims);
1644 lex_r += lex_vec * rsites;
1645
1646 // LexicoFind coordinate & vector number within split lattice
1647 alldata[lex_r] = tmpdata[lex_c];
1648
1649 }
1650 }
1651 });
1652 ldims[d]*= ratio[d];
1653 lsites *= ratio[d];
1654
1655 }
1656 }
1657 vectorizeFromLexOrdArray(alldata,split);
1658}
1659
1660template<class Vobj>
1661void Grid_split(Lattice<Vobj> &full,Lattice<Vobj> & split)
1662{
1663 int nvector = full.Grid()->_Nprocessors / split.Grid()->_Nprocessors;
1664 std::vector<Lattice<Vobj> > full_v(nvector,full.Grid());
1665 for(int n=0;n<nvector;n++){
1666 full_v[n] = full;
1667 }
1668 Grid_split(full_v,split);
1669}
1670
1671template<class Vobj>
1672void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
1673{
1674 typedef typename Vobj::scalar_object Sobj;
1675
1676 int full_vecs = full.size();
1677
1678 assert(full_vecs>=1);
1679
1680 GridBase * full_grid = full[0].Grid();
1681 GridBase *split_grid = split.Grid();
1682
1683 int ndim = full_grid->_ndimension;
1684 int full_nproc = full_grid->_Nprocessors;
1685 int split_nproc =split_grid->_Nprocessors;
1686
1688 // Checkerboard management
1690 int cb = full[0].Checkerboard();
1691 split.Checkerboard() = cb;
1692
1694 // Checks
1696 assert(full_grid->_ndimension==split_grid->_ndimension);
1697 for(int n=0;n<full_vecs;n++){
1698 assert(full[n].Checkerboard() == cb);
1699 for(int d=0;d<ndim;d++){
1700 assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
1701 assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
1702 }
1703 }
1704
1705 int nvector =full_nproc/split_nproc;
1706 assert(nvector*split_nproc==full_nproc);
1707 assert(nvector == full_vecs);
1708
1709 Coordinate ratio(ndim);
1710 for(int d=0;d<ndim;d++){
1711 ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
1712 }
1713
1714 uint64_t lsites = full_grid->lSites();
1715 uint64_t sz = lsites * nvector;
1716 std::vector<Sobj> tmpdata(sz);
1717 std::vector<Sobj> alldata(sz);
1718 std::vector<Sobj> scalardata(lsites);
1719
1720 unvectorizeToLexOrdArray(alldata,split);
1721
1723 // Start from split grid and work towards full grid
1725
1726 int nvec = 1;
1727 uint64_t rsites = split_grid->lSites();
1728 Coordinate rdims = split_grid->_ldimensions;
1729
1730 for(int d=0;d<ndim;d++){
1731
1732 if ( ratio[d] != 1 ) {
1733
1734 auto M = ratio[d];
1735
1736 int sP = split_grid->_processors[d];
1737 int fP = full_grid->_processors[d];
1738
1739 auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor
1740 auto lsites= rsites/M; // Decreases rsites by M
1741
1742 int fvol = lsites;
1743 int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
1744
1745 {
1746 // Loop over reordered data post A2A
1747 thread_for(c, chunk,{
1748 Coordinate coor(ndim);
1749 for(int m=0;m<M;m++){
1750 for(int s=0;s<sP;s++){
1751
1752 // addressing; use lexico
1753 int lex_r;
1754 uint64_t lex_c = c+chunk*m+chunk*M*s;
1755 uint64_t lex_fvol_vec = c+chunk*s;
1756 uint64_t lex_fvol = lex_fvol_vec%fvol;
1757 uint64_t lex_vec = lex_fvol_vec/fvol;
1758
1759 // which node sets an adder to the coordinate
1760 Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
1761 coor[d] += m*ldims[d];
1762 Lexicographic::IndexFromCoor(coor, lex_r, rdims);
1763 lex_r += lex_vec * rsites;
1764
1765 // LexicoFind coordinate & vector number within split lattice
1766 tmpdata[lex_c] = alldata[lex_r];
1767 }
1768 }
1769 });
1770 }
1771
1772 if ( split_grid->_processors[d] > 1 ) {
1773 split_grid->AllToAll(d,tmpdata,alldata);
1774 tmpdata=alldata;
1775 }
1776 full_grid ->AllToAll(d,tmpdata,alldata);
1777 rdims[d]/= M;
1778 rsites /= M;
1779 nvec *= M; // Increase nvec by subdivision factor
1780 }
1781 }
1782
1783 lsites = full_grid->lSites();
1784 for(int v=0;v<nvector;v++){
1785 thread_for(site, lsites,{
1786 scalardata[site] = alldata[v*lsites+site];
1787 });
1788 vectorizeFromLexOrdArray(scalardata,full[v]);
1789 }
1790}
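////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (editor addition, not part of the original header).
// Split-grid workflow: scatter N = full_nproc/split_nproc fields, one per sub-communicator,
// operate on the split grid, then gather the results back.  The "solve" here is a
// placeholder; all names are hypothetical.
////////////////////////////////////////////////////////////////////////////////////////////
template<class Vobj>
inline void exampleSplitWorkflow(std::vector<Lattice<Vobj> > &sources,
				 std::vector<Lattice<Vobj> > &solutions,
				 Lattice<Vobj> &src_split,
				 Lattice<Vobj> &sol_split)
{
  Grid_split (sources,src_split);      // scatter the N full-grid sources across the sub-grids
  sol_split = src_split;               // placeholder for the real per-sub-grid operation
  Grid_unsplit(solutions,sol_split);   // gather one result per original source
}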
1791
1793// Faster but less accurate blockProject
1795template<class vobj,class CComplex,int nbasis,class VLattice>
1796inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
1797			     const Lattice<vobj> &fineData,
1798 const VLattice &Basis)
1799{
1800 GridBase * fine = fineData.Grid();
1801 GridBase * coarse= coarseData.Grid();
1802
1803 Lattice<iScalar<CComplex> > ip(coarse);
1804
1805 autoView( coarseData_ , coarseData, AcceleratorWrite);
1806 autoView( ip_ , ip, AcceleratorWrite);
1807 RealD t_IP=0;
1808 RealD t_co=0;
1809 for(int v=0;v<nbasis;v++) {
1810 t_IP-=usecond();
1811 blockInnerProductD(ip,Basis[v],fineData);
1812 t_IP+=usecond();
1813 t_co-=usecond();
1814 accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
1815 convertType(coarseData_[sc](v),ip_[sc]);
1816 });
1817 t_co+=usecond();
1818 }
1819}
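
Note: blockProjectFast fills component v of every coarse site with the block inner product of Basis[v] and fineData, one blockInnerProductD call per basis vector. A brief usage sketch under assumed types and sizes follows; the field type, basis size, blocking factor and grid construction are illustrative (a real basis would come from an aggregation/multigrid setup), and the coarse grid must subdivide the fine grid.

    // Hedged sketch: project a fine fermion field onto an nbasis-vector coarse space.
    // Assumes `using namespace Grid;`; types and sizes are illustrative.
    const int nbasis = 32;
    typedef Lattice<iVector<vTComplex,nbasis> > CoarseVector;

    Coordinate simd  = GridDefaultSimd(Nd,vComplex::Nsimd());
    Coordinate latt  = GridDefaultLatt();           // e.g. --grid 16.16.16.16
    Coordinate clatt = latt;
    for(int d=0;d<Nd;d++) clatt[d] /= 4;            // 4^4 fine sites per coarse site

    GridCartesian *FineGrid   = SpaceTimeGrid::makeFourDimGrid(latt ,simd,GridDefaultMpi());
    GridCartesian *CoarseGrid = SpaceTimeGrid::makeFourDimGrid(clatt,simd,GridDefaultMpi());

    std::vector<LatticeFermion> Basis(nbasis,FineGrid); // aggregation basis vectors
    LatticeFermion fine(FineGrid);
    CoarseVector   coarse(CoarseGrid);

    // coarse site b, component v  <-  block inner product of Basis[v] with fine on block b
    blockProjectFast(coarse,fine,Basis);
    // blockProject computes the same projection; per the comment above, blockProjectFast
    // trades some accuracy for speed.
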
1820
1821
1822NAMESPACE_END(Grid);
1823