Grid 0.7.0
Lattice_reduction.h
Go to the documentation of this file.
1/*************************************************************************************
2 Grid physics library, www.github.com/paboyle/Grid
3 Source file: ./lib/lattice/Lattice_reduction.h
4 Copyright (C) 2015
5Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
6Author: Peter Boyle <paboyle@ph.ed.ac.uk>
7Author: paboyle <paboyle@ph.ed.ac.uk>
8Author: Christoph Lehner <christoph@lhnr.de>
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 See the full license in the file "LICENSE" in the top level distribution directory
21 *************************************************************************************/
22 /* END LEGAL */
23#pragma once
24
26
27
28#if defined(GRID_CUDA)||defined(GRID_HIP)
30#endif
31#if defined(GRID_SYCL)
33#endif
35
37
39// FIXME this should promote to double and accumulate
41template<class vobj>
42inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
43{
44 typedef typename vobj::scalar_object sobj;
45
46 // const int Nsimd = vobj::Nsimd();
47 const int nthread = GridThread::GetThreads();
48
49 std::vector<sobj> sumarray(nthread);
50 for(int i=0;i<nthread;i++){
51 sumarray[i]=Zero();
52 }
53
54 thread_for(thr,nthread, {
55 int nwork, mywork, myoff;
56 nwork = osites;
57 GridThread::GetWork(nwork,thr,mywork,myoff);
58 vobj vvsum=Zero();
59 for(int ss=myoff;ss<mywork+myoff; ss++){
60 vvsum = vvsum + arg[ss];
61 }
62 sumarray[thr]=Reduce(vvsum);
63 });
64
65 sobj ssum=Zero(); // sum across threads
66 for(int i=0;i<nthread;i++){
67 ssum = ssum+sumarray[i];
68 }
69 return ssum;
70}
71template<class vobj>
72inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
73{
74 typedef typename vobj::scalar_objectD sobj;
75
76 const int nthread = GridThread::GetThreads();
77
78 std::vector<sobj> sumarray(nthread);
79 for(int i=0;i<nthread;i++){
80 sumarray[i]=Zero();
81 }
82
83 thread_for(thr,nthread, {
84 int nwork, mywork, myoff;
85 nwork = osites;
86 GridThread::GetWork(nwork,thr,mywork,myoff);
87 vobj vvsum=Zero();
88 for(int ss=myoff;ss<mywork+myoff; ss++){
89 vvsum = vvsum + arg[ss];
90 }
91 sumarray[thr]=Reduce(vvsum);
92 });
93
94 sobj ssum=Zero(); // sum across threads
95 for(int i=0;i<nthread;i++){
96 ssum = ssum+sumarray[i];
97 }
98 return ssum;
99}
100/*
101Threaded max, don't use for now
102template<class Double>
103inline Double max(const Double *arg, Integer osites)
104{
105 // const int Nsimd = vobj::Nsimd();
106 const int nthread = GridThread::GetThreads();
107
108 std::vector<Double> maxarray(nthread);
109
110 thread_for(thr,nthread, {
111 int nwork, mywork, myoff;
112 nwork = osites;
113 GridThread::GetWork(nwork,thr,mywork,myoff);
114 Double max=arg[0];
115 for(int ss=myoff;ss<mywork+myoff; ss++){
116 if( arg[ss] > max ) max = arg[ss];
117 }
118 maxarray[thr]=max;
119 });
120
121 Double tmax=maxarray[0];
122 for(int i=0;i<nthread;i++){
123 if (maxarray[i]>tmax) tmax = maxarray[i];
124 }
125 return tmax;
126}
127*/
// Pointer-based sum: dispatch to the accelerator or host implementation
// depending on the compile-time target.
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sum_gpu(arg,osites);
#else
  return sum_cpu(arg,osites);
#endif
}
// Pointer-based sum returning the double-precision scalar object;
// dispatches to GPU or CPU implementation by compile-time target.
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu(arg,osites);
#else
  return sumD_cpu(arg,osites);
#endif
}
// Large-object variant: sumD_gpu_large handles objects too big for the
// standard GPU reduction path; the CPU path needs no special case.
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu_large(arg,osites);
#else
  return sumD_cpu(arg,osites);
#endif
}
155
// Sum of a lattice over the local MPI rank only (no global reduction).
// Opens an accelerator or CPU view as appropriate for the build target.
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
  Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  return sum_gpu(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  return sum_cpu(&arg_v[0],osites);
#endif
}
168
169template<class vobj>
170inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
171{
172 auto ssum = rankSum(arg);
173 arg.Grid()->GlobalSum(ssum);
174 return ssum;
175}
176
// Rank-local sum for objects too large for the standard GPU reduction;
// falls back to the plain CPU sum when no accelerator target is enabled.
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  return sum_gpu_large(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  return sum_cpu(&arg_v[0],osites);
#endif
}
190
191template<class vobj>
192inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
193{
194 auto ssum = rankSumLarge(arg);
195 arg.Grid()->GlobalSum(ssum);
196 return ssum;
197}
198
200// Deterministic Reduction operations
202template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
203 ComplexD nrm = innerProduct(arg,arg);
204 return real(nrm);
205}
206
207
// norm2 of an unevaluated unary expression template: materialise it first.
template<class Op,class T1>
inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr) ->RealD
{
  return norm2(closure(expr));
}
213
// norm2 of an unevaluated binary expression template: materialise it first.
template<class Op,class T1,class T2>
inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr) ->RealD
{
  return norm2(closure(expr));
}
219
220
// norm2 of an unevaluated trinary expression template: materialise it first.
template<class Op,class T1,class T2,class T3>
inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) ->RealD
{
  return norm2(closure(expr));
}
226
227
228//The global maximum of the site norm2
229template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
230{
231 typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
232 typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
233
234 Lattice<vscalar> inner = localNorm2(arg);
235
236 auto grid = arg.Grid();
237
238 RealD max;
239 for(int l=0;l<grid->lSites();l++){
240 Coordinate coor;
241 scalar val;
242 RealD r;
243 grid->LocalIndexToLocalCoor(l,coor);
244 peekLocalSite(val,inner,coor);
245 r=real(TensorRemove(val));
246 if( (l==0) || (r>max)){
247 max=r;
248 }
249 }
250 grid->GlobalMax(max);
251 return max;
252}
253
254// Double inner product
// Rank-local <left,right>: per-site inner products are formed in a device
// kernel, then reduced in double precision by sumD. No global sum here.
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  typedef typename vobj::vector_typeD vector_type;
  ComplexD nrm;

  GridBase *grid = left.Grid();

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();

  // Might make all code paths go this way.
  // Device scratch buffer: one inner-product value per outer site.
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
  deviceVector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];

  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);

    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
	auto x_l = left_v(ss);
	auto y_l = right_v(ss);
	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
  // This is in single precision and fails some tests
  // sumD accumulates in double to reduce the single-precision error.
  auto anrm = sumD(inner_tmp_v,sites);
  nrm = anrm;
  return nrm;
}
287
288
// MPI-global inner product with flight-recorder reproducibility checks.
// On SYCL builds the left field is first xor-checksummed; the local norm is
// logged and, on mismatch with a recorded run, recomputed once to separate
// transient faults from reproducible differences.
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();

  bool ok;
#ifdef GRID_SYCL
  uint64_t csum=0;
  uint64_t csum2=0;
  {
    // Hack
    // Fast integer xor checksum. Can also be used in comms now.
    autoView(l_v,left,AcceleratorRead);
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
    ok = FlightRecorder::CsumLog(csum);
    if ( !ok ) {
      csum2=svm_xor(base,words);
      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
    } else {
      //      csum2=svm_xor(base,words);
      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
    }
    assert(ok);
  }
#endif
  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
  ok = FlightRecorder::NormLog(real(nrm));
  if ( !ok ) {
    // Recompute once so the error message can report both values.
    ComplexD nrm2 = rankInnerProduct(left,right);
    RealD local2 = real(nrm2);
    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
    assert(ok);
  }
  FlightRecorder::StepLog("Start global sum");
  grid->GlobalSumP2P(nrm);
  //  grid->GlobalSum(nrm);
  FlightRecorder::StepLog("Finished global sum");
  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  return nrm;
}
335
336
338// Fast axpby_norm
339// z = a x + b y
340// return norm z
342template<class sobj,class vobj> strong_inline RealD
344{
345 sobj one(1.0);
346 return axpby_norm_fast(z,a,one,x,y);
347}
348
// z = a*x + b*y; returns norm2(z) computed in the same kernel pass, so the
// fields are streamed through memory only once.
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  z.Checkerboard() = x.Checkerboard();
  conformable(z,x);
  conformable(x,y);

  //  typedef typename vobj::vector_typeD vector_type;
  RealD nrm;

  GridBase *grid = x.Grid();

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();

  // GPU
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
  // Device scratch: one per-site inner product, reduced after the kernel.
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  deviceVector<inner_t> inner_tmp;
  inner_tmp.resize(sites);
  auto inner_tmp_v = &inner_tmp[0];

  // Fused kernel: form tmp = a*x + b*y, record |tmp|^2 per site, store tmp.
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  bool ok;
#ifdef GRID_SYCL
  uint64_t csum=0;
  uint64_t csum2=0;
  // Flight-recorder checksums of both kernel outputs (reproducibility check).
  {
    // z_v
    {
      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
      uint64_t *base= (uint64_t *)&z_v[0];
      csum=svm_xor(base,words);
      ok = FlightRecorder::CsumLog(csum);
      if ( !ok ) {
	csum2=svm_xor(base,words);
	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
      }
      assert(ok);
    }
    // inner_v
    {
      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
      csum=svm_xor(base,words);
      ok = FlightRecorder::CsumLog(csum);
      if ( !ok ) {
	csum2=svm_xor(base,words);
	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
      }
      assert(ok);
    }
  }
#endif
  // Double-precision reduction of the per-site norms, then MPI global sum.
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
  ok = FlightRecorder::NormLog(real(nrm));
  assert(ok);
  RealD local = real(nrm);
  grid->GlobalSum(nrm);
  return nrm;
}
418
// Computes ip = <left,right> and nrm = norm2(left) in one pass over the
// fields; both results are globally summed with a single GlobalSumVector.
template<class vobj> strong_inline void
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  conformable(left,right);

  typedef typename vobj::vector_typeD vector_type;
  std::vector<ComplexD> tmp(2);

  GridBase *grid = left.Grid();

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();

  // GPU
  // Device scratch: per-site inner product and per-site self inner product.
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
  deviceVector<inner_t> inner_tmp(sites);
  deviceVector<norm_t> norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  {
    autoView(left_v,left, AcceleratorRead);
    autoView(right_v,right,AcceleratorRead);
    // nsimd argument of 1: one work item per outer site (no lane splitting).
    accelerator_for( ss, sites, 1,{
	auto left_tmp = left_v[ss];
	inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
	norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
    });
  }

  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));

  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
  ip = tmp[0];
  nrm = real(tmp[1]);
}
456
// sum of an unevaluated unary expression template: materialise it first.
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{
  return sum(closure(expr));
}
463
// sum of an unevaluated binary expression template: materialise it first.
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{
  return sum(closure(expr));
}
470
471
472template<class Op,class T1,class T2,class T3>
474 ->typename decltype(expr.op.func(eval(0,expr.arg1),
475 eval(0,expr.arg2),
476 eval(0,expr.arg3)
477 ))::scalar_object
478{
479 return sum(closure(expr));
480}
481
483// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
485
// Sums Data over all slices orthogonal to orthogdim. On return, result has
// one entry per global coordinate along orthogdim and is identical on every
// node (global sum at the end).
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
					  std::vector<typename vobj::scalar_object> &result,
					  int orthogdim)
{
  // FIXME precision promoted summation
  // may be important for correlation functions
  // But easily avoided by using double precision fields
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_object::scalar_type scalar_type;
  GridBase *grid = Data.Grid();
  assert(grid!=NULL);

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();

  assert(orthogdim >= 0);
  assert(orthogdim < Nd);

  // fd: full (global) extent; ld: local extent; rd: reduced (per-SIMD) extent.
  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  std::vector<vobj> lvSum(rd); // will locally sum vectors first
  std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
  ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD

  result.resize(fd); // And then global sum to return the same vector to every node
  for(int r=0;r<rd;r++){
    lvSum[r]=Zero();
  }

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  int ostride=grid->_ostride[orthogdim];

  //Reduce Data down to lvSum
  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);

  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);

  for(int rt=0;rt<rd;rt++){

    extract(lvSum[rt],extracted);

    for(int idx=0;idx<Nsimd;idx++){

      grid->iCoorFromIindex(icoor,idx);

      // Map (reduced coord, SIMD lane) back to the local coordinate.
      int ldx =rt+icoor[orthogdim]*rd;

      lsSum[ldx]=lsSum[ldx]+extracted[idx];

    }
  }

  // sum over nodes.
  for(int t=0;t<fd;t++){
    int pt = t/ld; // processor plane
    int lt = t%ld;
    // Only the rank owning this plane contributes; others contribute zero.
    if ( pt == grid->_processor_coor[orthogdim] ) {
      result[t]=lsSum[lt];
    } else {
      result[t]=Zero();
    }

  }
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;

}
562template<class vobj> inline
563std::vector<typename vobj::scalar_object>
564sliceSum(const Lattice<vobj> &Data,int orthogdim)
565{
566 std::vector<typename vobj::scalar_object> result;
567 sliceSum(Data,result,orthogdim);
568 return result;
569}
570
571/*
572Reimplement
573
5741)
575template<class vobj>
576static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
577
5782)
579template<class vobj>
580static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
581
5823)
583-- Make Slice Mul Matrix call sliceMaddMatrix
584 */
585template<class vobj>
586static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
587{
588 typedef typename vobj::vector_type vector_type;
589 typedef typename vobj::scalar_type scalar_type;
590 GridBase *grid = lhs.Grid();
591 assert(grid!=NULL);
592 conformable(grid,rhs.Grid());
593
594 const int Nd = grid->_ndimension;
595 const int Nsimd = grid->Nsimd();
596
597 assert(orthogdim >= 0);
598 assert(orthogdim < Nd);
599
600 int fd=grid->_fdimensions[orthogdim];
601 int ld=grid->_ldimensions[orthogdim];
602 int rd=grid->_rdimensions[orthogdim];
603
604 std::vector<vector_type> lvSum(rd); // will locally sum vectors first
605 std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
606 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
607
608 result.resize(fd); // And then global sum to return the same vector to every node for IO to file
609 for(int r=0;r<rd;r++){
610 lvSum[r]=Zero();
611 }
612
613 int e1= grid->_slice_nblock[orthogdim];
614 int e2= grid->_slice_block [orthogdim];
615 int stride=grid->_slice_stride[orthogdim];
616
617 autoView( lhv, lhs, CpuRead);
618 autoView( rhv, rhs, CpuRead);
619 thread_for( r,rd,{
620
621 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
622
623 for(int n=0;n<e1;n++){
624 for(int b=0;b<e2;b++){
625 int ss= so+n*stride+b;
626 vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss]));
627 lvSum[r]=lvSum[r]+vv;
628 }
629 }
630 });
631
632 // Sum across simd lanes in the plane, breaking out orthog dir.
633 Coordinate icoor(Nd);
634 for(int rt=0;rt<rd;rt++){
635
637 temp._internal = lvSum[rt];
638 extract(temp,extracted);
639
640 for(int idx=0;idx<Nsimd;idx++){
641
642 grid->iCoorFromIindex(icoor,idx);
643
644 int ldx =rt+icoor[orthogdim]*rd;
645
646 lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
647
648 }
649 }
650
651 // sum over nodes.
652 scalar_type gsum;
653 for(int t=0;t<fd;t++){
654 int pt = t/ld; // processor plane
655 int lt = t%ld;
656 if ( pt == grid->_processor_coor[orthogdim] ) {
657 gsum=lsSum[lt];
658 } else {
659 gsum=scalar_type(0.0);
660 }
661
662 grid->GlobalSum(gsum);
663
664 result[t]=gsum;
665 }
666}
667template<class vobj>
668static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
669{
670 typedef typename vobj::scalar_object sobj;
671 typedef typename vobj::scalar_type scalar_type;
672 typedef typename vobj::vector_type vector_type;
673
674 int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
675 std::vector<ComplexD> ip(Nblock);
676 sn.resize(Nblock);
677
678 sliceInnerProductVector(ip,rhs,rhs,Orthog);
679 for(int ss=0;ss<Nblock;ss++){
680 sn[ss] = real(ip[ss]);
681 }
682};
683
684
// R = a[slice]*X + Y, where the scalar coefficient varies per slice along
// orthogdim (each multiplied by scale).
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
			    int orthogdim,RealD scale=1.0)
{
  // perhaps easier to just promote A to a field and use regular madd
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::tensor_reduced tensor_reduced;

  scalar_type zscale(scale);

  GridBase *grid = X.Grid();

  int Nsimd =grid->Nsimd();
  int Nblock =grid->GlobalDimensions()[orthogdim];

  int fd =grid->_fdimensions[orthogdim];
  int ld =grid->_ldimensions[orthogdim];
  int rd =grid->_rdimensions[orthogdim];

  int e1 =grid->_slice_nblock[orthogdim];
  int e2 =grid->_slice_block [orthogdim];
  int stride =grid->_slice_stride[orthogdim];

  Coordinate icoor;
  for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    vector_type av;

    // Build a SIMD vector whose lane l carries the coefficient for the
    // local slice that lane l of this reduced coordinate belongs to.
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
      av.putlane(scalar_type(a[ldx])*zscale,l);
    }

    tensor_reduced at; at=av;

    autoView( Rv, R, CpuWrite);
    autoView( Xv, X, CpuRead);
    autoView( Yv, Y, CpuRead);
    thread_for2d( n, e1, b,e2, {
      int ss= so+n*stride+b;
      Rv[ss] = at*Xv[ss]+Yv[ss];
    });
  }
};
734
735inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
736{
737 int NN = BlockSolverGrid->_ndimension;
738 int nsimd = BlockSolverGrid->Nsimd();
739
740 std::vector<int> latt_phys(NN-1);
741 Coordinate simd_phys;
742 std::vector<int> mpi_phys(NN-1);
743 Coordinate checker_dim_mask(NN-1);
744 int checker_dim=-1;
745
746 int dd;
747 for(int d=0;d<NN;d++){
748 if( d!=Orthog ) {
749 latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
750 mpi_phys[dd] =BlockSolverGrid->_processors[d];
751 checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
752 if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
753 dd++;
754 }
755 }
756 simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
757 GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
758 if(BlockSolverGrid->_isCheckerBoarded) {
759 GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
760 delete tmp;
761 return (GridBase *) ret;
762 } else {
763 return (GridBase *) tmp;
764 }
765}
766
// R[i] = Y[i] + scale * sum_j X[j]*aa(j,i), slice by slice along Orthog.
// Works via ExtractSlice/InsertSlice on a temporary sub-grid; the copy
// through RR makes R and X/Y aliasing safe.
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  GridBase *FullGrid = X.Grid();
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);

  Lattice<vobj> Ys(SliceGrid);
  Lattice<vobj> Rs(SliceGrid);
  Lattice<vobj> Xs(SliceGrid);
  Lattice<vobj> RR(FullGrid);

  RR = R; // Copies checkerboard for insert

  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
  int Nslice = X.Grid()->GlobalDimensions()[Orthog];
  for(int i=0;i<Nslice;i++){
    ExtractSlice(Ys,Y,i,Orthog);
    ExtractSlice(Rs,R,i,Orthog);
    Rs=Ys;
    for(int j=0;j<Nslice;j++){
      ExtractSlice(Xs,X,j,Orthog);
      Rs = Rs + Xs*(scale*aa(j,i));
    }
    InsertSlice(Rs,RR,i,Orthog);
  }
  R=RR; // Copy back handles arguments aliasing case
  delete SliceGrid;
};
796
// R = scale * X * aa: implemented by zeroing R and calling sliceMaddMatrix
// with Y aliased to R (aliasing is handled inside sliceMaddMatrix).
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  R=Zero();
  sliceMaddMatrix(R,aa,X,R,Orthog,scale);
};
803
804
// mat(s,ss) = <lhs slice s, rhs slice ss> for all slice pairs along Orthog.
// O(Nslice^2) slice extractions + inner products; each innerProduct already
// includes a global sum, so mat is the same on every node.
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);

  Lattice<vobj> ls(SliceGrid);
  Lattice<vobj> rs(SliceGrid);

  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
  int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
  mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
  for(int s=0;s<Nslice;s++){
    ExtractSlice(ls,lhs,s,Orthog);
    for(int ss=0;ss<Nslice;ss++){
      ExtractSlice(rs,rhs,ss,Orthog);
      mat(s,ss) = innerProduct(ls,rs);
    }
  }
  delete SliceGrid;
}
826
828
829
830
831
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
#define one
Definition BGQQPX.h:108
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
const Coordinate GridDefaultSimd(int dims, int nsimd)
Definition Init.cc:109
auto closure(const LatticeUnaryExpression< Op, T1 > &expr) -> Lattice< typename std::remove_const< decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
Definition Lattice_ET.h:492
accelerator_inline sobj eval(const uint64_t ss, const sobj &arg)
Definition Lattice_ET.h:103
void conformable(const Lattice< obj1 > &lhs, const Lattice< obj2 > &rhs)
auto localNorm2(const Lattice< vobj > &rhs) -> Lattice< typename vobj::tensor_reduced >
void peekLocalSite(sobj &s, const LatticeView< vobj > &l, Coordinate &site)
Lattice< vobj > real(const Lattice< vobj > &lhs)
vobj::scalar_object rankSum(const Lattice< vobj > &arg)
ComplexD innerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
RealD maxLocalNorm2(const Lattice< vobj > &arg)
vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
strong_inline void innerProductNorm(ComplexD &ip, RealD &nrm, const Lattice< vobj > &left, const Lattice< vobj > &right)
vobj::scalar_object sum(const vobj *arg, Integer osites)
static void sliceMaddVector(Lattice< vobj > &R, std::vector< RealD > &a, const Lattice< vobj > &X, const Lattice< vobj > &Y, int orthogdim, RealD scale=1.0)
GridBase * makeSubSliceGrid(const GridBase *BlockSolverGrid, int Orthog)
ComplexD rankInnerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
vobj::scalar_object sum_large(const Lattice< vobj > &arg)
static void sliceMulMatrix(Lattice< vobj > &R, Eigen::MatrixXcd &aa, const Lattice< vobj > &X, int Orthog, RealD scale=1.0)
static void sliceNorm(std::vector< RealD > &sn, const Lattice< vobj > &rhs, int Orthog)
vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
static void sliceMaddMatrix(Lattice< vobj > &R, Eigen::MatrixXcd &aa, const Lattice< vobj > &X, const Lattice< vobj > &Y, int Orthog, RealD scale=1.0)
static void sliceInnerProductMatrix(Eigen::MatrixXcd &mat, const Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int Orthog)
vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
strong_inline RealD axpby_norm_fast(Lattice< vobj > &z, sobj a, sobj b, const Lattice< vobj > &x, const Lattice< vobj > &y)
strong_inline RealD axpy_norm_fast(Lattice< vobj > &z, sobj a, const Lattice< vobj > &x, const Lattice< vobj > &y)
vobj::scalar_object rankSumLarge(const Lattice< vobj > &arg)
static void sliceInnerProductVector(std::vector< ComplexD > &result, const Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int orthogdim)
void sliceSum(const Lattice< vobj > &Data, std::vector< typename vobj::scalar_object > &result, int orthogdim)
vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
RealD norm2(const Lattice< vobj > &arg)
vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
Word svm_xor(Word *vec, uint64_t L)
void sliceSumReduction(const Lattice< vobj > &Data, std::vector< vobj > &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
void InsertSlice(const Lattice< vobj > &lowDim, Lattice< vobj > &higherDim, int slice, int orthog)
void ExtractSlice(Lattice< vobj > &lowDim, const Lattice< vobj > &higherDim, int slice, int orthog)
#define autoView(l_v, l, mode)
@ AcceleratorRead
@ CpuRead
@ AcceleratorWrite
@ CpuWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static constexpr int Nd
Definition QCD.h:52
uint32_t Integer
Definition Simd.h:58
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
accelerator_inline ComplexD Reduce(const ComplexD &r)
Definition Simd.h:129
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline std::enable_if<!isGridTensor< T >::value, T >::type TensorRemove(T arg)
AcceleratorVector< __T,GRID_MAX_SIMD > ExtractBuffer
accelerator void extract(const vobj &vec, ExtractBuffer< sobj > &extracted)
accelerator_inline ComplexD innerProductD(const ComplexF &l, const ComplexF &r)
#define thread_for(i, num,...)
Definition Threads.h:60
#define thread_for2d(i1, n1, i2, n2,...)
Definition Threads.h:61
#define strong_inline
Definition Threads.h:36
uint64_t base
void GlobalSumVector(RealF *, int N)
static bool StepLog(const char *name)
static void ReductionLog(double lcl, double glbl)
static bool CsumLog(uint64_t csum)
static int LoggingMode
static bool NormLog(double value)
const Coordinate & GlobalDimensions(void)
Coordinate _slice_stride
Coordinate _fdimensions
Coordinate _slice_nblock
Coordinate _slice_block
Coordinate _checker_dim_mask
bool _isCheckerBoarded
int oSites(void) const
Coordinate _rdimensions
Coordinate _ostride
Coordinate _ldimensions
void iCoorFromIindex(Coordinate &coor, int lane)
int Nsimd(void) const
static int GetThreads(void)
static void GetWork(int nwork, int me, int &mywork, int &myoff)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
Definition Simd.h:194
vtype _internal