36template<
class Fobj,
class CComplex,
int nbasis>
39 typedef typename CComplex::scalar_object
SComplex;
68 std::vector<deviceVector<calcMatrix> >
BLAS_A;
70 std::vector<deviceVector<ComplexD *> >
BLAS_AP;
71 std::vector<deviceVector<ComplexD *> >
BLAS_BP;
93 for(
int p=0;p<
geom.npoint;p++){
122 int32_t padded_sites =
Cell.grids.back()->lSites();
123 int32_t unpadded_sites = CoarseGridMulti->
lSites();
126 int32_t orhs = nrhs/CComplex::Nsimd();
128 padded_sites = padded_sites/nrhs;
129 unpadded_sites = unpadded_sites/nrhs;
135 for(
int p=0;p<
geom.npoint;p++){
136 BLAS_A[p].resize (unpadded_sites);
139 BLAS_B.resize(nrhs *padded_sites);
140 BLAS_C.resize(nrhs *unpadded_sites);
143 for(
int p=0;p<
geom.npoint;p++){
144 BLAS_AP[p].resize(unpadded_sites);
145 BLAS_BP[p].resize(unpadded_sites);
147 BLAS_CP.resize(unpadded_sites);
154 for(
int p=0;p<
geom.npoint;p++){
155 for(
int ss=0;ss<unpadded_sites;ss++){
161 for(
int ss=0;ss<unpadded_sites;ss++){
168 for(int32_t s=0;s<padded_sites;s++){
170 for(int32_t point = 0 ; point <
geom.npoint; point++){
171 int i=s*orhs*
geom.npoint+point;
172 if(
Stencil._entries[i]._wrap ) {
178 for(int32_t point = 0 ; point <
geom.npoint; point++){
179 int i=s*orhs*
geom.npoint+point;
180 int32_t nbr =
Stencil._entries[i]._offset*CComplex::Nsimd();
181 assert(nbr<
BLAS_B.size());
188 assert(j==unpadded_sites);
192 typedef typename vobj::scalar_object sobj;
194 typedef typename vobj::vector_type vector_type;
204 for(
int i=0;i<nd;i++) nsite *= LocalLatt[i];
216 const int words=
sizeof(vobj)/
sizeof(vector_type);
220 Lexicographic::CoorFromIndex(
base,idx,LocalLatt);
221 for(
int i=0;i<nd;i++){
222 from_coor[i] =
base[i];
224 int from_oidx = 0;
for(
int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
225 int from_lane = 0;
for(
int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
227 const vector_type* from = (
const vector_type *)&from_v[from_oidx];
231 for(
int w=0;w<words;w++){
232 stmp =
getlane(from[w], from_lane);
239 typedef typename vobj::scalar_object sobj;
241 typedef typename vobj::vector_type vector_type;
247 assert(in.size()==Tg->
lSites());
251 for(
int i=0;i<nd;i++) nsite *= LocalLatt[i];
261 auto from_v = &in[0];
263 const int words=
sizeof(vobj)/
sizeof(vector_type);
267 Lexicographic::CoorFromIndex(
base,idx,LocalLatt);
268 for(
int i=0;i<nd;i++){
269 to_coor[i] =
base[i];
271 int to_oidx = 0;
for(
int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
272 int to_lane = 0;
for(
int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
274 vector_type* to = (vector_type *)&to_v[to_oidx];
278 for(
int w=0;w<words;w++){
289 std::cout <<
GridLogMessage<<
"GeneralCoarsenMatrixMrhs "<< std::endl;
319 Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
320 Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
322 for(
int k=0;k<npoint;k++){
324 for(
int l=0;l<npoint;l++){
326 for(
int mu=0;mu<
Nd;mu++){
334 invMkl = Mkl.inverse();
342 std::vector<FineComplexField> phaF(npoint,grid);
343 std::vector<CoarseComplexField> pha(npoint,
CoarseGrid);
347 typedef typename CComplex::scalar_type
SComplex;
350 for(
int p=0;p<npoint;p++){
356 for(
int mu=0;mu<
Nd;mu++){
359 pha[p] = pha[p] + (TwoPiL *
geom_srhs.shifts[p][mu]) * coor;
361 pha[p] =
exp(pha[p]*ci);
367 std::vector<CoarseMatrix> _A;
370 std::vector<CoarseVector> ComputeProj(npoint,
CoarseGrid);
372 for(
int i=0;i<nbasis;i++){
373 std::cout <<
GridLogMessage<<
"CoarsenMatrixColoured vec "<<i<<
"/"<<nbasis<< std::endl;
374 for(
int p=0;p<npoint;p++){
376 phaV = phaF[p]*Subspace.
subspace[i];
382 linop.Op(phaV,MphaV);
387 coarseInner =
conjugate(pha[p]) * coarseInner;
389 ComputeProj[p] = coarseInner;
393 for(
int k=0;k<npoint;k++){
396 for(
int l=0;l<npoint;l++){
397 FT= FT+ invMkl(l,k)*ComputeProj[l];
404 for(
int j=0;j<nbasis;j++){
405 A_v[sss](i,j) = FT_v[sss](j);
446 std::cout <<
GridLogMessage<<
"GeneralCoarsenMatrixMrhs "<< std::endl;
481 Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
482 Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
484 for(
int k=0;k<npoint;k++){
486 for(
int l=0;l<npoint;l++){
488 for(
int mu=0;mu<
Nd;mu++){
496 invMkl = Mkl.inverse();
504 std::vector<FineComplexField> phaF(npoint,grid);
505 std::vector<CoarseComplexField> pha(npoint,
CoarseGrid);
510 typedef typename CComplex::scalar_type
SComplex;
513 for(
int p=0;p<npoint;p++){
519 for(
int mu=0;mu<
Nd;mu++){
522 pha[p] = pha[p] + (TwoPiL *
geom_srhs.shifts[p][mu]) * coor;
524 pha[p] =
exp(pha[p]*ci);
531 std::vector<CoarseMatrix> _A;
536 std::vector<FineField> _MphaV(batch,grid);
537 std::vector<CoarseVector> TmpProj(batch,
CoarseGrid);
539 std::vector<CoarseVector> ComputeProj(npoint,
CoarseGrid);
541 for(
int i=0;i<nbasis;i++){
542 std::cout <<
GridLogMessage<<
"CoarsenMatrixColoured vec "<<i<<
"/"<<nbasis<< std::endl;
546 for(
int p=0;p<npoint;p+=batch){
548 for(
int b=0;b<
MIN(batch,npoint-p);b++){
550 phaV = phaF[p+b]*Subspace.
subspace[i];
559 linop.Op(phaV,MphaV);
570 for(
int b=0;b<
MIN(batch,npoint-p);b++){
571 ComputeProj[p+b] =
conjugate(pha[p+b])*TmpProj[b];
579 for(
int k=0;k<npoint;k++){
586 for(
int l=0;l<npoint;l++){
587 FT= FT+ invMkl(l,k)*ComputeProj[l];
592 for(ll=0;ll+radix-1<npoint;ll+=radix){
595 + invMkl(ll+0,k)*ComputeProj[ll+0]
596 + invMkl(ll+1,k)*ComputeProj[ll+1]
597 + invMkl(ll+2,k)*ComputeProj[ll+2]
598 + invMkl(ll+3,k)*ComputeProj[ll+3]
599 + invMkl(ll+4,k)*ComputeProj[ll+4]
600 + invMkl(ll+5,k)*ComputeProj[ll+5]
601 + invMkl(ll+6,k)*ComputeProj[ll+6]
602 + invMkl(ll+7,k)*ComputeProj[ll+7]
603 + invMkl(ll+8,k)*ComputeProj[ll+8];
605 for(
int l=ll;l<npoint;l++){
606 FT= FT+ invMkl(l,k)*ComputeProj[l];
615 for(
int j=0;j<nbasis;j++){
616 A_v[sss](i,j) = FT_v[sss](j);
633 std::cout <<
GridLogMessage<<
"CoarsenOperator phase "<<tphase<<
" us"<<std::endl;
634 std::cout <<
GridLogMessage<<
"CoarsenOperator phaseBZ "<<tphaseBZ<<
" us"<<std::endl;
635 std::cout <<
GridLogMessage<<
"CoarsenOperator mat "<<tmat <<
" us"<<std::endl;
636 std::cout <<
GridLogMessage<<
"CoarsenOperator proj "<<tproj<<
" us"<<std::endl;
637 std::cout <<
GridLogMessage<<
"CoarsenOperator inv "<<tinv<<
" us"<<std::endl;
665 int npoint =
geom.npoint;
669 const int Nsimd = CComplex::Nsimd();
678 flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
690 for(
int p=0;p<
geom.npoint;p++){
726 virtual void MdirAll (
const Field &in, std::vector<Field> &out){assert(0);};
void acceleratorPut(T &dev, const T &host)
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
AcceleratorVector< int, MaxDims > Coordinate
accelerator_inline S getlane(const Grid_simd< S, V > &in, int lane)
accelerator_inline void putlane(Grid_simd< S, V > &vec, const S &_S, int lane)
accelerator_inline Grid_simd< S, V > exp(const Grid_simd< S, V > &r)
void LatticeCoordinate(Lattice< iobj > &l, int mu)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
void blockOrthogonalise(Lattice< CComplex > &ip, std::vector< Lattice< vobj > > &Basis)
void blockZAXPY(Lattice< vobj > &fineZ, const Lattice< CComplex > &coarseA, const Lattice< vobj2 > &fineX, const Lattice< vobj > &fineY)
void blockProject(Lattice< iVector< CComplex, nbasis > > &coarseData, const Lattice< vobj > &fineData, const VLattice &Basis)
#define autoView(l_v, l, mode)
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL")
#define NAMESPACE_BEGIN(A)
std::complex< RealD > ComplexD
std::vector< Lattice< Fobj > > subspace
unsigned long _ndimension
std::vector< CoarseMatrix > _A
void gemmBatched(int m, int n, int k, ComplexD alpha, deviceVector< ComplexD * > &Amk, deviceVector< ComplexD * > &Bkn, ComplexD beta, deviceVector< ComplexD * > &Cmn)
const Coordinate & GlobalDimensions(void)
const Coordinate & FullDimensions(void)
const Coordinate & LocalDimensions(void)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
void Mdag(const CoarseVector &in, CoarseVector &out)
std::vector< deviceVector< ComplexD * > > BLAS_BP
Lattice< CComplex > FineComplexField
Lattice< CComplex > CoarseScalar
Lattice< siteVector > CoarseVector
std::vector< deviceVector< calcMatrix > > BLAS_A
void M(const CoarseVector &in, CoarseVector &out)
NonLocalStencilGeometry geom
deviceVector< ComplexD * > BLAS_CP
iVector< CComplex, nbasis > siteVector
Lattice< iScalar< CComplex > > CoarseComplexField
iMatrix< SComplex, nbasis > calcMatrix
void SetMatrix(int p, CoarseMatrix &A)
CComplex::scalar_object SComplex
deviceVector< calcVector > BLAS_C
NonLocalStencilGeometry geom_srhs
Lattice< iMatrix< CComplex, nbasis > > CoarseMatrix
deviceVector< calcVector > BLAS_B
virtual void MdirAll(const Field &in, std::vector< Field > &out)
iMatrix< CComplex, nbasis > Cobj
void BLAStoGrid(Lattice< vobj > &grid, deviceVector< typename vobj::scalar_object > &in)
void GridtoBLAS(const Lattice< vobj > &from, deviceVector< typename vobj::scalar_object > &to)
MultiGeneralCoarsenedMatrix< Fobj, CComplex, nbasis > MultiGeneralCoarseOp
iVector< CComplex, nbasis > Cvec
GridCartesian * _CoarseGridMulti
void CopyMatrix(GeneralCoarseOp &_Op)
std::vector< deviceVector< ComplexD * > > BLAS_AP
virtual void Mdiag(const Field &in, Field &out)
GridCartesian * CoarseGrid(void)
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom, GridCartesian *CoarseGridMulti)
iMatrix< CComplex, nbasis > siteMatrix
Lattice< Fobj > FineField
iVector< SComplex, nbasis > calcVector
void GetMatrix(int p, CoarseMatrix &A)
GeneralLocalStencil Stencil
GeneralCoarsenedMatrix< Fobj, CComplex, nbasis > GeneralCoarseOp
virtual void Mdir(const Field &in, Field &out, int dir, int disp)
void CoarsenOperator(LinearOperatorBase< Lattice< Fobj > > &linop, Aggregation< Fobj, CComplex, nbasis > &Subspace, GridBase *CoarseGrid)
void Allocate(int _nbasis, GridBase *_fgrid, GridBase *_cgrid)
void blockProject(std::vector< Field > &fine, std::vector< Lattice< cobj > > &coarse)
void ImportBasis(std::vector< Field > &vecs)
Lattice< vobj > Extract(const Lattice< vobj > &in) const