39 for(
int d=0;d<_ndimension;d++){
59 full.Grid()->oCoorFromOindex(coor,ss);
60 cbos=half.Grid()->CheckerBoard(coor);
63 int ssh=half.Grid()->oIndex(coor);
64 half_v[ssh] = full_v[ss];
78 full.Grid()->oCoorFromOindex(coor,ss);
79 cbos=half.Grid()->CheckerBoard(coor);
82 int ssh=half.Grid()->oIndex(coor);
83 full_v[ss]=half_v[ssh];
104 Lexicographic::CoorFromIndex(coor,ss,rdim_full);
105 assert(coor.size()==ndim_half);
107 for(int d=0;d<ndim_half;d++){
108 if(checker_dim_mask_half[d]) linear += coor[d];
114 for(
int d=0;d<ndim_half;d++) {
115 if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
116 else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
138 Lexicographic::CoorFromIndex(coor,ss,rdim_full);
139 assert(coor.size()==ndim_half);
141 for(int d=0;d<ndim_half;d++){
142 if(checker_dim_mask_half[d]) linear += coor[d];
148 for(
int d=0;d<ndim_half;d++){
149 if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
150 else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
204template<
typename T1,
typename T2>
209template<
typename T1,
typename T2>
214template<
typename T1,
typename T2>
219template<
typename T1,
typename T2,
int N>
221 for (
int i=0;i<N;i++)
222 for (
int j=0;j<N;j++)
226template<
typename T1,
typename T2,
int N>
228 for (
int i=0;i<N;i++)
232template<
typename T1,
typename T2>
237 convertType(out_v[ss],in_v(ss));
257 convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
266template<
class vobj,
class CComplex,
int nbasis,
class VLattice>
269 const VLattice &Basis)
272 GridBase * coarse= coarseData.Grid();
282 for(
int v=0;v<nbasis;v++) {
288 convertType(coarseData_[sc](v),ip_[sc]);
296 blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
306template<
class vobj,
class CComplex,
int nbasis,
class VLattice>
309 const VLattice &Basis)
311 int NBatch = fineData.size();
312 assert(coarseData.size() == NBatch);
314 GridBase * fine = fineData[0].Grid();
315 GridBase * coarse= coarseData[0].Grid();
318 std::vector<Lattice<vobj>> fineDataCopy = fineData;
321 for(
int v=0;v<nbasis;v++) {
322 for (
int k=0; k<NBatch; k++) {
326 convertType(coarseData_[sc](v),ip_[sc]);
332 blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
337template<
class vobj,
class vobj2,
class CComplex>
357 for(
int d=0 ; d<_ndimension;d++){
372 Coordinate coor_c(_ndimension);
373 Coordinate coor_f(_ndimension);
375 Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
376 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
377 Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
381 typename vobj2::tensor_reduced::scalar_object cA;
382 typename vobj::scalar_object cAx;
384 typename vobj2::tensor_reduced cA;
387 convertType(cA,TensorRemove(coarseA_(sc)));
388 auto prod = cA*fineX_(sf);
389 convertType(cAx,prod);
390 coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
397template<
class vobj,
class CComplex>
424 convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
431template<
class vobj,
class CComplex>
451 CoarseInner_[ss] = coarse_inner_[ss];
456template<
class vobj,
class CComplex>
470 const int maxsubsec=256;
482 for(
int d=0 ; d<_ndimension;d++){
492 auto coarseData_p = &coarseData_[0];
493 auto fineData_p = &fineData_[0];
502 int subsec=maxsubsec;
504 subvol=blockVol/subsec;
505 while(subvol*subsec!=blockVol){
507 subvol=blockVol/subsec;
512 auto coarseTmp_p= &coarseTmp_[0];
521 Coordinate coor_c(_ndimension);
522 Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);
524 auto cd = coalescedRead(zz);
525 for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
527 Coordinate coor_b(_ndimension);
528 Coordinate coor_f(_ndimension);
529 Lexicographic::CoorFromIndex(coor_b,sb,block_r);
530 for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
531 Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
533 cd=cd+coalescedRead(fineData_p[sf]);
541 auto cd = coalescedRead(coarseTmp_p[sc](0));
542 for(int e=1;e<subsec;e++){
543 cd=cd+coalescedRead(coarseTmp_p[sc](e));
566 int lo = (coor[d])*block;
567 int hi = (coor[d]+1)*block;
568 picked = where( (fcoor<hi) , picked, zz);
569 picked = where( (fcoor>=lo), picked, zz);
573template<
class CComplex,
class VLattice>
579 int nbasis = Basis.size() ;
583 for(
int i=0;i<nbasis;i++){
587 for(
int v=0;v<nbasis;v++) {
588 for(
int u=0;u<v;u++) {
598template<
class vobj,
class CComplex>
604#ifdef GRID_ACCELERATED
606template<
class vobj,
class CComplex,
int nbasis>
612 GridBase * coarse= coarseData.Grid();
616 assert( nbasis == Basis.size() );
618 for(
int i=0;i<nbasis;i++){
624 for(
int d=0 ; d<_ndimension;d++){
631 std::vector<Vview> AcceleratorVecViewContainer_h;
632 for(
int v=0;v<nbasis;v++) {
633 AcceleratorVecViewContainer_h.push_back(Basis[v].View(
AcceleratorRead));
635 static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
636 acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *
sizeof(Vview));
637 auto Basis_p = &AcceleratorVecViewContainer[0];
643 Coordinate coor_c(_ndimension);
644 Coordinate coor_f(_ndimension);
646 Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
647 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
648 Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
650 auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
651 for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
652 coalescedWrite(fineData_[sf],sum);
654 for(
int v=0;v<nbasis;v++) {
655 AcceleratorVecViewContainer_h[v].ViewClose();
661template<
class vobj,
class CComplex,
int nbasis,
class VLattice>
664 const VLattice &Basis)
667 GridBase * coarse= coarseData.Grid();
669 for(
int i=0;i<nbasis;i++) {
684template<
class vobj,
class CComplex,
int nbasis,
class VLattice>
687 const VLattice &Basis)
689 int NBatch = coarseData.size();
690 assert(fineData.size() == NBatch);
693 GridBase * coarse = coarseData[0].Grid();
694 for (
int k=0; k<NBatch; k++)
696 for (
int i=0;i<nbasis;i++) {
697 for (
int k=0; k<NBatch; k++) {
699 blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
706template<
class vobj,
class vvobj>
709 typedef typename vobj::scalar_object sobj;
710 typedef typename vvobj::scalar_object ssobj;
720 for(
int d=0;d<no;d++){
732 Coordinate lcoor(ni);
733 ig->LocalIndexToLocalCoor(idx,lcoor);
734 peekLocalSite(s,in_v,lcoor);
736 pokeLocalSite(ss,out_v,lcoor);
743 typedef typename vobj::scalar_object sobj;
745 typedef typename vobj::vector_type vector_type;
747 const int words=
sizeof(vobj)/
sizeof(vector_type);
757 int Nsimd = Fg->
Nsimd();
763 for(
int d=0;d<nd;d++){
778 for(
int i=0;i<nd;i++) nsite *= RegionSize[i];
780 typedef typename vobj::vector_type vector_type;
789 Lexicographic::CoorFromIndex(
base,idx,RegionSize);
790 for(
int i=0;i<nd;i++){
791 from_coor[i] =
base[i] + FromLowerLeft[i];
792 to_coor[i] =
base[i] + ToLowerLeft[i];
794 int from_oidx = 0;
for(
int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
795 int from_lane = 0;
for(
int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
796 int to_oidx = 0;
for(
int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
797 int to_lane = 0;
for(
int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
799 const vector_type* from = (
const vector_type *)&from_v[from_oidx];
800 vector_type* to = (vector_type *)&to_v[to_oidx];
803 for(
int w=0;w<words;w++){
804 stmp =
getlane(from[w], from_lane);
813 typedef typename vobj::scalar_object sobj;
815 typedef typename vobj::vector_type vector_type;
817 const int words=
sizeof(vobj)/
sizeof(vector_type);
826 int Nsimd = Fg->
Nsimd();
842 for(
int i=0;i<nF;i++) nsite *= RegionSize[i];
844 typedef typename vobj::vector_type vector_type;
853 Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
855 for(
int i=0;i<nT;i++){
857 to_coor[i] = from_coor[j];
863 int from_oidx = 0;
for(
int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
864 int from_lane = 0;
for(
int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
865 int to_oidx = 0;
for(
int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
866 int to_lane = 0;
for(
int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
868 const vector_type* from = (
const vector_type *)&from_v[from_oidx];
869 vector_type* to = (vector_type *)&to_v[to_oidx];
872 for(
int w=0;w<words;w++){
873 stmp =
getlane(from[w], from_lane);
882 typedef typename vobj::scalar_object sobj;
884 typedef typename vobj::vector_type vector_type;
886 const int words=
sizeof(vobj)/
sizeof(vector_type);
895 int Nsimd = Fg->
Nsimd();
911 for(
int i=0;i<nT;i++) nsite *= RegionSize[i];
913 typedef typename vobj::vector_type vector_type;
922 Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
924 for(
int i=0;i<nF;i++){
926 from_coor[i] = to_coor[j];
929 from_coor[i] = slice;
932 int from_oidx = 0;
for(
int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
933 int from_lane = 0;
for(
int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
934 int to_oidx = 0;
for(
int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
935 int to_lane = 0;
for(
int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
937 const vector_type* from = (
const vector_type *)&from_v[from_oidx];
938 vector_type* to = (vector_type *)&to_v[to_oidx];
941 for(
int w=0;w<words;w++){
942 stmp =
getlane(from[w], from_lane);
951 typedef typename vobj::scalar_object sobj;
964 for(
int d=0;d<nh;d++){
977 Coordinate lcoor(nl);
978 Coordinate hcoor(nh);
979 lg->LocalIndexToLocalCoor(idx,lcoor);
981 hcoor[orthog] = slice;
982 for(int d=0;d<nh;d++){
985 if ( hg->_checker_dim == d ) {
987 lcoor[ddl]=lcoor[ddl]*2;
1001 typedef typename vobj::scalar_object sobj;
1015 for(
int d=0;d<nh;d++){
1027 Coordinate lcoor(nl);
1028 Coordinate hcoor(nh);
1029 lg->LocalIndexToLocalCoor(idx,lcoor);
1030 hcoor[orthog] = slice;
1032 for(int d=0;d<nh;d++){
1034 hcoor[d]=lcoor[ddl];
1035 if ( hg->_checker_dim == d ) {
1036 hcoor[d]=hcoor[d]*2;
1037 lcoor[ddl]=lcoor[ddl]*2;
1052 typedef typename vobj::scalar_object sobj;
1063 for(
int d=0;d<nh;d++){
1071 Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
1072 Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
1087 typedef typename vobj::scalar_object sobj;
1106 for(int64_t g=0;g<fg->
gSites();g++){
1109 for(
int d=0;d<nd;d++){
1121template<
typename vobj,
typename sobj>
1126 typedef typename vobj::vector_type vtype;
1129 out.resize(in_grid->
lSites());
1131 int ndim = in_grid->
Nd();
1132 int in_nsimd = vtype::Nsimd();
1134 std::vector<Coordinate > in_icoor(in_nsimd);
1136 for(
int lane=0; lane < in_nsimd; lane++){
1137 in_icoor[lane].resize(ndim);
1145 ExtractPointerArray<sobj> out_ptrs(in_nsimd);
1147 Coordinate in_ocoor(ndim);
1148 in_grid->oCoorFromOindex(in_ocoor, in_oidx);
1150 Coordinate lcoor(in_grid->Nd());
1152 for(int lane=0; lane < in_nsimd; lane++){
1154 for(int mu=0;mu<ndim;mu++){
1155 lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
1159 Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
1160 assert(lex < out.size());
1161 out_ptrs[lane] = &out[lex];
1165 const vobj & in_vobj = in_v[in_oidx];
1166 extract(in_vobj, out_ptrs, 0);
1170template<
typename vobj,
typename sobj>
1175 typedef typename vobj::vector_type vtype;
1178 out.resize(in_grid->
lSites());
1180 int ndim = in_grid->
Nd();
1181 int in_nsimd = vtype::Nsimd();
1183 std::vector<Coordinate > in_icoor(in_nsimd);
1185 for(
int lane=0; lane < in_nsimd; lane++){
1186 in_icoor[lane].resize(ndim);
1192 std::vector<sobj*> out_ptrs(in_nsimd);
1194 Coordinate in_ocoor(ndim);
1195 in_grid->oCoorFromOindex(in_ocoor, in_oidx);
1197 Coordinate lcoor(in_grid->Nd());
1199 for(int lane=0; lane < in_nsimd; lane++){
1200 for(int mu=0;mu<ndim;mu++)
1201 lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
1204 Lexicographic::IndexFromCoorReversed(lcoor, lex, in_grid->_ldimensions);
1205 out_ptrs[lane] = &out[lex];
1209 const vobj & in_vobj = in.
_odata[in_oidx];
1210 extract1(in_vobj, out_ptrs, 0);
1215template<
typename vobj,
typename sobj>
1216typename std::enable_if<isSIMDvectorized<vobj>::value
1221 typedef typename vobj::vector_type vtype;
1224 assert(in.size()==grid->
lSites());
1226 const int ndim = grid->
Nd();
1227 constexpr int nsimd = vtype::Nsimd();
1229 std::vector<Coordinate > icoor(nsimd);
1231 for(
int lane=0; lane < nsimd; lane++){
1232 icoor[lane].resize(ndim);
1238 ExtractPointerArray<sobj> ptrs(nsimd);
1240 Coordinate ocoor(ndim);
1241 Coordinate lcoor(ndim);
1242 grid->oCoorFromOindex(ocoor, oidx);
1244 for(int lane=0; lane < nsimd; lane++){
1246 for(int mu=0;mu<ndim;mu++){
1247 lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
1251 Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
1252 ptrs[lane] = &in[lex];
1257 merge(vecobj, ptrs, 0);
1258 out_v[oidx] = vecobj;
1262template<
typename vobj,
typename sobj>
1263typename std::enable_if<isSIMDvectorized<vobj>::value
1268 typedef typename vobj::vector_type vtype;
1271 assert(in.size()==grid->
lSites());
1273 int ndim = grid->
Nd();
1274 int nsimd = vtype::Nsimd();
1276 std::vector<Coordinate > icoor(nsimd);
1278 for(
int lane=0; lane < nsimd; lane++){
1279 icoor[lane].resize(ndim);
1285 std::vector<sobj*> ptrs(nsimd);
1287 Coordinate ocoor(ndim);
1288 grid->oCoorFromOindex(ocoor, oidx);
1290 Coordinate lcoor(grid->Nd());
1292 for(int lane=0; lane < nsimd; lane++){
1294 for(int mu=0;mu<ndim;mu++){
1295 lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
1299 Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
1300 ptrs[lane] = &in[lex];
1305 merge1(vecobj, ptrs, 0);
1306 out.
_odata[oidx] = vecobj;
1311template<
class VobjOut,
class VobjIn>
1314 typedef typename VobjOut::vector_type Vout;
1315 typedef typename VobjIn::vector_type Vin;
1316 const int N =
sizeof(VobjOut)/
sizeof(Vout);
1323 Vout *vout = (Vout *)&out_v[idx];
1324 Vin *vin = (Vin *)&in_v[idx];
1325 precisionChange(vout,vin,N);
1329template<
class VobjOut,
class VobjIn>
1333 for(
int d=0;d<out.
Grid()->
Nd();d++){
1340 typedef typename VobjOut::scalar_object SobjOut;
1341 typedef typename VobjIn::scalar_object SobjIn;
1343 int ndim = out.
Grid()->
Nd();
1344 int out_nsimd = out_grid->
Nsimd();
1345 int in_nsimd = in_grid->
Nsimd();
1346 std::vector<Coordinate > out_icoor(out_nsimd);
1348 for(
int lane=0; lane < out_nsimd; lane++){
1349 out_icoor[lane].resize(ndim);
1353 std::vector<SobjOut> in_slex_conv(in_grid->
lSites());
1358 Coordinate out_ocoor(ndim);
1359 out_grid->oCoorFromOindex(out_ocoor, out_oidx);
1361 ExtractPointerArray<SobjOut> ptrs(out_nsimd);
1363 Coordinate lcoor(out_grid->Nd());
1365 for(int lane=0; lane < out_nsimd; lane++){
1366 for(int mu=0;mu<ndim;mu++)
1367 lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
1369 int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
1370 ptrs[lane] = &in_slex_conv[llex];
1372 merge(out_v[out_oidx], ptrs, 0);
1385 assert(out_grid->
Nd() == in_grid->
Nd());
1386 for(
int d=0;d<out_grid->
Nd();d++){
1389 int Nsimd_out = out_grid->
Nsimd();
1391 std::vector<Coordinate> out_icorrs(out_grid->
Nsimd());
1392 for(
int lane=0; lane < out_grid->
Nsimd(); lane++)
1395 std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->
lSites());
1397 Coordinate out_ocorr;
1398 out_grid->oCoorFromOindex(out_ocorr, out_oidx);
1401 for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
1402 out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
1407 int in_oidx = 0, in_lane = 0;
1408 for(int d=0;d<in_grid->_ndimension;d++){
1409 in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
1410 in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
1412 fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
1417 size_t fmap_bytes = out_grid->lSites() *
sizeof(std::pair<Integer,Integer>);
1443template<
class VobjOut,
class VobjIn>
1445 if(out.Grid() == in.Grid()){
1452template<
class VobjOut,
class VobjIn>
1460template<
class VobjOut,
class VobjIn>
1464 static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1,
"precisionChange: tensor types must be the same" );
1467 constexpr int Nsimd_out = VobjOut::Nsimd();
1470 std::pair<Integer,Integer>
const* fmap_device = workspace.
getMap();
1477 std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
1478 for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
1479 int in_oidx = fmap_osite[out_lane].first;
1480 int in_lane = fmap_osite[out_lane].second;
1481 copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
1488template<
class VobjOut,
class VobjIn>
1548 typedef typename Vobj::scalar_object Sobj;
1550 int full_vecs = full.size();
1552 assert(full_vecs>=1);
1554 GridBase * full_grid = full[0].Grid();
1564 int cb = full[0].Checkerboard();
1571 for(
int n=0;n<full_vecs;n++){
1572 assert(full[n].Checkerboard() == cb);
1573 for(
int d=0;d<ndim;d++){
1579 int nvector =full_nproc/split_nproc;
1580 assert(nvector*split_nproc==full_nproc);
1581 assert(nvector == full_vecs);
1584 for(
int d=0;d<ndim;d++){
1588 uint64_t lsites = full_grid->
lSites();
1589 uint64_t sz = lsites * nvector;
1590 std::vector<Sobj> tmpdata(sz);
1591 std::vector<Sobj> alldata(sz);
1592 std::vector<Sobj> scalardata(lsites);
1594 for(
int v=0;v<nvector;v++){
1597 alldata[v*lsites+site] = scalardata[site];
1604 for(
int d=ndim-1;d>=0;d--){
1606 if ( ratio[d] != 1 ) {
1608 full_grid ->
AllToAll(d,alldata,tmpdata);
1611 split_grid->
AllToAll(d,alldata,tmpdata);
1616 auto rsites= lsites*M;
1625 int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
1630 for(
int m=0;m<M;m++){
1631 for(
int s=0;s<sP;s++){
1635 uint64_t lex_c = c+chunk*m+chunk*M*s;
1636 uint64_t lex_fvol_vec = c+chunk*s;
1637 uint64_t lex_fvol = lex_fvol_vec%fvol;
1638 uint64_t lex_vec = lex_fvol_vec/fvol;
1641 Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
1642 coor[d] += m*ldims[d];
1643 Lexicographic::IndexFromCoor(coor, lex_r, rdims);
1644 lex_r += lex_vec * rsites;
1647 alldata[lex_r] = tmpdata[lex_c];
1652 ldims[d]*= ratio[d];
1664 std::vector<Lattice<Vobj> > full_v(nvector,full.
Grid());
1665 for(
int n=0;n<nvector;n++){
1674 typedef typename Vobj::scalar_object Sobj;
1676 int full_vecs = full.size();
1678 assert(full_vecs>=1);
1680 GridBase * full_grid = full[0].Grid();
1690 int cb = full[0].Checkerboard();
1697 for(
int n=0;n<full_vecs;n++){
1698 assert(full[n].Checkerboard() == cb);
1699 for(
int d=0;d<ndim;d++){
1705 int nvector =full_nproc/split_nproc;
1706 assert(nvector*split_nproc==full_nproc);
1707 assert(nvector == full_vecs);
1710 for(
int d=0;d<ndim;d++){
1714 uint64_t lsites = full_grid->
lSites();
1715 uint64_t sz = lsites * nvector;
1716 std::vector<Sobj> tmpdata(sz);
1717 std::vector<Sobj> alldata(sz);
1718 std::vector<Sobj> scalardata(lsites);
1727 uint64_t rsites = split_grid->
lSites();
1730 for(
int d=0;d<ndim;d++){
1732 if ( ratio[d] != 1 ) {
1739 auto ldims = rdims; ldims[d] /= M;
1740 auto lsites= rsites/M;
1743 int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
1749 for(
int m=0;m<M;m++){
1750 for(
int s=0;s<sP;s++){
1754 uint64_t lex_c = c+chunk*m+chunk*M*s;
1755 uint64_t lex_fvol_vec = c+chunk*s;
1756 uint64_t lex_fvol = lex_fvol_vec%fvol;
1757 uint64_t lex_vec = lex_fvol_vec/fvol;
1760 Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
1761 coor[d] += m*ldims[d];
1762 Lexicographic::IndexFromCoor(coor, lex_r, rdims);
1763 lex_r += lex_vec * rsites;
1766 tmpdata[lex_c] = alldata[lex_r];
1773 split_grid->
AllToAll(d,tmpdata,alldata);
1776 full_grid ->
AllToAll(d,tmpdata,alldata);
1783 lsites = full_grid->
lSites();
1784 for(
int v=0;v<nvector;v++){
1786 scalardata[site] = alldata[v*lsites+site];
1795template<
class vobj,
class CComplex,
int nbasis,
class VLattice>
1798 const VLattice &Basis)
1801 GridBase * coarse= coarseData.Grid();
1809 for(
int v=0;v<nbasis;v++) {
1815 convertType(coarseData_[sc](v),ip_[sc]);
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
void * acceleratorAllocDevice(size_t bytes)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorFreeDevice(void *ptr)
std::vector< T, devAllocator< T > > deviceVector
AcceleratorVector< int, MaxDims > Coordinate
accelerator_inline S getlane(const Grid_simd< S, V > &in, int lane)
accelerator_inline void putlane(Grid_simd< S, V > &vec, const S &_S, int lane)
Grid_simd2< complex< double >, vComplexD > vComplexD2
Grid_simd< complex< float >, SIMD_Ftype > vComplexF
Grid_simd< complex< double >, SIMD_Dtype > vComplexD
Invoke< std::enable_if<!Condition::value, ReturnType > > NotEnableIf
Invoke< std::enable_if< Condition::value, ReturnType > > EnableIf
void LatticeCoordinate(Lattice< iobj > &l, int mu)
auto localInnerProduct(const Lattice< vobj > &lhs, const Lattice< vobj > &rhs) -> Lattice< typename vobj::tensor_reduced >
auto PeekIndex(const Lattice< vobj > &lhs, int i) -> Lattice< decltype(peekIndex< Index >(vobj(), i))>
void pokeSite(const sobj &s, Lattice< vobj > &l, const Coordinate &site)
void peekLocalSite(sobj &s, const LatticeView< vobj > &l, Coordinate &site)
void pokeLocalSite(const sobj &s, LatticeView< vobj > &l, Coordinate &site)
vobj::scalar_object peekSite(const Lattice< vobj > &l, const Coordinate &site)
ComplexD innerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
void InsertSliceFast(const Lattice< vobj > &From, Lattice< vobj > &To, int slice, int orthog)
void subdivides(GridBase *coarse, GridBase *fine)
void blockProjectFast(Lattice< iVector< CComplex, nbasis > > &coarseData, const Lattice< vobj > &fineData, const VLattice &Basis)
void InsertSlice(const Lattice< vobj > &lowDim, Lattice< vobj > &higherDim, int slice, int orthog)
void localCopyRegion(const Lattice< vobj > &From, Lattice< vobj > &To, Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
auto _precisionChangeFastWrap(Lattice< VobjOut > &out, const Lattice< VobjIn > &in, int dummy) -> decltype(precisionChange(((typename VobjOut::vector_type *) 0),((typename VobjIn::vector_type *) 0), 1), int())
void InsertSliceLocal(const Lattice< vobj > &lowDim, Lattice< vobj > &higherDim, int slice_lo, int slice_hi, int orthog)
void blockNormalise(Lattice< CComplex > &ip, Lattice< vobj > &fineX)
void batchBlockPromote(const std::vector< Lattice< iVector< CComplex, nbasis > > > &coarseData, std::vector< Lattice< vobj > > &fineData, const VLattice &Basis)
void precisionChangeFast(Lattice< VobjOut > &out, const Lattice< VobjIn > &in)
void blockOrthogonalise(Lattice< CComplex > &ip, std::vector< Lattice< vobj > > &Basis)
void batchBlockProject(std::vector< Lattice< iVector< CComplex, nbasis > > > &coarseData, const std::vector< Lattice< vobj > > &fineData, const VLattice &Basis)
void localConvert(const Lattice< vobj > &in, Lattice< vvobj > &out)
void blockInnerProductD(Lattice< CComplex > &CoarseInner, const Lattice< vobj > &fineX, const Lattice< vobj > &fineY)
void ExtractSliceFast(Lattice< vobj > &To, const Lattice< vobj > &From, int slice, int orthog)
void blockPromote(const Lattice< iVector< CComplex, nbasis > > &coarseData, Lattice< vobj > &fineData, const VLattice &Basis)
void acceleratorSetCheckerboard(Lattice< vobj > &full, const Lattice< vobj > &half, int checker_dim_half=0)
void acceleratorPickCheckerboard(int cb, Lattice< vobj > &half, const Lattice< vobj > &full, int checker_dim_half=0)
std::enable_if< isSIMDvectorized< vobj >::value &&!isSIMDvectorized< sobj >::value, void >::type unvectorizeToRevLexOrdArray(std::vector< sobj > &out, const Lattice< vobj > &in)
accelerator_inline void convertType(ComplexD &out, const std::complex< double > &in)
void setCheckerboard(Lattice< vobj > &full, const Lattice< vobj > &half)
void blockOrthonormalize(Lattice< CComplex > &ip, VLattice &Basis)
void blockPick(GridBase *coarse, const Lattice< vobj > &unpicked, Lattice< vobj > &picked, Coordinate coor)
void precisionChangeOrig(Lattice< VobjOut > &out, const Lattice< VobjIn > &in)
void ExtractSlice(Lattice< vobj > &lowDim, const Lattice< vobj > &higherDim, int slice, int orthog)
void pickCheckerboard(int cb, Lattice< vobj > &half, const Lattice< vobj > &full)
void ExtractSliceLocal(Lattice< vobj > &lowDim, const Lattice< vobj > &higherDim, int slice_lo, int slice_hi, int orthog)
std::enable_if< isSIMDvectorized< vobj >::value &&!isSIMDvectorized< sobj >::value, void >::type vectorizeFromRevLexOrdArray(std::vector< sobj > &in, Lattice< vobj > &out)
void blockZAXPY(Lattice< vobj > &fineZ, const Lattice< CComplex > &coarseA, const Lattice< vobj2 > &fineX, const Lattice< vobj > &fineY)
std::enable_if< isSIMDvectorized< vobj >::value &&!isSIMDvectorized< sobj >::value, void >::type vectorizeFromLexOrdArray(std::vector< sobj > &in, Lattice< vobj > &out)
auto localInnerProductD(const Lattice< vobj > &lhs, const Lattice< vobj > &rhs) -> Lattice< iScalar< decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0], rhs.View(CpuRead)[0])))> >
void precisionChange(Lattice< VobjOut > &out, const Lattice< VobjIn > &in, const precisionChangeWorkspace &workspace)
void Replicate(const Lattice< vobj > &coarse, Lattice< vobj > &fine)
void Grid_unsplit(std::vector< Lattice< Vobj > > &full, Lattice< Vobj > &split)
void blockInnerProduct(Lattice< CComplex > &CoarseInner, const Lattice< vobj > &fineX, const Lattice< vobj > &fineY)
std::enable_if< isSIMDvectorized< vobj >::value &&!isSIMDvectorized< sobj >::value, void >::type unvectorizeToLexOrdArray(std::vector< sobj > &out, const Lattice< vobj > &in)
void blockProject(Lattice< iVector< CComplex, nbasis > > &coarseData, const Lattice< vobj > &fineData, const VLattice &Basis)
void blockSum(Lattice< vobj > &coarseData, const Lattice< vobj > &fineData)
void Grid_split(std::vector< Lattice< Vobj > > &full, Lattice< Vobj > &split)
Lattice< obj > pow(const Lattice< obj > &rhs_i, RealD y)
#define autoView(l_v, l, mode)
@ AcceleratorWriteDiscard
#define NAMESPACE_BEGIN(A)
std::complex< RealF > ComplexF
std::complex< RealD > ComplexD
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline std::enable_if<!isGridTensor< T >::value, T >::type TensorRemove(T arg)
accelerator_inline ComplexD innerProductD2(const ComplexF &l, const ComplexF &r)
#define thread_for(i, num,...)
void AllToAll(int dim, std::vector< T > &in, std::vector< T > &out)
unsigned long _ndimension
int64_t gSites(void) const
Coordinate _checker_dim_mask
const Coordinate & FullDimensions(void)
void GlobalIndexToGlobalCoor(int64_t gidx, Coordinate &gcoor)
void iCoorFromIindex(Coordinate &coor, int lane)
static accelerator_inline constexpr int Nsimd(void)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const
std::pair< Integer, Integer > * fmap_device
precisionChangeWorkspace & operator=(precisionChangeWorkspace &&r)=delete
precisionChangeWorkspace(const precisionChangeWorkspace &r)=delete
precisionChangeWorkspace & operator=(const precisionChangeWorkspace &r)=delete
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid)
std::pair< Integer, Integer > const * getMap() const
~precisionChangeWorkspace()
void checkGrids(GridBase *out, GridBase *in) const
precisionChangeWorkspace(precisionChangeWorkspace &&r)=delete