28#if defined(GRID_CUDA)||defined(GRID_HIP)
44 typedef typename vobj::scalar_object sobj;
49 std::vector<sobj> sumarray(nthread);
50 for(
int i=0;i<nthread;i++){
55 int nwork, mywork, myoff;
59 for(
int ss=myoff;ss<mywork+myoff; ss++){
60 vvsum = vvsum + arg[ss];
62 sumarray[thr]=
Reduce(vvsum);
66 for(
int i=0;i<nthread;i++){
67 ssum = ssum+sumarray[i];
74 typedef typename vobj::scalar_objectD sobj;
78 std::vector<sobj> sumarray(nthread);
79 for(
int i=0;i<nthread;i++){
84 int nwork, mywork, myoff;
88 for(
int ss=myoff;ss<mywork+myoff; ss++){
89 vvsum = vvsum + arg[ss];
91 sumarray[thr]=
Reduce(vvsum);
95 for(
int i=0;i<nthread;i++){
96 ssum = ssum+sumarray[i];
129inline typename vobj::scalar_object
sum(
const vobj *arg,
Integer osites)
131#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
138inline typename vobj::scalar_objectD
sumD(
const vobj *arg,
Integer osites)
140#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
149#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
160#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
162 return sum_gpu(&arg_v[0],osites);
165 return sum_cpu(&arg_v[0],osites);
180#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
187 return sum_cpu(&arg_v[0],osites);
208template<
class Op,
class T1>
214template<
class Op,
class T1,
class T2>
221template<
class Op,
class T1,
class T2,
class T3>
231 typedef typename vobj::tensor_reduced vscalar;
232 typedef typename vscalar::scalar_object scalar;
236 auto grid = arg.
Grid();
239 for(
int l=0;l<grid->lSites();l++){
243 grid->LocalIndexToLocalCoor(l,coor);
246 if( (l==0) || (r>max)){
250 grid->GlobalMax(max);
258 typedef typename vobj::vector_typeD vector_type;
263 const uint64_t nsimd = grid->
Nsimd();
264 const uint64_t sites = grid->
oSites();
269 auto inner_tmp_v = &inner_tmp[0];
277 auto x_l = left_v(ss);
278 auto y_l = right_v(ss);
283 auto anrm =
sumD(inner_tmp_v,sites);
303 uint64_t *
base= (uint64_t *)&l_v[0];
308 std::cerr<<
" Bad CSUM " << std::hex<< csum <<
" recomputed as "<<csum2<<std::dec<<std::endl;
324 std::cerr<<
" Bad NORM " <<
local <<
" recomputed as "<<local2<<std::endl;
361 const uint64_t nsimd = grid->
Nsimd();
362 const uint64_t sites = grid->
oSites();
370 inner_tmp.resize(sites);
371 auto inner_tmp_v = &inner_tmp[0];
374 auto tmp = a*x_v(ss)+b*y_v(ss);
386 Integer words = sites*
sizeof(vobj)/
sizeof(uint64_t);
387 uint64_t *
base= (uint64_t *)&z_v[0];
392 std::cerr<<
" Bad z_v CSUM " << std::hex<< csum <<
" recomputed as "<<csum2<<std::dec<<std::endl;
398 Integer words = sites*
sizeof(inner_t)/
sizeof(uint64_t);
399 uint64_t *
base= (uint64_t *)&inner_tmp_v[0];
404 std::cerr<<
" Bad inner_tmp_v CSUM " << std::hex<< csum <<
" recomputed as "<<csum2<<std::dec<<std::endl;
424 typedef typename vobj::vector_typeD vector_type;
425 std::vector<ComplexD> tmp(2);
429 const uint64_t nsimd = grid->
Nsimd();
430 const uint64_t sites = grid->
oSites();
437 auto inner_tmp_v = &inner_tmp[0];
438 auto norm_tmp_v = &norm_tmp[0];
443 auto left_tmp = left_v[ss];
457template<
class Op,
class T1>
459 ->typename
decltype(expr.op.func(
eval(0,expr.arg1)))::scalar_object
464template<
class Op,
class T1,
class T2>
466 ->typename
decltype(expr.op.func(
eval(0,expr.arg1),
eval(0,expr.arg2)))::scalar_object
472template<
class Op,
class T1,
class T2,
class T3>
474 ->typename
decltype(expr.op.func(
eval(0,expr.arg1),
487 std::vector<typename vobj::scalar_object> &result,
495 typedef typename vobj::scalar_object sobj;
496 typedef typename vobj::scalar_object::scalar_type
scalar_type;
501 const int Nsimd = grid->
Nsimd();
503 assert(orthogdim >= 0);
504 assert(orthogdim <
Nd);
510 std::vector<vobj> lvSum(rd);
511 std::vector<sobj> lsSum(ld,
Zero());
515 for(
int r=0;r<rd;r++){
522 int ostride=grid->
_ostride[orthogdim];
530 for(
int rt=0;rt<rd;rt++){
534 for(
int idx=0;idx<Nsimd;idx++){
538 int ldx =rt+icoor[orthogdim]*rd;
540 lsSum[ldx]=lsSum[ldx]+extracted[idx];
546 for(
int t=0;t<fd;t++){
562template<
class vobj>
inline
563std::vector<typename vobj::scalar_object>
566 std::vector<typename vobj::scalar_object> result;
588 typedef typename vobj::vector_type vector_type;
595 const int Nsimd = grid->
Nsimd();
597 assert(orthogdim >= 0);
598 assert(orthogdim <
Nd);
604 std::vector<vector_type> lvSum(rd);
605 std::vector<scalar_type > lsSum(ld,
scalar_type(0.0));
609 for(
int r=0;r<rd;r++){
623 for(
int n=0;n<e1;n++){
624 for(
int b=0;b<e2;b++){
625 int ss= so+n*stride+b;
627 lvSum[r]=lvSum[r]+vv;
634 for(
int rt=0;rt<rd;rt++){
640 for(
int idx=0;idx<Nsimd;idx++){
644 int ldx =rt+icoor[orthogdim]*rd;
646 lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
653 for(
int t=0;t<fd;t++){
670 typedef typename vobj::scalar_object sobj;
672 typedef typename vobj::vector_type vector_type;
675 std::vector<ComplexD> ip(Nblock);
679 for(
int ss=0;ss<Nblock;ss++){
680 sn[ss] =
real(ip[ss]);
687 int orthogdim,
RealD scale=1.0)
690 typedef typename vobj::scalar_object sobj;
692 typedef typename vobj::vector_type vector_type;
693 typedef typename vobj::tensor_reduced tensor_reduced;
699 int Nsimd =grid->
Nsimd();
711 for(
int r=0;r<rd;r++){
717 for(
int l=0;l<Nsimd;l++){
719 int ldx =r+icoor[orthogdim]*rd;
723 tensor_reduced at; at=av;
729 int ss= so+n*stride+b;
730 Rv[ss] = at*Xv[ss]+Yv[ss];
738 int nsimd = BlockSolverGrid->
Nsimd();
740 std::vector<int> latt_phys(NN-1);
742 std::vector<int> mpi_phys(NN-1);
747 for(
int d=0;d<NN;d++){
752 if ( d == BlockSolverGrid->
_checker_dim ) checker_dim = dd;
780 typedef typename vobj::scalar_object sobj;
781 typedef typename vobj::vector_type vector_type;
783 for(
int i=0;i<Nslice;i++){
787 for(
int j=0;j<Nslice;j++){
789 Rs = Rs + Xs*(scale*aa(j,i));
813 typedef typename vobj::scalar_object sobj;
814 typedef typename vobj::vector_type vector_type;
816 mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
817 for(
int s=0;s<Nslice;s++){
819 for(
int ss=0;ss<Nslice;ss++){
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
AcceleratorVector< int, MaxDims > Coordinate
const Coordinate GridDefaultSimd(int dims, int nsimd)
auto closure(const LatticeUnaryExpression< Op, T1 > &expr) -> Lattice< typename std::remove_const< decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
accelerator_inline sobj eval(const uint64_t ss, const sobj &arg)
auto localNorm2(const Lattice< vobj > &rhs) -> Lattice< typename vobj::tensor_reduced >
void peekLocalSite(sobj &s, const LatticeView< vobj > &l, Coordinate &site)
Lattice< vobj > real(const Lattice< vobj > &lhs)
vobj::scalar_object rankSum(const Lattice< vobj > &arg)
ComplexD innerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
RealD maxLocalNorm2(const Lattice< vobj > &arg)
vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
strong_inline void innerProductNorm(ComplexD &ip, RealD &nrm, const Lattice< vobj > &left, const Lattice< vobj > &right)
vobj::scalar_object sum(const vobj *arg, Integer osites)
static void sliceMaddVector(Lattice< vobj > &R, std::vector< RealD > &a, const Lattice< vobj > &X, const Lattice< vobj > &Y, int orthogdim, RealD scale=1.0)
GridBase * makeSubSliceGrid(const GridBase *BlockSolverGrid, int Orthog)
ComplexD rankInnerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
vobj::scalar_object sum_large(const Lattice< vobj > &arg)
static void sliceMulMatrix(Lattice< vobj > &R, Eigen::MatrixXcd &aa, const Lattice< vobj > &X, int Orthog, RealD scale=1.0)
static void sliceNorm(std::vector< RealD > &sn, const Lattice< vobj > &rhs, int Orthog)
vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
static void sliceMaddMatrix(Lattice< vobj > &R, Eigen::MatrixXcd &aa, const Lattice< vobj > &X, const Lattice< vobj > &Y, int Orthog, RealD scale=1.0)
static void sliceInnerProductMatrix(Eigen::MatrixXcd &mat, const Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int Orthog)
vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
strong_inline RealD axpby_norm_fast(Lattice< vobj > &z, sobj a, sobj b, const Lattice< vobj > &x, const Lattice< vobj > &y)
strong_inline RealD axpy_norm_fast(Lattice< vobj > &z, sobj a, const Lattice< vobj > &x, const Lattice< vobj > &y)
vobj::scalar_object rankSumLarge(const Lattice< vobj > &arg)
static void sliceInnerProductVector(std::vector< ComplexD > &result, const Lattice< vobj > &lhs, const Lattice< vobj > &rhs, int orthogdim)
void sliceSum(const Lattice< vobj > &Data, std::vector< typename vobj::scalar_object > &result, int orthogdim)
vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
RealD norm2(const Lattice< vobj > &arg)
vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
Word svm_xor(Word *vec, uint64_t L)
void sliceSumReduction(const Lattice< vobj > &Data, std::vector< vobj > &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
void InsertSlice(const Lattice< vobj > &lowDim, Lattice< vobj > &higherDim, int slice, int orthog)
void ExtractSlice(Lattice< vobj > &lowDim, const Lattice< vobj > &higherDim, int slice, int orthog)
#define autoView(l_v, l, mode)
#define NAMESPACE_BEGIN(A)
std::complex< RealD > ComplexD
accelerator_inline ComplexD Reduce(const ComplexD &r)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline std::enable_if<!isGridTensor< T >::value, T >::type TensorRemove(T arg)
accelerator_inline ComplexD innerProductD(const ComplexF &l, const ComplexF &r)
#define thread_for(i, num,...)
#define thread_for2d(i1, n1, i2, n2,...)
Coordinate _processor_coor
void GlobalSumVector(RealF *, int N)
void GlobalSumP2P(obj &o)
unsigned long _ndimension
static bool StepLog(const char *name)
static void ReductionLog(double lcl, double glbl)
static bool CsumLog(uint64_t csum)
static bool NormLog(double value)
const Coordinate & GlobalDimensions(void)
Coordinate _checker_dim_mask
void iCoorFromIindex(Coordinate &coor, int lane)
static int GetThreads(void)
static void GetWork(int nwork, int me, int &mywork, int &myoff)
accelerator_inline int Checkerboard(void) const
GridBase * Grid(void) const