42 std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
44 chi_i.Checkerboard() = psi_i.Checkerboard();
51 assert(phi.Checkerboard() == psi.Checkerboard());
53 auto pdiag = &this->
d_diag[0];
54 auto pupper = &this->
d_upper[0];
55 auto plower = &this->
d_lower[0];
68 for(
int s=0; s<
Ls; s++){
69 uint64_t idx_u = ss+((s+1)%
Ls);
70 uint64_t idx_l = ss+((s+
Ls-1)%
Ls);
73 coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
81 std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
82 std::vector<Coeff_t> &shift_coeffs)
84 chi_i.Checkerboard() = psi_i.Checkerboard();
92 int shift_s = (pm == 1) ? (Ls-1) : 0;
94 assert(phi.Checkerboard() == psi.Checkerboard());
96 auto pdiag = &this->d_diag[0];
97 auto pupper = &this->d_upper[0];
98 auto plower = &this->d_lower[0];
99 auto pshift_coeffs = &this->d_shift_coefficients[0];
107 int nloop = grid->
oSites()/Ls;
109 uint64_t ss = sss*Ls;
114 for(
int s=0; s<Ls; s++){
115 uint64_t idx_u = ss+((s+1)%Ls);
116 uint64_t idx_l = ss+((s+Ls-1)%Ls);
120 if(pm == 1){
spProj5p(tmp, psi(ss+shift_s)); }
121 else {
spProj5m(tmp, psi(ss+shift_s)); }
123 coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp);
131 std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
133 chi_i.Checkerboard() = psi_i.Checkerboard();
140 assert(phi.Checkerboard() == psi.Checkerboard());
142 auto pdiag = &this->
d_diag[0];
143 auto pupper = &this->
d_upper[0];
144 auto plower = &this->
d_lower[0];
153 uint64_t ss = sss*
Ls;
158 for(
int s=0; s<
Ls; s++){
159 uint64_t idx_u = ss+((s+1)%
Ls);
160 uint64_t idx_l = ss+((s+
Ls-1)%
Ls);
163 coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
170 std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
171 std::vector<Coeff_t> &shift_coeffs)
173 chi_i.Checkerboard() = psi_i.Checkerboard();
176 int shift_s = (this->pm == 1) ? (Ls-1) : 0;
181 assert(phi.Checkerboard() == psi.Checkerboard());
183 auto pdiag = &this->d_diag[0];
184 auto pupper = &this->d_upper[0];
185 auto plower = &this->d_lower[0];
186 auto pshift_coeffs = &this->d_shift_coefficients[0];
196 int nloop = grid->
oSites()/Ls;
198 uint64_t ss = sss*Ls;
201 spinor tmp1, tmp2, tmp;
205 for(
int s=0; s<Ls; s++){
207 uint64_t idx_u = ss+((s+1)%Ls);
208 uint64_t idx_l = ss+((s+Ls-1)%Ls);
213 if(s==(Ls-1))
coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
214 else coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
215 if(pm == 1){
spProj5p(tmp, psi(ss+s)); }
218 coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp);
227 chi_i.Checkerboard() = psi_i.Checkerboard();
233 auto plee = & this->d_lee [0];
234 auto pdee = & this->d_dee [0];
235 auto puee = & this->d_uee [0];
236 auto pleem = & this->d_leem[0];
237 auto pueem = & this->d_ueem[0];
245 if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i);
return; }
247 int nloop = grid->
oSites()/Ls;
251 spinor tmp,
acc, res;
262 for(
int s=1;s<Ls-1;s++){
264 res -= plee[s-1]*tmp;
270 res = psi(ss+Ls-1) - plee[Ls-2]*tmp -
acc;
273 res = (1.0/pdee[Ls-1])*res;
277 for (
int s=Ls-2;s>=0;s--){
278 res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*
acc;
289 chi_i.Checkerboard() = psi_i.Checkerboard();
297 auto plee = & this->d_lee [0];
298 auto pdee = & this->d_dee [0];
299 auto puee = & this->d_uee [0];
300 auto pleem = & this->d_leem[0];
301 auto pueem = & this->d_ueem[0];
302 auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
303 auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
313 int nloop = grid->
oSites()/Ls;
317 spinor tmp,
acc, res, tmp_spProj;
325 tmp_spProj = pMooeeInv_shift_lc[0]*res;
327 for(
int s=1;s<Ls-1;s++){
329 tmp_spProj += pMooeeInv_shift_lc[s]*res;
330 res -= plee[s-1]*tmp;
338 tmp_spProj += pMooeeInv_shift_lc[Ls-1]*res;
339 if(pm == 1){
spProj5p(tmp_spProj, tmp_spProj);}
340 else {
spProj5m(tmp_spProj, tmp_spProj); }
342 res = res - plee[Ls-2]*tmp -
acc;
345 res = (1.0/pdee[Ls-1])*res;
348 coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj);
349 for (
int s=Ls-2;s>=0;s--){
350 res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*
acc;
352 coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj);
361 if(this->shift != 0.0){ MooeeInvDag_shift(psi_i,chi_i);
return; }
363 chi_i.Checkerboard() = psi_i.Checkerboard();
369 auto plee = &this->d_lee [0];
370 auto pdee = &this->d_dee [0];
371 auto puee = &this->d_uee [0];
372 auto pleem = &this->d_leem[0];
373 auto pueem = &this->d_ueem[0];
381 int nloop = grid->
oSites()/Ls;
385 spinor tmp,
acc, res;
396 for(
int s=1;s<Ls-1;s++){
398 res -= puee[s-1]*tmp;
404 res = psi(ss+Ls-1) - puee[Ls-2]*tmp -
acc;
407 res = (1.0/pdee[Ls-1])*res;
411 for (
int s=Ls-2;s>=0;s--){
412 res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*
acc;
422 chi_i.Checkerboard() = psi_i.Checkerboard();
429 auto plee = & this->d_lee [0];
430 auto pdee = & this->d_dee [0];
431 auto puee = & this->d_uee [0];
432 auto pleem = & this->d_leem[0];
433 auto pueem = & this->d_ueem[0];
435 auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
436 auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
449 int nloop = grid->
oSites()/Ls;
453 spinor tmp,
acc, res, tmp_spProj;
461 tmp_spProj = pMooeeInvDag_shift_lc[0]*res;
463 for(
int s=1;s<Ls-1;s++){
465 tmp_spProj += pMooeeInvDag_shift_lc[s]*res;
466 res -= puee[s-1]*tmp;
474 tmp_spProj += pMooeeInvDag_shift_lc[Ls-1]*res;
475 if(pm == 1){
spProj5p(tmp_spProj, tmp_spProj); }
476 else {
spProj5m(tmp_spProj, tmp_spProj); }
478 res = res - puee[Ls-2]*tmp -
acc;
481 res = (1.0/pdee[Ls-1])*res;
484 coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj);
485 for (
int s=Ls-2;s>=0;s--){
486 res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*
acc;
488 coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj);
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
#define acc(v, a, off, step, n)
#define autoView(l_v, l, mode)
#define NAMESPACE_BEGIN(A)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
accelerator_inline void spProj5m(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spProj5p(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
deviceVector< Coeff_t > d_upper
deviceVector< Coeff_t > d_diag
deviceVector< Coeff_t > d_lower
void M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector< Coeff_t > &lower, std::vector< Coeff_t > &diag, std::vector< Coeff_t > &upper, std::vector< Coeff_t > &shift_coeffs)
virtual void MooeeInv(const FermionField &in, FermionField &out)
virtual void MooeeInvDag_shift(const FermionField &in, FermionField &out)
virtual void M5D(const FermionField &psi, FermionField &chi)
virtual void MooeeInv_shift(const FermionField &in, FermionField &out)
virtual void M5Ddag(const FermionField &psi, FermionField &chi)
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
void M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector< Coeff_t > &lower, std::vector< Coeff_t > &diag, std::vector< Coeff_t > &upper, std::vector< Coeff_t > &shift_coeffs)