61 const FermionField &phi_i,
68 chi_i.Checkerboard()=psi_i.Checkerboard();
75 const int nsimd= Simd::Nsimd();
81 assert(Ls/LLs==nsimd);
82 assert(phi.Checkerboard() == psi.Checkerboard());
90 for(
int o=0;o<LLs;o++){
91 for(
int i=0;i<nsimd;i++){
101 thread_loop( (
int ss=0;ss<grid->
oSites();ss+=LLs),{
103 alignas(64) SiteHalfSpinor hp;
104 alignas(64) SiteHalfSpinor hm;
105 alignas(64) SiteSpinor fp;
106 alignas(64) SiteSpinor fm;
108 for(
int v=0;v<LLs;v++){
111 int vm=(v+LLs-1)%LLs;
116 if ( vp<=v )
rotate(hp,hp,1);
117 if ( vm>=v )
rotate(hm,hm,nsimd-1);
125 chi[ss+v] = d[v]*phi[ss+v];
126 chi[ss+v] = chi[ss+v] +u[v]*fp;
127 chi[ss+v] = chi[ss+v] +l[v]*fm;
131 for(
int v=0;v<LLs;v++){
135 int vp= (v==LLs-1) ? 0 : v+1;
136 int vm= (v==0 ) ? LLs-1 : v-1;
138 Simd hp_00 = psi[ss+vp]()(2)(0);
139 Simd hp_01 = psi[ss+vp]()(2)(1);
140 Simd hp_02 = psi[ss+vp]()(2)(2);
141 Simd hp_10 = psi[ss+vp]()(3)(0);
142 Simd hp_11 = psi[ss+vp]()(3)(1);
143 Simd hp_12 = psi[ss+vp]()(3)(2);
145 Simd hm_00 = psi[ss+vm]()(0)(0);
146 Simd hm_01 = psi[ss+vm]()(0)(1);
147 Simd hm_02 = psi[ss+vm]()(0)(2);
148 Simd hm_10 = psi[ss+vm]()(1)(0);
149 Simd hm_11 = psi[ss+vm]()(1)(1);
150 Simd hm_12 = psi[ss+vm]()(1)(2);
153 hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
154 hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
155 hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
156 hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
157 hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
158 hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
161 hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
162 hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
163 hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
164 hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
165 hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
166 hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
170 Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
171 Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
172 Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
173 Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
174 Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
175 Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
176 Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
177 Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
178 Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
179 Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
180 Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
181 Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
183 vstream(chi[ss+v]()(0)(0),p_00);
184 vstream(chi[ss+v]()(0)(1),p_01);
185 vstream(chi[ss+v]()(0)(2),p_02);
186 vstream(chi[ss+v]()(1)(0),p_10);
187 vstream(chi[ss+v]()(1)(1),p_11);
188 vstream(chi[ss+v]()(1)(2),p_12);
189 vstream(chi[ss+v]()(2)(0),p_20);
190 vstream(chi[ss+v]()(2)(1),p_21);
191 vstream(chi[ss+v]()(2)(2),p_22);
192 vstream(chi[ss+v]()(3)(0),p_30);
193 vstream(chi[ss+v]()(3)(1),p_31);
194 vstream(chi[ss+v]()(3)(2),p_32);
204 const FermionField &phi_i,
211 chi_i.Checkerboard()=psi_i.Checkerboard();
218 int nsimd= Simd::Nsimd();
224 assert(Ls/LLs==nsimd);
225 assert(phi.Checkerboard() == psi.Checkerboard());
233 for(
int o=0;o<LLs;o++){
234 for(
int i=0;i<nsimd;i++){
242 thread_loop( (
int ss=0;ss<grid->
oSites();ss+=LLs),{
244 alignas(64) SiteHalfSpinor hp;
245 alignas(64) SiteHalfSpinor hm;
246 alignas(64) SiteSpinor fp;
247 alignas(64) SiteSpinor fm;
249 for(
int v=0;v<LLs;v++){
252 int vm=(v+LLs-1)%LLs;
257 if ( vp<=v )
rotate(hp,hp,1);
258 if ( vm>=v )
rotate(hm,hm,nsimd-1);
265 chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
266 chi[ss+v] = chi[ss+v] +l[v]*fm;
270 for(
int v=0;v<LLs;v++){
274 int vp= (v==LLs-1) ? 0 : v+1;
275 int vm= (v==0 ) ? LLs-1 : v-1;
277 Simd hp_00 = psi[ss+vp]()(0)(0);
278 Simd hp_01 = psi[ss+vp]()(0)(1);
279 Simd hp_02 = psi[ss+vp]()(0)(2);
280 Simd hp_10 = psi[ss+vp]()(1)(0);
281 Simd hp_11 = psi[ss+vp]()(1)(1);
282 Simd hp_12 = psi[ss+vp]()(1)(2);
284 Simd hm_00 = psi[ss+vm]()(2)(0);
285 Simd hm_01 = psi[ss+vm]()(2)(1);
286 Simd hm_02 = psi[ss+vm]()(2)(2);
287 Simd hm_10 = psi[ss+vm]()(3)(0);
288 Simd hm_11 = psi[ss+vm]()(3)(1);
289 Simd hm_12 = psi[ss+vm]()(3)(2);
292 hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
293 hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
294 hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
295 hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
296 hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
297 hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
300 hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
301 hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
302 hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
303 hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
304 hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
305 hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
308 Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
309 Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
310 Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
311 Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
312 Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
313 Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
315 Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
316 Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
317 Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
318 Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
319 Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
320 Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
322 vstream(chi[ss+v]()(0)(0),p_00);
323 vstream(chi[ss+v]()(0)(1),p_01);
324 vstream(chi[ss+v]()(0)(2),p_02);
325 vstream(chi[ss+v]()(1)(0),p_10);
326 vstream(chi[ss+v]()(1)(1),p_11);
327 vstream(chi[ss+v]()(1)(2),p_12);
328 vstream(chi[ss+v]()(2)(0),p_20);
329 vstream(chi[ss+v]()(2)(1),p_21);
330 vstream(chi[ss+v]()(2)(2),p_22);
331 vstream(chi[ss+v]()(3)(0),p_30);
332 vstream(chi[ss+v]()(3)(1),p_31);
333 vstream(chi[ss+v]()(3)(2),p_32);
358 SiteHalfSpinor BcastP;
359 SiteHalfSpinor BcastM;
360 SiteHalfSpinor SiteChiP;
361 SiteHalfSpinor SiteChiM;
364 for(
int s1=0;s1<LLs;s1++){
365 for(
int s2=0;s2<LLs;s2++){
366 for(
int l=0; l<Simd::Nsimd();l++){
371 if ( s2==0 && l==0) {
376 for(
int sp=0;sp<2;sp++){
377 for(
int co=0;co<
Nc;co++){
378 vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
380 for(
int sp=0;sp<2;sp++){
381 for(
int co=0;co<
Nc;co++){
382 vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
385 for(
int sp=0;sp<2;sp++){
386 for(
int co=0;co<
Nc;co++){
387 SiteChiP()(sp)(co)=
real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co));
388 SiteChiM()(sp)(co)=
real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co));
393 int lex = s1+LLs*site;
394 for(
int sp=0;sp<2;sp++){
395 for(
int co=0;co<
Nc;co++){
396 vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
397 vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
// Symbolic names for zmm vector registers, for use inside inline-asm templates.
// The doubled '%%' is the escape that yields a literal '%' in a GCC-style
// extended-asm template that also takes operands (the asm statements in this
// region pass "r" operands).
// Chi_30..Chi_32 map to zmm10..zmm12; presumably the tail of a larger Chi_*
// accumulator set whose earlier aliases are not visible here -- TODO confirm.
416#define Chi_30 %%zmm10
417#define Chi_31 %%zmm11
418#define Chi_32 %%zmm12
// BCAST0..BCAST11: twelve aliases mapped, in order, to zmm13..zmm24.
420#define BCAST0 %%zmm13
421#define BCAST1 %%zmm14
422#define BCAST2 %%zmm15
423#define BCAST3 %%zmm16
424#define BCAST4 %%zmm17
425#define BCAST5 %%zmm18
426#define BCAST6 %%zmm19
427#define BCAST7 %%zmm20
428#define BCAST8 %%zmm21
429#define BCAST9 %%zmm22
430#define BCAST10 %%zmm23
431#define BCAST11 %%zmm24
434 for(
int s1=0;s1<LLs;s1++){
435 for(
int s2=0;s2<LLs;s2++){
437 uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1];
438 uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
439 uint64_t a2 = (uint64_t)&psi[lex];
440 for(
int l=0; l<Simd::Nsimd();l++){
462 : :
"r" (a0),
"r" (a1),
"r" (a2) );
477 : :
"r" (a0),
"r" (a1),
"r" (a2) );
481 a2 = a2+
sizeof(
typename Simd::scalar_type);
484 int lexa = s1+LLs*site;
490 : :
"r" ((uint64_t)&chi[lexa]) :
"memory" );
535 SiteHalfSpinor BcastP;
536 SiteHalfSpinor BcastM;
537 SiteHalfSpinor SiteChiP;
538 SiteHalfSpinor SiteChiM;
541 for(
int s1=0;s1<LLs;s1++){
542 for(
int s2=0;s2<LLs;s2++){
543 for(
int l=0; l<Simd::Nsimd();l++){
548 if ( s2==0 && l==0) {
553 for(
int sp=0;sp<2;sp++){
554 for(
int co=0;co<
Nc;co++){
555 vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
557 for(
int sp=0;sp<2;sp++){
558 for(
int co=0;co<
Nc;co++){
559 vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
562 for(
int sp=0;sp<2;sp++){
563 for(
int co=0;co<
Nc;co++){
564 SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
565 SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
571 int lex = s1+LLs*site;
572 for(
int sp=0;sp<2;sp++){
573 for(
int co=0;co<
Nc;co++){
574 vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
575 vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
// pChi_<a><b>: twelve aliases (a = 0..3, b = 0..2) mapped, in order, to
// zmm0..zmm11. The index pattern matches the (spin)(colour)-style component
// accesses used elsewhere in this file -- presumably one register per
// component; TODO confirm against the asm body.
599#define pChi_00 %%zmm0
600#define pChi_01 %%zmm1
601#define pChi_02 %%zmm2
602#define pChi_10 %%zmm3
603#define pChi_11 %%zmm4
604#define pChi_12 %%zmm5
605#define pChi_20 %%zmm6
606#define pChi_21 %%zmm7
607#define pChi_22 %%zmm8
608#define pChi_30 %%zmm9
609#define pChi_31 %%zmm10
610#define pChi_32 %%zmm11
// BCAST_<a><b> / SHUF_<a><b>: interleaved pairs (a = 0..1, b = 0..2) on
// zmm12..zmm23, with each BCAST_* immediately followed by its SHUF_* partner.
// NOTE(review): these twelve aliases use a single '%', whereas the pChi_*
// aliases above use '%%'. Inside an extended-asm template with operands a
// single '%' is not a literal percent sign; confirm whether these macros are
// only used through a stringifying helper macro (or are unused) before
// changing them.
612#define BCAST_00 %zmm12
613#define SHUF_00 %zmm13
614#define BCAST_01 %zmm14
615#define SHUF_01 %zmm15
616#define BCAST_02 %zmm16
617#define SHUF_02 %zmm17
618#define BCAST_10 %zmm18
619#define SHUF_10 %zmm19
620#define BCAST_11 %zmm20
621#define SHUF_11 %zmm21
622#define BCAST_12 %zmm22
623#define SHUF_12 %zmm23
631 for(
int s1=0;s1<LLs;s1++){
632 for(
int s2=0;s2<LLs;s2++){
634 uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1];
635 uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
636 uint64_t a2 = (uint64_t)&psi[lex];
637 for(
int l=0; l<Simd::Nsimd();l++){
720 a2 = a2+
sizeof(
typename Simd::scalar_type);
723 int lexa = s1+LLs*site;
739 : :
"r" ((uint64_t)&chi[lexa]) :
"memory" );
782 chi.Checkerboard()=psi.Checkerboard();
785 int LLs = psi.Grid()->_rdimensions[0];
786 int vol = psi.Grid()->oSites()/LLs;
799 if ( inv && (!dag) ) {
804 MooeeInternalCompute(dag,inv,Matp,Matm);
808 assert(_Matp->size()==Ls*LLs);
810 if ( switcheroo<Coeff_t>::iscomplex() ) {
811 thread_loop( (
auto site=0;site<vol;site++),{
812 MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
815 thread_loop( (
auto site=0;site<vol;site++),{
816 MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
std::vector< T, uvmAllocator< T > > Vector
accelerator_inline void vstream(Grid_simd2< S, V > &out, const Grid_simd2< S, V > &in)
accelerator_inline Grid_simd2< S, V > real_madd(Grid_simd2< S, V > a, Grid_simd2< S, V > b, Grid_simd2< S, V > c)
accelerator_inline void vbroadcast(Grid_simd2< S, V > &ret, const Grid_simd2< S, V > &src, int lane)
accelerator_inline Grid_simd< S, V > rotate(Grid_simd< S, V > b, int nrot)
Invoke< std::enable_if< Condition::value, ReturnType > > EnableIf
#define VBCASTCDUP(OFF, A, DEST)
#define VMULIDUP(O, P, B, accum)
#define VMADDSUBIDUP(O, P, B, accum)
#define VMULMEM(O, P, B, accum)
#define VMADDSUBRDUP(O, P, B, accum)
#define VMADDMEM(O, P, B, accum)
#define autoView(l_v, l, mode)
#define NAMESPACE_BEGIN(A)
static constexpr int DaggerYes
iScalar< iScalar< iScalar< vtype > > > iSinglet
static constexpr int DaggerNo
static constexpr int InverseYes
accelerator_inline void vprefetch(const iScalar< v > &vv)
accelerator_inline void spRecon5m(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProj5m(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spProj5p(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spRecon5p(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
virtual void M5Ddag(const FermionField &psi, FermionField &chi)
virtual void M5D(const FermionField &psi, FermionField &chi)
virtual void MooeeInv(const FermionField &in, FermionField &out)