Grid 0.7.0
A2Autils.h
Go to the documentation of this file.
1#pragma once
2//#include <Grid/Hadrons/Global.hpp>
4
6
7#undef DELTA_F_EQ_2
8
10//Meson
11// Interested in
12//
13// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
14//
15// Conventional meson field:
16//
17// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
18// = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty) (x)|G| v_j(y,ty) >
19// = sum_ij PI_ji(tx) PI_ij(ty)
20//
21// G5-Hermiticity
22//
23// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
24// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
25// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| G g5 sum_i (|v_j(y,ty)> <w_i(x,tx)|)^dag ] -- (*)
26//
27// NB: Dag applies to internal indices spin,colour,complex
28//
29// = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)> <v_i(y,ty)| ]
30// = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)>
31// = sum_ij PionVV(ty) PionWW(tx)
32//
33// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
34// expectation value. Otherwise biased.
36
37template <typename FImpl>
39{
40public:
41 typedef typename FImpl::ComplexField ComplexField;
42 typedef typename FImpl::FermionField FermionField;
43 typedef typename FImpl::PropagatorField PropagatorField;
44
45 typedef typename FImpl::SiteSpinor vobj;
46 typedef typename vobj::scalar_object sobj;
47 typedef typename vobj::scalar_type scalar_type;
48 typedef typename vobj::vector_type vector_type;
49
54
56
57
58 // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
59 template <typename TensorType>
60 static void MesonField(TensorType &mat,
61 const FermionField *lhs_wi,
62 const FermionField *rhs_vj,
63 std::vector<Gamma::Algebra> gammas,
64 const std::vector<ComplexField > &mom,
65 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
66
67 template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
68 static void AslashField(TensorType &mat,
69 const FermionField *lhs_wi,
70 const FermionField *rhs_vj,
71 const std::vector<ComplexField> &emB0,
72 const std::vector<ComplexField> &emB1,
73 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
74
75 template <typename TensorType>
76 typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
77 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
78 void>::type
79 static ContractWWVV(std::vector<PropagatorField> &WWVV,
80 const TensorType &WW_sd,
81 const FermionField *vs,
82 const FermionField *vd);
83
84 template <typename TensorType>
85 typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
86 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
87 void>::type
88 static ContractWWVV(std::vector<PropagatorField> &WWVV,
89 const TensorType &WW_sd,
90 const FermionField *vs,
91 const FermionField *vd);
92
93 static void ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
94 const PropagatorField &WWVV1,
95 const std::vector<Gamma> &gamma0,
96 const std::vector<Gamma> &gamma1,
97 ComplexField &O_trtr,
98 ComplexField &O_fig8);
99
100 static void ContractFourQuarkColourMix(const PropagatorField &WWVV0,
101 const PropagatorField &WWVV1,
102 const std::vector<Gamma> &gamma0,
103 const std::vector<Gamma> &gamma1,
104 ComplexField &O_trtr,
105 ComplexField &O_fig8);
106#ifdef DELTA_F_EQ_2
107 static void DeltaFeq2(int dt_min,int dt_max,
108 Eigen::Tensor<ComplexD,2> &dF2_fig8,
109 Eigen::Tensor<ComplexD,2> &dF2_trtr,
110 Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,
111 Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,
112 Eigen::Tensor<ComplexD,1> &denom_A0,
113 Eigen::Tensor<ComplexD,1> &denom_P,
114 Eigen::Tensor<ComplexD,3> &WW_sd,
115 const FermionField *vs,
116 const FermionField *vd,
117 int orthogdim);
118#endif
119private:
120 inline static void OuterProductWWVV(PropagatorField &WWVV,
121 const vobj &lhs,
122 const vobj &rhs,
123 const int Ns, const int ss);
124};
125
126const int A2Ablocking=8;
127
128template<typename vtype> using iVecSpinMatrix = iVector<iMatrix<iScalar<vtype>, Ns>, A2Ablocking>;
132
133template<typename vtype> using iVecComplex = iVector<iScalar<iScalar<vtype> >, A2Ablocking>;
137
138#define A2A_GPU_KERNELS
139#ifdef A2A_GPU_KERNELS
140template <class FImpl>
141template <typename TensorType>
142void A2Autils<FImpl>::MesonField(TensorType &mat,
143 const FermionField *lhs_wi,
144 const FermionField *rhs_vj,
145 std::vector<Gamma::Algebra> gammas,
146 const std::vector<ComplexField > &mom,
147 int orthogdim, double *t_kernel, double *t_gsum)
148{
149 const int block=A2Ablocking;
150 typedef typename FImpl::SiteSpinor vobj;
151
152 typedef typename vobj::scalar_object sobj;
153 typedef typename vobj::scalar_type scalar_type;
154 typedef typename vobj::vector_type vector_type;
155
156 int Lblock = mat.dimension(3);
157 int Rblock = mat.dimension(4);
158
159 // assert(Lblock % block==0);
160 // assert(Rblock % block==0);
161
162 GridBase *grid = lhs_wi[0].Grid();
163
164 // const int Nd = grid->_ndimension;
165 const int Nsimd = grid->Nsimd();
166
167 int Nt = grid->GlobalDimensions()[orthogdim];
168 int Ngamma = gammas.size();
169 int Nmom = mom.size();
170
171 LatticeVecSpinMatrix SpinMat(grid);
172 LatticeVecSpinMatrix MomSpinMat(grid);
173
174 std::vector<VecSpinMatrix> sliced;
175 for(int i=0;i<Lblock;i++){
176 autoView(SpinMat_v,SpinMat,AcceleratorWrite);
177 autoView(lhs_v,lhs_wi[i],AcceleratorRead);
178 for(int jo=0;jo<Rblock;jo+=block){
179 for(int j=jo;j<MIN(Rblock,jo+block);j++){
180 int jj=j%block;
181 autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
183 // Should write a SpinOuterColorTrace
185
186 accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
187 auto left = conjugate(lhs_v(ss));
188 auto right = rhs_v(ss);
189 auto vv = SpinMat_v(ss);
190 for(int s1=0;s1<Ns;s1++){
191 for(int s2=0;s2<Ns;s2++){
192 vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
193 + left()(s2)(1) * right()(s1)(1)
194 + left()(s2)(2) * right()(s1)(2);
195 }}
196 coalescedWrite(SpinMat_v[ss],vv);
197 });
198
199 }// j within block
200 // After getting the sitewise product do the mom phase loop
201 for(int m=0;m<Nmom;m++){
202
203 MomSpinMat = SpinMat * mom[m];
204
205 sliceSum(MomSpinMat,sliced,orthogdim);
206
207 for(int mu=0;mu<Ngamma;mu++){
208 for(int t=0;t<sliced.size();t++){
209 for(int j=jo;j<MIN(Rblock,jo+block);j++){
210 int jj=j%block;
211 auto tmp = peekIndex<LorentzIndex>(sliced[t],jj);
212 auto trSG = trace(tmp*Gamma(gammas[mu]));
213 mat(m,mu,t,i,j) = trSG()();
214 }
215 }
216 }
217 }
218 }//jo
219 }
220}
221
222// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
223//
224// With:
225//
226// B_0 = A_0 + i A_1
227// B_1 = A_2 + i A_3
228//
229// then in spin space
230//
231// ( 0 0 -conj(B_1) -B_0 )
232// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 )
233// ( B_1 B_0 0 0 )
234// ( conj(B_0) -conj(B_1) 0 0 )
235
236template <class FImpl>
237template <typename TensorType>
238void A2Autils<FImpl>::AslashField(TensorType &mat,
239 const FermionField *lhs_wi,
240 const FermionField *rhs_vj,
241 const std::vector<ComplexField> &emB0,
242 const std::vector<ComplexField> &emB1,
243 int orthogdim, double *t_kernel, double *t_gsum)
244{
245 const int block=A2Ablocking;
246 typedef typename FImpl::SiteSpinor vobj;
247
248 typedef typename vobj::scalar_object sobj;
249 typedef typename vobj::scalar_type scalar_type;
250 typedef typename vobj::vector_type vector_type;
251
252 int Lblock = mat.dimension(3);
253 int Rblock = mat.dimension(4);
254
255 int Nem = emB0.size();
256 assert(emB1.size() == Nem);
257
258 // assert(Lblock % block==0);
259 // assert(Rblock % block==0);
260
261 GridBase *grid = lhs_wi[0].Grid();
262
263 const int Nd = grid->_ndimension;
264 const int Nsimd = grid->Nsimd();
265
266 int Nt = grid->GlobalDimensions()[orthogdim];
267
268 LatticeVecSpinMatrix SpinMat(grid);
269 LatticeVecComplex Aslash(grid);
270 std::vector<VecComplex> sliced;
271
272 for(int i=0;i<Lblock;i++){
273 autoView(SpinMat_v,SpinMat,AcceleratorWrite);
274 autoView(Aslash_v,Aslash,AcceleratorWrite);
275 autoView(lhs_v,lhs_wi[i],AcceleratorRead);
276 for(int jo=0;jo<Rblock;jo+=block){
277 for(int j=jo;j<MIN(Rblock,jo+block);j++){
278 int jj=j%block;
279 autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
281 // Should write a SpinOuterColorTrace
283 accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
284 auto left = conjugate(lhs_v(ss));
285 auto right = rhs_v(ss);
286 auto vv = SpinMat_v(ss);
287 for(int s1=0;s1<Ns;s1++){
288 for(int s2=0;s2<Ns;s2++){
289 vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
290 + left()(s2)(1) * right()(s1)(1)
291 + left()(s2)(2) * right()(s1)(2);
292 }}
293 coalescedWrite(SpinMat_v[ss],vv);
294 });
295 }
296 for(int m=0;m<Nem;m++){
297 autoView(emB0_v,emB0[m],AcceleratorRead);
298 autoView(emB1_v,emB1[m],AcceleratorRead);
299 accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
300 auto vv = SpinMat_v(ss);
301 auto b0 = emB0_v(ss);
302 auto b1 = emB1_v(ss);
303 auto cb0 = conjugate(b0);
304 auto cb1 = conjugate(b1);
305 auto asl = Aslash_v(ss);
306 for(int j=jo;j<MIN(Rblock,jo+block);j++){
307 int jj=j%block;
308 asl(jj)()()=- vv(jj)(3,0)()*b0()()() - vv(jj)(2,0)()*cb1()()()
309 + vv(jj)(3,1)()*b1()()() - vv(jj)(2,1)()*cb0()()()
310 + vv(jj)(0,2)()*b1()()() + vv(jj)(1,2)()*b0()()()
311 + vv(jj)(0,3)()*cb0()()() - vv(jj)(1,3)()*cb1()()();
312
313 }// j within block
314 coalescedWrite(Aslash_v[ss],asl);
315 });
316
317 sliceSum(Aslash,sliced,orthogdim);
318
319 for(int t=0;t<sliced.size();t++){
320 for(int j=jo;j<MIN(Rblock,jo+block);j++){
321 int jj=j%block;
322 mat(m,0,t,i,j) = sliced[t](jj)()();
323 }
324 }
325 }
326 }
327 }
328}
329
330#else
331template <class FImpl>
332template <typename TensorType>
333void A2Autils<FImpl>::MesonField(TensorType &mat,
334 const FermionField *lhs_wi,
335 const FermionField *rhs_vj,
336 std::vector<Gamma::Algebra> gammas,
337 const std::vector<ComplexField > &mom,
338 int orthogdim, double *t_kernel, double *t_gsum)
339{
340 typedef typename FImpl::SiteSpinor vobj;
341
342 typedef typename vobj::scalar_object sobj;
343 typedef typename vobj::scalar_type scalar_type;
344 typedef typename vobj::vector_type vector_type;
345
346 typedef iSpinMatrix<vector_type> SpinMatrix_v;
347 typedef iSpinMatrix<scalar_type> SpinMatrix_s;
348
349 int Lblock = mat.dimension(3);
350 int Rblock = mat.dimension(4);
351
352 GridBase *grid = lhs_wi[0].Grid();
353
354 const int Nd = grid->_ndimension;
355 const int Nsimd = grid->Nsimd();
356
357 int Nt = grid->GlobalDimensions()[orthogdim];
358 int Ngamma = gammas.size();
359 int Nmom = mom.size();
360
361 int fd=grid->_fdimensions[orthogdim];
362 int ld=grid->_ldimensions[orthogdim];
363 int rd=grid->_rdimensions[orthogdim];
364
365 // will locally sum vectors first
366 // sum across these down to scalars
367 // splitting the SIMD
368 int MFrvol = rd*Lblock*Rblock*Nmom;
369 int MFlvol = ld*Lblock*Rblock*Nmom;
370
371 std::vector<SpinMatrix_v > lvSum(MFrvol);
372 for(int r=0;r<MFrvol;r++){
373 lvSum[r] = Zero();
374 }
375
376 std::vector<SpinMatrix_s > lsSum(MFlvol);
377 for(int r=0;r<MFlvol;r++){
378 lsSum[r]=scalar_type(0.0);
379 }
380
381 int e1= grid->_slice_nblock[orthogdim];
382 int e2= grid->_slice_block [orthogdim];
383 int stride=grid->_slice_stride[orthogdim];
384
385 // potentially wasting cores here if local time extent too small
386 if (t_kernel) *t_kernel = -usecond();
387 for(int r=0;r<rd;r++) {
388
389 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
390
391 for(int n=0;n<e1;n++){
392 for(int b=0;b<e2;b++){
393
394 int ss= so+n*stride+b;
395
396 for(int i=0;i<Lblock;i++){
397
398 // Recreate view potentially expensive outside fo UVM mode
399 autoView(lhs_v,lhs_wi[i],CpuRead);
400 auto left = conjugate(lhs_v[ss]);
401 for(int j=0;j<Rblock;j++){
402
403 SpinMatrix_v vv;
404 // Recreate view potentially expensive outside fo UVM mode
405 autoView(rhs_v,rhs_vj[j],CpuRead);
406 auto right = rhs_v[ss];
407 for(int s1=0;s1<Ns;s1++){
408 for(int s2=0;s2<Ns;s2++){
409 vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
410 + left()(s2)(1) * right()(s1)(1)
411 + left()(s2)(2) * right()(s1)(2);
412 }}
413
414 // After getting the sitewise product do the mom phase loop
415 int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
416 for ( int m=0;m<Nmom;m++){
417 int idx = m+base;
418 autoView(mom_v,mom[m],CpuRead);
419 auto phase = mom_v[ss];
420 mac(&lvSum[idx],&vv,&phase);
421 }
422 }
423 }
424 }
425 }
426 };
427
428 // Sum across simd lanes in the plane, breaking out orthog dir.
429 for(int rt=0;rt<rd;rt++){
430
431 Coordinate icoor(Nd);
432 ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
433
434 for(int i=0;i<Lblock;i++){
435 for(int j=0;j<Rblock;j++){
436 for(int m=0;m<Nmom;m++){
437
438 int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
439
440 extract(lvSum[ij_rdx],extracted);
441
442 for(int idx=0;idx<Nsimd;idx++){
443
444 grid->iCoorFromIindex(icoor,idx);
445
446 int ldx = rt+icoor[orthogdim]*rd;
447
448 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
449
450 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
451
452 }
453 }}}
454 }
455 if (t_kernel) *t_kernel += usecond();
456 assert(mat.dimension(0) == Nmom);
457 assert(mat.dimension(1) == Ngamma);
458 assert(mat.dimension(2) == Nt);
459
460 // ld loop and local only??
461 int pd = grid->_processors[orthogdim];
462 int pc = grid->_processor_coor[orthogdim];
463 thread_for_collapse(2,lt,ld,{
464 for(int pt=0;pt<pd;pt++){
465 int t = lt + pt*ld;
466 if (pt == pc){
467 for(int i=0;i<Lblock;i++){
468 for(int j=0;j<Rblock;j++){
469 for(int m=0;m<Nmom;m++){
470 int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
471 for(int mu=0;mu<Ngamma;mu++){
472 // this is a bit slow
473 mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
474 }
475 }
476 }
477 }
478 } else {
479 const scalar_type zz(0.0);
480 for(int i=0;i<Lblock;i++){
481 for(int j=0;j<Rblock;j++){
482 for(int mu=0;mu<Ngamma;mu++){
483 for(int m=0;m<Nmom;m++){
484 mat(m,mu,t,i,j) =zz;
485 }
486 }
487 }
488 }
489 }
490 }
491 });
492
494 // This global sum is taking as much as 50% of time on 16 nodes
495 // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
496 // Healthy size that should suffice
498 if (t_gsum) *t_gsum = -usecond();
499 grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
500 if (t_gsum) *t_gsum += usecond();
501}
502
503template <class FImpl>
504template <typename TensorType>
505void A2Autils<FImpl>::AslashField(TensorType &mat,
506 const FermionField *lhs_wi,
507 const FermionField *rhs_vj,
508 const std::vector<ComplexField> &emB0,
509 const std::vector<ComplexField> &emB1,
510 int orthogdim, double *t_kernel, double *t_gsum)
511{
512 typedef typename FermionField::vector_object vobj;
513 typedef typename vobj::scalar_object sobj;
514 typedef typename vobj::scalar_type scalar_type;
515 typedef typename vobj::vector_type vector_type;
516
517 typedef iSpinMatrix<vector_type> SpinMatrix_v;
518 typedef iSpinMatrix<scalar_type> SpinMatrix_s;
519 typedef iSinglet<vector_type> Singlet_v;
520 typedef iSinglet<scalar_type> Singlet_s;
521
522 int Lblock = mat.dimension(3);
523 int Rblock = mat.dimension(4);
524
525 GridBase *grid = lhs_wi[0].Grid();
526
527 const int Nd = grid->_ndimension;
528 const int Nsimd = grid->Nsimd();
529
530 int Nt = grid->GlobalDimensions()[orthogdim];
531 int Nem = emB0.size();
532 assert(emB1.size() == Nem);
533
534 int fd=grid->_fdimensions[orthogdim];
535 int ld=grid->_ldimensions[orthogdim];
536 int rd=grid->_rdimensions[orthogdim];
537
538 // will locally sum vectors first
539 // sum across these down to scalars
540 // splitting the SIMD
541 int MFrvol = rd*Lblock*Rblock*Nem;
542 int MFlvol = ld*Lblock*Rblock*Nem;
543
544 std::vector<vector_type> lvSum(MFrvol);
545 thread_for(r,MFrvol,
546 {
547 lvSum[r] = Zero();
548 });
549
550 std::vector<scalar_type> lsSum(MFlvol);
551 thread_for(r,MFlvol,
552 {
553 lsSum[r] = scalar_type(0.0);
554 });
555
556 int e1= grid->_slice_nblock[orthogdim];
557 int e2= grid->_slice_block [orthogdim];
558 int stride=grid->_slice_stride[orthogdim];
559
560 // Nested parallelism would be ok
561 // Wasting cores here. Test case r
562 if (t_kernel) *t_kernel = -usecond();
563 for(int r=0;r<rd;r++)
564 {
565 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
566
567 for(int n=0;n<e1;n++)
568 for(int b=0;b<e2;b++)
569 {
570 int ss= so+n*stride+b;
571
572 for(int i=0;i<Lblock;i++)
573 {
574 autoView(wi_v,lhs_wi[i],CpuRead);
575 auto left = conjugate(wi_v[ss]);
576
577 for(int j=0;j<Rblock;j++)
578 {
579 SpinMatrix_v vv;
580 autoView(vj_v,rhs_vj[j],CpuRead);
581 auto right = vj_v[ss];
582
583 for(int s1=0;s1<Ns;s1++)
584 for(int s2=0;s2<Ns;s2++)
585 {
586 vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
587 + left()(s2)(1) * right()(s1)(1)
588 + left()(s2)(2) * right()(s1)(2);
589 }
590
591 // After getting the sitewise product do the mom phase loop
592 int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
593
594 for ( int m=0;m<Nem;m++)
595 {
596 autoView(emB0_v,emB0[m],CpuRead);
597 autoView(emB1_v,emB1[m],CpuRead);
598 int idx = m+base;
599 auto b0 = emB0_v[ss];
600 auto b1 = emB1_v[ss];
601 auto cb0 = conjugate(b0);
602 auto cb1 = conjugate(b1);
603
604 lvSum[idx] += - vv()(3,0)()*b0()()() - vv()(2,0)()*cb1()()()
605 + vv()(3,1)()*b1()()() - vv()(2,1)()*cb0()()()
606 + vv()(0,2)()*b1()()() + vv()(1,2)()*b0()()()
607 + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();
608 }
609 }
610 }
611 }
612 }
613
614 // Sum across simd lanes in the plane, breaking out orthog dir.
615 thread_for(rt,rd,
616 {
617 Coordinate icoor(Nd);
618 ExtractBuffer<scalar_type> extracted(Nsimd);
619
620 for(int i=0;i<Lblock;i++)
621 for(int j=0;j<Rblock;j++)
622 for(int m=0;m<Nem;m++)
623 {
624
625 int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
626
627 extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
628 for(int idx=0;idx<Nsimd;idx++)
629 {
630 grid->iCoorFromIindex(icoor,idx);
631
632 int ldx = rt+icoor[orthogdim]*rd;
633 int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
634
635 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
636 }
637 }
638 });
639 if (t_kernel) *t_kernel += usecond();
640
641 // ld loop and local only??
642 int pd = grid->_processors[orthogdim];
643 int pc = grid->_processor_coor[orthogdim];
644 thread_for_collapse(2,lt,ld,
645 {
646 for(int pt=0;pt<pd;pt++)
647 {
648 int t = lt + pt*ld;
649 if (pt == pc)
650 {
651 for(int i=0;i<Lblock;i++)
652 for(int j=0;j<Rblock;j++)
653 for(int m=0;m<Nem;m++)
654 {
655 int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
656
657 mat(m,0,t,i,j) = lsSum[ij_dx];
658 }
659 }
660 else
661 {
662 const scalar_type zz(0.0);
663
664 for(int i=0;i<Lblock;i++)
665 for(int j=0;j<Rblock;j++)
666 for(int m=0;m<Nem;m++)
667 {
668 mat(m,0,t,i,j) = zz;
669 }
670 }
671 }
672 });
673 if (t_gsum) *t_gsum = -usecond();
674 grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
675 if (t_gsum) *t_gsum += usecond();
676}
677#endif
679// Schematic thoughts about more generalised four quark insertion
680//
681// -- Pupil or fig8 type topology (depending on flavour structure) done below
682// -- Have Bag style -- WWVV VVWW
683// _ _
684// / \/ \
685// \_/\_/
686//
687// - Kpipi style (two pion insertions)
688// K
689// *********
690// Type 1
691// *********
692// x
693// ___ _____ pi(b)
694// K / \/____/
695// \ \
696// \____\pi(a)
697//
698//
699// W^W_sd V_s(x) V^_d(xa) Wa(xa) Va^(x) WW_bb' Vb Vb'(x)
700//
701// (Kww.PIvw) VV ; pi_ww.VV
702//
703// But Norman and Chris say g5 hermiticity not used, except on K
704//
705// Kww PIvw PIvw
706//
707// (W^W_sd PIa_dd') PIb_uu' (v_s v_d' w_u v_u')(x)
708//
709// - Can form (Nmode^3)
710//
711// (Kww . PIvw)_sd and then V_sV_d tensor product contracting this
712//
713// - Can form
714//
715// (PIvw)_uu' W_uV_u' tensor product.
716//
717// Both are lattice propagators.
718//
719// Contract with the same four quark type contraction as BK.
720//
721// *********
722// Type 2
723// *********
724// _ _____ pi
725// K / \/ |
726// \_/\_____|pi
727//
728// Norman/Chris would say
729//
730// (Kww VV)(x) . (PIwv Piwv) VW(x)
731//
732// Full four quark via g5 hermiticity
733//
734// (Kww VV)(x) . (PIww Pivw) VV(x)
735//
736// *********
737// Type 3
738// *********
739// ___ _____ pi
740// K / \/ |
741// \ /\ |
742// \ \/ |
743// \________|pi
744//
745//
746// (V(x) . Kww . pi_vw . pivw . V(x)). WV(x)
747//
748// No difference possible to Norman, Chris, Diaqian
749//
750// *********
751// Type 4
752// *********
753// ___ pi
754// K / \/\ |\
755// \___/\/ |/
756// pi
757//
758// (Kww VV ) WV x Tr(PIwv PIwv)
759//
760// Could use alternate PI_ww PIvv for disco loop interesting comparison
761//
762// *********
763// Primitives / Utilities for assembly
764// *********
765//
766// i) Basic type for meson field - mode indexed object, whether WW, VW, VV etc..
767// ii) Multiply two meson fields (modes^3) ; use BLAS MKL via Eigen
768// iii) Multiply and trace two meson fields ; use BLAS MKL via Eigen. The element wise product trick
769// iv) Contract a meson field (whether WW,VW, VV WV) with W and V fields to form LatticePropagator
770// v) L^3 sum a four quark contaction with two LatticePropagators.
771//
772// Use lambda functions to enable flexibility in v)
773// Use lambda functions to unify the pion field / meson field contraction codes.
775
776
777
779// DeltaF=2 contraction ; use exernal WW field for Kaon, anti Kaon sink
781//
782// WW -- i vectors have adjoint, and j vectors not.
783// -- Think of "i" as the strange quark, forward prop from 0
784// -- Think of "j" as the anti-down quark.
785//
786// WW_sd are w^dag_s w_d
787//
788// Hence VV vectors correspondingly are v^dag_d, v_s from t=0
789// and v^dag_d, v_s from t=dT
790//
791// There is an implicit g5 associated with each v^dag_d from use of g5 Hermiticity.
792// The other gamma_5 lies in the WW external meson operator.
793//
794// From UKhadron wallbag.cc:
795//
796// LatticePropagator anti_d0 = adj( Gamma(G5) * Qd0 * Gamma(G5));
797// LatticePropagator anti_d1 = adj( Gamma(G5) * Qd1 * Gamma(G5));
798//
799// PR1 = Qs0 * Gamma(G5) * anti_d0;
800// PR2 = Qs1 * Gamma(G5) * anti_d1;
801//
802// TR1 = trace( PR1 * G1 );
803// TR2 = trace( PR2 * G2 );
804// Wick1 = TR1 * TR2;
805//
806// Wick2 = trace( PR1* G1 * PR2 * G2 );
807// // was Wick2 = trace( Qs0 * Gamma(G5) * anti_d0 * G1 * Qs1 * Gamma(G5) * anti_d1 * G2 );
808//
809// TR TR(tx) = Wick1 = sum_x WW[t0]_sd < v^_d |g5 G| v_s> WW[t1]_s'd' < v^_d' |g5 G| v_s'> |_{x,tx)
810// = sum_x [ Trace(WW[t0] VgV(t,x) ) x Trace( WW_[t1] VgV(t,x) ) ]
811//
812//
813// Calc all Nt Trace(WW VV) products at once, take Nt^2 products of these.
814//
815// Fig8(tx) = Wick2 = sum_x WW[t0]_sd WW[t1]_s'd' < v^_d |g5 G| v_s'> < v^_d' |g5 G| v_s> |_{x,tx}
816//
817// = sum_x Trace( WW[t0] VV[t,x] WW[t1] VV[t,x] )
818//
820//
821// WW is w_s^dag (x) w_d (G5 implicitly absorbed)
822//
823// WWVV will have spin-col (x) spin-col tensor.
824//
825// Want this to look like a strange fwd prop, anti-d prop.
826//
827// Take WW_sd v^dag_d (x) v_s
828//
829
830template <class FImpl>
831template <typename TensorType>
832typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
833 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
834 void>::type
835A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
836 const TensorType &WW_sd,
837 const FermionField *vs,
838 const FermionField *vd)
839{
840 GridBase *grid = vs[0].Grid();
841
842 // int nd = grid->_ndimension;
843 int Nsimd = grid->Nsimd();
844 int N_t = WW_sd.dimension(0);
845 int N_s = WW_sd.dimension(1);
846 int N_d = WW_sd.dimension(2);
847
848 int d_unroll = 32;// Empirical optimisation
849
850 for(int t=0;t<N_t;t++){
851 WWVV[t] = Zero();
852 }
853
854 thread_for(ss,grid->oSites(),{
855 for(int d_o=0;d_o<N_d;d_o+=d_unroll){
856 for(int t=0;t<N_t;t++){
857 for(int s=0;s<N_s;s++){
858 autoView(vs_v,vs[s],CpuRead);
859 auto tmp1 = vs_v[ss];
860 vobj tmp2 = Zero();
861 vobj tmp3 = Zero();
862 for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
863 autoView(vd_v,vd[d],CpuRead);
864 Scalar_v coeff = WW_sd(t,s,d);
865 tmp3 = conjugate(vd_v[ss]);
866 mac(&tmp2, &coeff, &tmp3);
867 }
868
870 // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
872 OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
873
874 }}
875 }
876 });
877}
878
879template <class FImpl>
880template <typename TensorType>
881typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
882 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
883 void>::type
884A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
885 const TensorType &WW_sd,
886 const FermionField *vs,
887 const FermionField *vd)
888{
889 GridBase *grid = vs[0].Grid();
890
891 int nd = grid->_ndimension;
892 int Nsimd = grid->Nsimd();
893 int N_t = WW_sd.dimensions()[0];
894 int N_s = WW_sd.dimensions()[1];
895 int N_d = WW_sd.dimensions()[2];
896
897 int d_unroll = 32;// Empirical optimisation
898
899 Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> buf;
900
901 for(int t=0;t<N_t;t++){
902 WWVV[t] = Zero();
903 }
904
905 for (int t = 0; t < N_t; t++){
906 buf = WW_sd[t];
907 thread_for(ss,grid->oSites(),{
908 for(int d_o=0;d_o<N_d;d_o+=d_unroll){
909 for(int s=0;s<N_s;s++){
910 autoView(vs_v,vs[s],CpuRead);
911 auto tmp1 = vs_v[ss];
912 vobj tmp2 = Zero();
913 vobj tmp3 = Zero();
914 for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
915 autoView(vd_v,vd[d],CpuRead);
916 Scalar_v coeff = buf(s,d);
917 tmp3 = conjugate(vd_v[ss]);
918 mac(&tmp2, &coeff, &tmp3);
919 }
921 // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
923 OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
924 }}
925 });
926 }
927}
928
929template <class FImpl>
931 const vobj &lhs,
932 const vobj &rhs,
933 const int Ns, const int ss)
934{
935 autoView(WWVV_v,WWVV,CpuWrite);
936 for (int s1 = 0; s1 < Ns; s1++){
937 for (int s2 = 0; s2 < Ns; s2++){
938 WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
939 WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
940 WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
941 WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
942 WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
943 WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
944 WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
945 WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
946 WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
947 }
948 }
949}
950
951template<class FImpl>
953 const PropagatorField &WWVV1,
954 const std::vector<Gamma> &gamma0,
955 const std::vector<Gamma> &gamma1,
956 ComplexField &O_trtr,
957 ComplexField &O_fig8)
958{
959 assert(gamma0.size()==gamma1.size());
960 int Ng = gamma0.size();
961
962 GridBase *grid = WWVV0.Grid();
963
964 autoView(WWVV0_v , WWVV0,CpuRead);
965 autoView(WWVV1_v , WWVV1,CpuRead);
966 autoView(O_trtr_v, O_trtr,CpuWrite);
967 autoView(O_fig8_v, O_fig8,CpuWrite);
968 thread_for(ss,grid->oSites(),{
969
970 typedef typename ComplexField::vector_object vobj;
971
972 vobj v_trtr;
973 vobj v_fig8;
974
975 auto VV0 = WWVV0_v[ss];
976 auto VV1 = WWVV1_v[ss];
977
978 for(int g=0;g<Ng;g++){
979
980 v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
981 v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
982
983 if ( g==0 ) {
984 O_trtr_v[ss] = v_trtr;
985 O_fig8_v[ss] = v_fig8;
986 } else {
987 O_trtr_v[ss]+= v_trtr;
988 O_fig8_v[ss]+= v_fig8;
989 }
990
991 }
992 });
993}
994
995template<class FImpl>
997 const PropagatorField &WWVV1,
998 const std::vector<Gamma> &gamma0,
999 const std::vector<Gamma> &gamma1,
1000 ComplexField &O_trtr,
1001 ComplexField &O_fig8)
1002{
1003 assert(gamma0.size()==gamma1.size());
1004 int Ng = gamma0.size();
1005
1006 GridBase *grid = WWVV0.Grid();
1007
1008 autoView( WWVV0_v , WWVV0,CpuRead);
1009 autoView( WWVV1_v , WWVV1,CpuRead);
1010 autoView( O_trtr_v, O_trtr,CpuWrite);
1011 autoView( O_fig8_v, O_fig8,CpuWrite);
1012
1013 thread_for(ss,grid->oSites(),{
1014
1015 typedef typename ComplexField::vector_object vobj;
1016
1017 auto VV0 = WWVV0_v[ss];
1018 auto VV1 = WWVV1_v[ss];
1019
1020 for(int g=0;g<Ng;g++){
1021
1022 auto VV0G = VV0 * gamma0[g]; // Spin multiply
1023 auto VV1G = VV1 * gamma1[g];
1024
1025 vobj v_trtr=Zero();
1026 vobj v_fig8=Zero();
1027
1029 // Colour mixed
1031 // _ _
1032 // s_sa G_st d_tb s_s'b G_s't' d_t'a
1033 //
1034 //
1035 // Contracted with prop factor (VV0)_sd,ab (VV1)_s'd',ba
1036 //
1037 // Wick1 [ spin TR TR ]
1038 //
1039 // (VV0*G0)_ss,ba . (VV1*G1)_tt,ab
1040 //
1041 // Wick2 [ spin fig8 ]
1042 //
1043 // (VV0*G0)_st,aa (VV1*G1)_ts,bb
1044 //
1046
1047 for(int a=0;a<Nc;a++){
1048 for(int b=0;b<Nc;b++){
1049 for(int s=0;s<Ns;s++){
1050 for(int t=0;t<Ns;t++){
1051 // Mixed traces
1052 v_trtr()()() += VV0G()(s,s)(a,b)*VV1G()(t,t)(b,a); // Was the fig8 before Fierzing
1053 v_fig8()()() += VV0G()(s,t)(a,a)*VV1G()(t,s)(b,b); // Was the trtr before Fierzing
1054
1055 /*
1056 * CHECKS -- use Fierz identities as a strong test, 4 Oct 2018.
1057 *
1058BagMix [8,0] fig8 (21.5596,-3.83908e-17) trtr (0.064326,2.51001e-17) // Fierz -1 0 0 0 0
1059BagMix [8,1] fig8 (-1346.99,1.2481e-16) trtr (34.2501,-3.36935e-17) // 0 0 2 0 0
1060BagMix [8,2] fig8 (13.7536,-6.04625e-19) trtr (-215.542,3.24326e-17) // 0 1/2 0 0 0
1061BagMix [8,3] fig8 (555.878,-7.39942e-17) trtr (463.82,-4.73909e-17) // 0 0 0 1/2 -1/2
1062BagMix [8,4] fig8 (-1602.48,9.08511e-17) trtr (-936.302,1.14156e-16) // 0 0 0 -3/2 -1/2
1063
1064Bag [8,0] fig8 (-0.064326,1.06281e-17) trtr (-21.5596,1.06051e-17)
1065Bag [8,2] fig8 (17.125,-3.40959e-17) trtr (-673.493,7.68134e-17)
1066Bag [8,1] fig8 (-431.084,2.76423e-17) trtr (27.5073,-5.76967e-18) /////////// TR TR FIG8
1067Bag [8,3] fig8 (700.061,-1.14925e-16) trtr (1079.18,-1.35476e-16) // 555.878 = 0.5(1079.18+32.5776) ; 463.82 =0.5(700.061+227.58)
1068Bag [8,4] fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17) // - 1602.48 = - 1.5*1079.18 + .5* 32.5776; 936.302=-1.5* 700+0.5*227
1069 */
1070
1071 //Unmixed debug check consistency
1072 // v_trtr()()() += VV0G()(s,s)(a,a)*VV1G()(t,t)(b,b);
1073 // v_fig8()()() += VV0G()(s,t)(a,b)*VV1G()(t,s)(b,a);
1074 }}}}
1075
1076 if ( g==0 ) {
1077 O_trtr_v[ss] = v_trtr;
1078 O_fig8_v[ss] = v_fig8;
1079 } else {
1080 O_trtr_v[ss]+= v_trtr;
1081 O_fig8_v[ss]+= v_fig8;
1082 }
1083
1084 }
1085 });
1086}
1087
1088
1089
1090#ifdef DELTA_F_EQ_2
1092// Perhaps this should move out of the utils and into Hadrons module
1093// Now makes use of the primitives above and doesn't touch inside
1094// the lattice structures.
1096template<class FImpl>
1097void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
1098 Eigen::Tensor<ComplexD,2> &dF2_fig8,
1099 Eigen::Tensor<ComplexD,2> &dF2_trtr,
1100 Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,
1101 Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,
1102 Eigen::Tensor<ComplexD,1> &denom_A0,
1103 Eigen::Tensor<ComplexD,1> &denom_P,
1104 Eigen::Tensor<ComplexD,3> &WW_sd,
1105 const FermionField *vs,
1106 const FermionField *vd,
1107 int orthogdim)
1108{
1109 GridBase *grid = vs[0].Grid();
1110
1111 LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;
1112
1113 auto G5 = Gamma(Gamma::Algebra::Gamma5);
1114
1115 double nodes = grid->NodeCount();
1116 int nd = grid->_ndimension;
1117 int Nsimd = grid->Nsimd();
1118 int N_t = WW_sd.dimension(0);
1119 int N_s = WW_sd.dimension(1);
1120 int N_d = WW_sd.dimension(2);
1121
1122 assert(grid->GlobalDimensions()[orthogdim] == N_t);
1123 double vol = 1.0;
1124 for(int dim=0;dim<nd;dim++){
1125 vol = vol * grid->GlobalDimensions()[dim];
1126 }
1127
1128 double t_tot = -usecond();
1129 std::vector<PropagatorField> WWVV (N_t,grid);
1130
1131 double t_outer= -usecond();
1132 ContractWWVV(WWVV,WW_sd,&vs[0],&vd[0]);
1133 t_outer+=usecond();
1134
1136 // Implicit gamma-5
1138 for(int t=0;t<N_t;t++){
1139 WWVV[t] = WWVV[t]* G5 ;
1140 }
1141
1143 // Contraction
1145 int Ng=5;
1146 dF2_trtr.resize(N_t,Ng);
1147 dF2_fig8.resize(N_t,Ng);
1148 dF2_trtr_mix.resize(N_t,Ng);
1149 dF2_fig8_mix.resize(N_t,Ng);
1150 denom_A0.resize(N_t);
1151 denom_P.resize(N_t);
1152 for(int t=0;t<N_t;t++){
1153 for(int g=0;g<Ng;g++) dF2_trtr(t,g)= ComplexD(0.0);
1154 for(int g=0;g<Ng;g++) dF2_fig8(t,g)= ComplexD(0.0);
1155 for(int g=0;g<Ng;g++) dF2_trtr_mix(t,g)= ComplexD(0.0);
1156 for(int g=0;g<Ng;g++) dF2_fig8_mix(t,g)= ComplexD(0.0);
1157 denom_A0(t) =ComplexD(0.0);
1158 denom_P(t) =ComplexD(0.0);
1159 }
1160
1161 ComplexField D0(grid); D0 = Zero(); // <P|A0> correlator from each wall
1162 ComplexField D1(grid); D1 = Zero();
1163
1164 ComplexField O1_trtr(grid); O1_trtr = Zero();
1165 ComplexField O2_trtr(grid); O2_trtr = Zero();
1166 ComplexField O3_trtr(grid); O3_trtr = Zero();
1167 ComplexField O4_trtr(grid); O4_trtr = Zero();
1168 ComplexField O5_trtr(grid); O5_trtr = Zero();
1169
1170 ComplexField O1_fig8(grid); O1_fig8 = Zero();
1171 ComplexField O2_fig8(grid); O2_fig8 = Zero();
1172 ComplexField O3_fig8(grid); O3_fig8 = Zero();
1173 ComplexField O4_fig8(grid); O4_fig8 = Zero();
1174 ComplexField O5_fig8(grid); O5_fig8 = Zero();
1175
1176 ComplexField VV_trtr(grid); VV_trtr = Zero();
1177 ComplexField AA_trtr(grid); AA_trtr = Zero();
1178 ComplexField SS_trtr(grid); SS_trtr = Zero();
1179 ComplexField PP_trtr(grid); PP_trtr = Zero();
1180 ComplexField TT_trtr(grid); TT_trtr = Zero();
1181
1182 ComplexField VV_fig8(grid); VV_fig8 = Zero();
1183 ComplexField AA_fig8(grid); AA_fig8 = Zero();
1184 ComplexField SS_fig8(grid); SS_fig8 = Zero();
1185 ComplexField PP_fig8(grid); PP_fig8 = Zero();
1186 ComplexField TT_fig8(grid); TT_fig8 = Zero();
1187
1189 // Used to store appropriate correlation funcs
1191 std::vector<TComplex> C1;
1192 std::vector<TComplex> C2;
1193 std::vector<TComplex> C3;
1194 std::vector<TComplex> C4;
1195 std::vector<TComplex> C5;
1196
1198 // Could do AA, VV, SS, PP, TT and form linear combinations later.
1199 // Almost 2x. but for many modes, the above loop dominates.
1201 double t_contr= -usecond();
1202
1203 // Tr Tr Wick contraction
1204 auto VX = Gamma(Gamma::Algebra::GammaX);
1205 auto VY = Gamma(Gamma::Algebra::GammaY);
1206 auto VZ = Gamma(Gamma::Algebra::GammaZ);
1207 auto VT = Gamma(Gamma::Algebra::GammaT);
1208
1209 auto AX = Gamma(Gamma::Algebra::GammaXGamma5);
1210 auto AY = Gamma(Gamma::Algebra::GammaYGamma5);
1211 auto AZ = Gamma(Gamma::Algebra::GammaZGamma5);
1212 auto AT = Gamma(Gamma::Algebra::GammaTGamma5);
1213
1214 auto S = Gamma(Gamma::Algebra::Identity);
1215 auto P = Gamma(Gamma::Algebra::Gamma5);
1216
1217 auto T0 = Gamma(Gamma::Algebra::SigmaXY);
1218 auto T1 = Gamma(Gamma::Algebra::SigmaXZ);
1219 auto T2 = Gamma(Gamma::Algebra::SigmaXT);
1220 auto T3 = Gamma(Gamma::Algebra::SigmaYZ);
1221 auto T4 = Gamma(Gamma::Algebra::SigmaYT);
1222 auto T5 = Gamma(Gamma::Algebra::SigmaZT);
1223
1224 std::cout <<GridLogMessage << " dt " <<dt_min<<"..." <<dt_max<<std::endl;
1225
1226 for(int t0=0;t0<N_t;t0++){
1227 std::cout <<GridLogMessage << " t0 " <<t0<<std::endl;
1228 // for(int dt=dt_min;dt<dt_max;dt++){
1229 {
1230 int dt = dt_min;
1231 int t1 = (t0+dt)%N_t;
1232
1233 std::cout <<GridLogMessage << " t1 " <<t1<<std::endl;
1234 std::vector<Gamma> VV({VX,VY,VZ,VT});
1235 std::vector<Gamma> AA({AX,AY,AZ,AT});
1236 std::vector<Gamma> SS({S});
1237 std::vector<Gamma> PP({P});
1238 std::vector<Gamma> TT({T0,T1,T2,T3,T4,T5});
1239 std::vector<Gamma> A0({AT});
1240
1241 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],VV,VV,VV_trtr,VV_fig8); // VV
1242 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],AA,AA,AA_trtr,AA_fig8); // AA
1243 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],SS,SS,SS_trtr,SS_fig8); // SS
1244 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],PP,PP,PP_trtr,PP_fig8); // PP
1245 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],TT,TT,TT_trtr,TT_fig8); // TT
1246
1247 O1_trtr = VV_trtr+AA_trtr; O2_trtr = VV_trtr-AA_trtr; // VV+AA,VV-AA
1248 O1_fig8 = VV_fig8+AA_fig8; O2_fig8 = VV_fig8-AA_fig8;
1249
1250 O3_trtr = SS_trtr-PP_trtr; O4_trtr = SS_trtr+PP_trtr; // SS+PP,SS-PP
1251 O3_fig8 = SS_fig8-PP_fig8; O4_fig8 = SS_fig8+PP_fig8;
1252
1253 O5_trtr = TT_trtr;
1254 O5_fig8 = TT_fig8;
1255
1256 sliceSum(O1_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1257 sliceSum(O2_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1258 sliceSum(O3_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1259 sliceSum(O4_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1260 sliceSum(O5_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1261
1262 sliceSum(O1_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1263 sliceSum(O2_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1264 sliceSum(O3_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1265 sliceSum(O4_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1266 sliceSum(O5_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1267
1268 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],A0,A0,AA_trtr,AA_fig8); // A0 insertion
1269
1270 sliceSum(AA_trtr,C1, orthogdim);
1271 sliceSum(PP_trtr,C2, orthogdim);
1272
1273 for(int t=0;t<N_t;t++){
1274 denom_A0(t)+=C1[(t+t0)%N_t]()()()/vol;
1275 denom_P(t) +=C2[(t+t0)%N_t]()()()/vol;
1276 }
1277
1279 // Colour mixed contractions
1281
1282 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],VV,VV,VV_trtr,VV_fig8); // VV
1283 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],AA,AA,AA_trtr,AA_fig8); // AA
1284 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],SS,SS,SS_trtr,SS_fig8); // SS
1285 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],PP,PP,PP_trtr,PP_fig8); // PP
1286 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],TT,TT,TT_trtr,TT_fig8); // TT
1287
1288 O1_trtr = VV_trtr+AA_trtr; O2_trtr = VV_trtr-AA_trtr; // VV+AA,VV-AA
1289 O1_fig8 = VV_fig8+AA_fig8; O2_fig8 = VV_fig8-AA_fig8;
1290
1291 O3_trtr = SS_trtr-PP_trtr; O4_trtr = SS_trtr+PP_trtr; // SS+PP,SS-PP
1292 O3_fig8 = SS_fig8-PP_fig8; O4_fig8 = SS_fig8+PP_fig8;
1293
1294 O5_trtr = TT_trtr;
1295 O5_fig8 = TT_fig8;
1296
1297 sliceSum(O1_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1298 sliceSum(O2_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1299 sliceSum(O3_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1300 sliceSum(O4_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1301 sliceSum(O5_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1302
1303 sliceSum(O1_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1304 sliceSum(O2_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1305 sliceSum(O3_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1306 sliceSum(O4_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1307 sliceSum(O5_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1308
1309 }
1310 }
1311 t_contr +=usecond();
1312
1313 t_tot+=usecond();
1314 double million=1.0e6;
1315 LOG(Message) << "Computing A2A DeltaF=2 graph t_tot " << t_tot /million << " s "<< std::endl;
1316 LOG(Message) << "Computing A2A DeltaF=2 graph t_outer " << t_outer /million << " s "<< std::endl;
1317 LOG(Message) << "Computing A2A DeltaF=2 graph t_contr " << t_contr /million << " s "<< std::endl;
1318}
1319#endif
1320
1321 /*
1322 static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
1323 const FermionField *wi,
1324 const FermionField *vj,
1325 const std::vector<ComplexField > &mom,
1326 int orthogdim);
1327
1328 static void PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
1329 const FermionField *wi,
1330 const FermionField *vj,
1331 int orthogdim,
1332 int g5);
1333
1334 static void PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
1335 const FermionField *wi,
1336 const FermionField *vj,
1337 int orthogdim);
1338 static void PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
1339 const FermionField *wi,
1340 const FermionField *wj,
1341 int orthogdim);
1342 static void PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
1343 const FermionField *vi,
1344 const FermionField *vj,
1345 int orthogdim);
1346 */
1347
1348/*
1349
1350template <class FImpl>
1351template <typename TensorType>
1352void A2Autils<FImpl>::MesonField(TensorType &mat,
1353 const FermionField *lhs_wi,
1354 const FermionField *rhs_vj,
1355 std::vector<Gamma::Algebra> gammas,
1356 const std::vector<ComplexField > &mom,
1357 int orthogdim, double *t_kernel, double *t_gsum)
1358{
1359 typedef typename FImpl::SiteSpinor vobj;
1360
1361 typedef typename vobj::scalar_object sobj;
1362 typedef typename vobj::scalar_type scalar_type;
1363 typedef typename vobj::vector_type vector_type;
1364
1365 typedef iSpinMatrix<vector_type> SpinMatrix_v;
1366 typedef iSpinMatrix<scalar_type> SpinMatrix_s;
1367
1368 int Lblock = mat.dimension(3);
1369 int Rblock = mat.dimension(4);
1370
1371 GridBase *grid = lhs_wi[0].Grid();
1372
1373 const int Nd = grid->_ndimension;
1374 const int Nsimd = grid->Nsimd();
1375
1376 int Nt = grid->GlobalDimensions()[orthogdim];
1377 int Ngamma = gammas.size();
1378 int Nmom = mom.size();
1379
1380 int fd=grid->_fdimensions[orthogdim];
1381 int ld=grid->_ldimensions[orthogdim];
1382 int rd=grid->_rdimensions[orthogdim];
1383
1384 // will locally sum vectors first
1385 // sum across these down to scalars
1386 // splitting the SIMD
1387 int MFrvol = rd*Lblock*Rblock*Nmom;
1388 int MFlvol = ld*Lblock*Rblock*Nmom;
1389
1390 std::vector<SpinMatrix_v > lvSum(MFrvol);
1391 for(int r=0;r<MFrvol;r++){
1392 lvSum[r] = Zero();
1393 }
1394
1395 std::vector<SpinMatrix_s > lsSum(MFlvol);
1396 for(int r=0;r<MFlvol;r++){
1397 lsSum[r]=scalar_type(0.0);
1398 }
1399
1400 int e1= grid->_slice_nblock[orthogdim];
1401 int e2= grid->_slice_block [orthogdim];
1402 int stride=grid->_slice_stride[orthogdim];
1403
1404 // potentially wasting cores here if local time extent too small
1405 if (t_kernel) *t_kernel = -usecond();
1406 for(int r=0;r<rd;r++) {
1407
1408 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
1409
1410 for(int n=0;n<e1;n++){
1411 for(int b=0;b<e2;b++){
1412
1413 int ss= so+n*stride+b;
1414
1415 for(int i=0;i<Lblock;i++){
1416
1417 // Recreate view potentially expensive outside fo UVM mode
1418 autoView(lhs_v,lhs_wi[i],CpuRead);
1419 auto left = conjugate(lhs_v[ss]);
1420 for(int j=0;j<Rblock;j++){
1421
1422 SpinMatrix_v vv;
1423 // Recreate view potentially expensive outside fo UVM mode
1424 autoView(rhs_v,rhs_vj[j],CpuRead);
1425 auto right = rhs_v[ss];
1426 for(int s1=0;s1<Ns;s1++){
1427 for(int s2=0;s2<Ns;s2++){
1428 vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
1429 + left()(s2)(1) * right()(s1)(1)
1430 + left()(s2)(2) * right()(s1)(2);
1431 }}
1432
1433 // After getting the sitewise product do the mom phase loop
1434 int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
1435 for ( int m=0;m<Nmom;m++){
1436 int idx = m+base;
1437 autoView(mom_v,mom[m],CpuRead);
1438 auto phase = mom_v[ss];
1439 mac(&lvSum[idx],&vv,&phase);
1440 }
1441 }
1442 }
1443 }
1444 }
1445 };
1446
1447 // Sum across simd lanes in the plane, breaking out orthog dir.
1448 for(int rt=0;rt<rd;rt++){
1449
1450 Coordinate icoor(Nd);
1451 ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
1452
1453 for(int i=0;i<Lblock;i++){
1454 for(int j=0;j<Rblock;j++){
1455 for(int m=0;m<Nmom;m++){
1456
1457 int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
1458
1459 extract(lvSum[ij_rdx],extracted);
1460
1461 for(int idx=0;idx<Nsimd;idx++){
1462
1463 grid->iCoorFromIindex(icoor,idx);
1464
1465 int ldx = rt+icoor[orthogdim]*rd;
1466
1467 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
1468
1469 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
1470
1471 }
1472 }}}
1473 }
1474 if (t_kernel) *t_kernel += usecond();
1475 assert(mat.dimension(0) == Nmom);
1476 assert(mat.dimension(1) == Ngamma);
1477 assert(mat.dimension(2) == Nt);
1478
1479 // ld loop and local only??
1480 int pd = grid->_processors[orthogdim];
1481 int pc = grid->_processor_coor[orthogdim];
1482 thread_for_collapse(2,lt,ld,{
1483 for(int pt=0;pt<pd;pt++){
1484 int t = lt + pt*ld;
1485 if (pt == pc){
1486 for(int i=0;i<Lblock;i++){
1487 for(int j=0;j<Rblock;j++){
1488 for(int m=0;m<Nmom;m++){
1489 int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
1490 for(int mu=0;mu<Ngamma;mu++){
1491 // this is a bit slow
1492 mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
1493 }
1494 }
1495 }
1496 }
1497 } else {
1498 const scalar_type zz(0.0);
1499 for(int i=0;i<Lblock;i++){
1500 for(int j=0;j<Rblock;j++){
1501 for(int mu=0;mu<Ngamma;mu++){
1502 for(int m=0;m<Nmom;m++){
1503 mat(m,mu,t,i,j) =zz;
1504 }
1505 }
1506 }
1507 }
1508 }
1509 }
1510 });
1511
1513 // This global sum is taking as much as 50% of time on 16 nodes
1514 // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
1515 // Healthy size that should suffice
1517 if (t_gsum) *t_gsum = -usecond();
1518 grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
1519 if (t_gsum) *t_gsum += usecond();
1520}
1521
1522template<class FImpl>
1523void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
1524 const FermionField *wi,
1525 const FermionField *vj,
1526 int orthogdim,
1527 int g5)
1528{
1529 int Lblock = mat.dimension(1);
1530 int Rblock = mat.dimension(2);
1531
1532 GridBase *grid = wi[0].Grid();
1533
1534 const int nd = grid->_ndimension;
1535 const int Nsimd = grid->Nsimd();
1536
1537 int Nt = grid->GlobalDimensions()[orthogdim];
1538
1539 int fd=grid->_fdimensions[orthogdim];
1540 int ld=grid->_ldimensions[orthogdim];
1541 int rd=grid->_rdimensions[orthogdim];
1542
1543 // will locally sum vectors first
1544 // sum across these down to scalars
1545 // splitting the SIMD
1546 int MFrvol = rd*Lblock*Rblock;
1547 int MFlvol = ld*Lblock*Rblock;
1548
1549 std::vector<vector_type > lvSum(MFrvol);
1550 thread_for(r,MFrvol,{
1551 lvSum[r] = Zero();
1552 });
1553
1554 std::vector<scalar_type > lsSum(MFlvol);
1555 thread_for(r,MFlvol,{
1556 lsSum[r]=scalar_type(0.0);
1557 });
1558
1559 int e1= grid->_slice_nblock[orthogdim];
1560 int e2= grid->_slice_block [orthogdim];
1561 int stride=grid->_slice_stride[orthogdim];
1562
1563 thread_for(r,rd,{
1564
1565 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
1566
1567 for(int n=0;n<e1;n++){
1568 for(int b=0;b<e2;b++){
1569
1570 int ss= so+n*stride+b;
1571
1572 for(int i=0;i<Lblock;i++){
1573
1574 autoView(wi_v,wi[i],CpuRead);
1575 auto w = conjugate(wi_v[ss]);
1576 if (g5) {
1577 w()(2)(0) = - w()(2)(0);
1578 w()(2)(1) = - w()(2)(1);
1579 w()(2)(2) = - w()(2)(2);
1580 w()(3)(0) = - w()(3)(0);
1581 w()(3)(1) = - w()(3)(1);
1582 w()(3)(2) = - w()(3)(2);
1583 }
1584 for(int j=0;j<Rblock;j++){
1585
1586 autoView(vj_v,vj[j],CpuRead);
1587 auto v = vj_v[ss];
1588 auto vv = v()(0)(0);
1589
1590 vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
1591 + w()(0)(1) * v()(0)(1)
1592 + w()(0)(2) * v()(0)(2)
1593 + w()(1)(0) * v()(1)(0)
1594 + w()(1)(1) * v()(1)(1)
1595 + w()(1)(2) * v()(1)(2)
1596 + w()(2)(0) * v()(2)(0)
1597 + w()(2)(1) * v()(2)(1)
1598 + w()(2)(2) * v()(2)(2)
1599 + w()(3)(0) * v()(3)(0)
1600 + w()(3)(1) * v()(3)(1)
1601 + w()(3)(2) * v()(3)(2);
1602
1603 int idx = i+Lblock*j+Lblock*Rblock*r;
1604 lvSum[idx] = lvSum[idx]+vv;
1605 }
1606 }
1607 }
1608 }
1609 });
1610
1611 // Sum across simd lanes in the plane, breaking out orthog dir.
1612 thread_for(rt,rd,{
1613
1614 Coordinate icoor(nd);
1615 iScalar<vector_type> temp;
1616 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);
1617
1618 for(int i=0;i<Lblock;i++){
1619 for(int j=0;j<Rblock;j++){
1620
1621 int ij_rdx = i+Lblock*j+Lblock*Rblock*rt;
1622
1623 temp._internal =lvSum[ij_rdx];
1624 extract(temp,extracted);
1625
1626 for(int idx=0;idx<Nsimd;idx++){
1627
1628 grid->iCoorFromIindex(icoor,idx);
1629
1630 int ldx = rt+icoor[orthogdim]*rd;
1631
1632 int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx;
1633
1634 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
1635
1636 }
1637 }}
1638 });
1639
1640 assert(mat.dimension(0) == Nt);
1641 // ld loop and local only??
1642 int pd = grid->_processors[orthogdim];
1643 int pc = grid->_processor_coor[orthogdim];
1644 thread_for_collapse(2,lt,ld,{
1645 for(int pt=0;pt<pd;pt++){
1646 int t = lt + pt*ld;
1647 if (pt == pc){
1648 for(int i=0;i<Lblock;i++){
1649 for(int j=0;j<Rblock;j++){
1650 int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
1651 mat(t,i,j) = lsSum[ij_dx];
1652 }
1653 }
1654 } else {
1655 const scalar_type zz(0.0);
1656 for(int i=0;i<Lblock;i++){
1657 for(int j=0;j<Rblock;j++){
1658 mat(t,i,j) =zz;
1659 }
1660 }
1661 }
1662 }
1663 });
1664
1665 grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock);
1666}
1667
1668template<class FImpl>
1669void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
1670 const FermionField *wi,
1671 const FermionField *vj,
1672 const std::vector<ComplexField > &mom,
1673 int orthogdim)
1674{
1675 int Lblock = mat.dimension(2);
1676 int Rblock = mat.dimension(3);
1677
1678 GridBase *grid = wi[0].Grid();
1679
1680 const int nd = grid->_ndimension;
1681 const int Nsimd = grid->Nsimd();
1682
1683 int Nt = grid->GlobalDimensions()[orthogdim];
1684 int Nmom = mom.size();
1685
1686 int fd=grid->_fdimensions[orthogdim];
1687 int ld=grid->_ldimensions[orthogdim];
1688 int rd=grid->_rdimensions[orthogdim];
1689
1690 // will locally sum vectors first
1691 // sum across these down to scalars
1692 // splitting the SIMD
1693 int MFrvol = rd*Lblock*Rblock*Nmom;
1694 int MFlvol = ld*Lblock*Rblock*Nmom;
1695
1696 std::vector<vector_type > lvSum(MFrvol);
1697 thread_for(r,MFrvol,{
1698 lvSum[r] = Zero();
1699 });
1700
1701 std::vector<scalar_type > lsSum(MFlvol);
1702 thread_for(r,MFlvol,{
1703 lsSum[r]=scalar_type(0.0);
1704 });
1705
1706 int e1= grid->_slice_nblock[orthogdim];
1707 int e2= grid->_slice_block [orthogdim];
1708 int stride=grid->_slice_stride[orthogdim];
1709
1710 thread_for(r,rd,{
1711
1712 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
1713
1714 for(int n=0;n<e1;n++){
1715 for(int b=0;b<e2;b++){
1716
1717 int ss= so+n*stride+b;
1718
1719 for(int i=0;i<Lblock;i++){
1720
1721 autoView(wi_v,wi[i],CpuRead);
1722 auto w = conjugate(wi_v[ss]);
1723
1724 for(int j=0;j<Rblock;j++){
1725
1726 autoView(vj_v,vj[j],CpuRead);
1727 auto v = vj_v[ss];
1728
1729 auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
1730 + w()(0)(1) * v()(0)(1)
1731 + w()(0)(2) * v()(0)(2)
1732 + w()(1)(0) * v()(1)(0)
1733 + w()(1)(1) * v()(1)(1)
1734 + w()(1)(2) * v()(1)(2)
1735 - w()(2)(0) * v()(2)(0)
1736 - w()(2)(1) * v()(2)(1)
1737 - w()(2)(2) * v()(2)(2)
1738 - w()(3)(0) * v()(3)(0)
1739 - w()(3)(1) * v()(3)(1)
1740 - w()(3)(2) * v()(3)(2);
1741
1742
1743 // After getting the sitewise product do the mom phase loop
1744 int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
1745 for ( int m=0;m<Nmom;m++){
1746 int idx = m+base;
1747 autoView(mom_v,mom[m],CpuRead);
1748 auto phase = mom_v[ss];
1749 mac(&lvSum[idx],&vv,&phase()()());
1750 }
1751 }
1752 }
1753 }
1754 }
1755 });
1756
1757
1758 // Sum across simd lanes in the plane, breaking out orthog dir.
1759 thread_for(rt,rd,{
1760
1761 Coordinate icoor(nd);
1762 iScalar<vector_type> temp;
1763 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);
1764
1765 for(int i=0;i<Lblock;i++){
1766 for(int j=0;j<Rblock;j++){
1767 for(int m=0;m<Nmom;m++){
1768
1769 int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
1770
1771 temp._internal = lvSum[ij_rdx];
1772 extract(temp,extracted);
1773
1774 for(int idx=0;idx<Nsimd;idx++){
1775
1776 grid->iCoorFromIindex(icoor,idx);
1777
1778 int ldx = rt+icoor[orthogdim]*rd;
1779
1780 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
1781
1782 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
1783
1784 }
1785 }}}
1786 });
1787
1788 assert(mat.dimension(0) == Nmom);
1789 assert(mat.dimension(1) == Nt);
1790
1791 int pd = grid->_processors[orthogdim];
1792 int pc = grid->_processor_coor[orthogdim];
1793 thread_for_collapse(2,lt,ld,{
1794 for(int pt=0;pt<pd;pt++){
1795 int t = lt + pt*ld;
1796 if (pt == pc){
1797 for(int i=0;i<Lblock;i++){
1798 for(int j=0;j<Rblock;j++){
1799 for(int m=0;m<Nmom;m++){
1800 int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
1801 mat(m,t,i,j) = lsSum[ij_dx];
1802 }
1803 }
1804 }
1805 } else {
1806 const scalar_type zz(0.0);
1807 for(int i=0;i<Lblock;i++){
1808 for(int j=0;j<Rblock;j++){
1809 for(int m=0;m<Nmom;m++){
1810 mat(m,t,i,j) =zz;
1811 }
1812 }
1813 }
1814 }
1815 }
1816 });
1817
1818 grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock);
1819}
1820
1821template<class FImpl>
1822void A2Autils<FImpl>::PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
1823 const FermionField *wi,
1824 const FermionField *vj,
1825 int orthogdim)
1826{
1827 const int g5=1;
1828 PionFieldXX(mat,wi,vj,orthogdim,g5);
1829}
1830template<class FImpl>
1831void A2Autils<FImpl>::PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
1832 const FermionField *wi,
1833 const FermionField *wj,
1834 int orthogdim)
1835{
1836 const int nog5=0;
1837 PionFieldXX(mat,wi,wj,orthogdim,nog5);
1838}
1839template<class FImpl>
1840void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
1841 const FermionField *vi,
1842 const FermionField *vj,
1843 int orthogdim)
1844{
1845 const int nog5=0;
1846 PionFieldXX(mat,vi,vj,orthogdim,nog5);
1847}
1848*/
1849
1851
Lattice< vVecSpinMatrix > LatticeVecSpinMatrix
Definition A2Autils.h:131
iVector< iMatrix< iScalar< vtype >, Ns >, A2Ablocking > iVecSpinMatrix
Definition A2Autils.h:128
iVecComplex< vComplex > vVecComplex
Definition A2Autils.h:135
iVecComplex< Complex > VecComplex
Definition A2Autils.h:134
Lattice< vVecComplex > LatticeVecComplex
Definition A2Autils.h:136
iVecSpinMatrix< vComplex > vVecSpinMatrix
Definition A2Autils.h:130
iVector< iScalar< iScalar< vtype > >, A2Ablocking > iVecComplex
Definition A2Autils.h:133
iVecSpinMatrix< Complex > VecSpinMatrix
Definition A2Autils.h:129
const int A2Ablocking
Definition A2Autils.h:126
#define accelerator_for(iterator, num, nsimd,...)
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
accelerator_inline Grid_simd2< S, V > trace(const Grid_simd2< S, V > &arg)
void mac(Lattice< obj1 > &ret, const Lattice< obj2 > &lhs, const Lattice< obj3 > &rhs)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
void sliceSum(const Lattice< vobj > &Data, std::vector< typename vobj::scalar_object > &result, int orthogdim)
#define autoView(l_v, l, mode)
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL")
@ AcceleratorRead
@ CpuRead
@ AcceleratorWrite
@ CpuWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
iScalar< iMatrix< iMatrix< vtype, Nc >, Ns > > iSpinColourMatrix
Definition QCD.h:105
static constexpr int Ns
Definition QCD.h:51
static constexpr int Nd
Definition QCD.h:52
iScalar< iScalar< iScalar< vtype > > > iSinglet
Definition QCD.h:102
iScalar< iMatrix< iScalar< vtype >, Ns > > iSpinMatrix
Definition QCD.h:103
std::complex< RealD > ComplexD
Definition Simd.h:79
std::complex< Real > Complex
Definition Simd.h:80
#define T2
#define T3
#define T1
#define T0
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
AcceleratorVector< __T,GRID_MAX_SIMD > ExtractBuffer
accelerator void extract(const vobj &vec, ExtractBuffer< sobj > &extracted)
accelerator_inline auto peekIndex(const vtype &arg, int i) -> RemoveCRV(TensorIndexRecursion< Level >::peekIndex(arg, 0))
#define thread_for_collapse(N, i, num,...)
Definition Threads.h:70
#define thread_for(i, num,...)
Definition Threads.h:60
double usecond(void)
Definition Timer.h:50
uint64_t base
#define MIN(a, b)
Definition Zolotarev.cc:23
iSpinMatrix< scalar_type > SpinMatrix_s
Definition A2Autils.h:51
FImpl::PropagatorField PropagatorField
Definition A2Autils.h:43
static void ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0, const PropagatorField &WWVV1, const std::vector< Gamma > &gamma0, const std::vector< Gamma > &gamma1, ComplexField &O_trtr, ComplexField &O_fig8)
Definition A2Autils.h:952
iSpinMatrix< vector_type > SpinMatrix_v
Definition A2Autils.h:50
vobj::scalar_object sobj
Definition A2Autils.h:46
iSinglet< scalar_type > Scalar_s
Definition A2Autils.h:53
vobj::vector_type vector_type
Definition A2Autils.h:48
FImpl::SiteSpinor vobj
Definition A2Autils.h:45
iSinglet< vector_type > Scalar_v
Definition A2Autils.h:52
static void OuterProductWWVV(PropagatorField &WWVV, const vobj &lhs, const vobj &rhs, const int Ns, const int ss)
Definition A2Autils.h:930
static std::enable_if<(std::is_same< Eigen::Tensor< ComplexD, 3 >, TensorType >::value||std::is_same< Eigen::TensorMap< Eigen::Tensor< Complex, 3, Eigen::RowMajor > >, TensorType >::value), void >::type ContractWWVV(std::vector< PropagatorField > &WWVV, const TensorType &WW_sd, const FermionField *vs, const FermionField *vd)
Definition A2Autils.h:835
static std::enable_if<!(std::is_same< Eigen::Tensor< ComplexD, 3 >, TensorType >::value||std::is_same< Eigen::TensorMap< Eigen::Tensor< Complex, 3, Eigen::RowMajor > >, TensorType >::value), void >::type ContractWWVV(std::vector< PropagatorField > &WWVV, const TensorType &WW_sd, const FermionField *vs, const FermionField *vd)
static void AslashField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, const std::vector< ComplexField > &emB0, const std::vector< ComplexField > &emB1, int orthogdim, double *t_kernel=nullptr, double *t_gsum=nullptr)
Definition A2Autils.h:238
FImpl::FermionField FermionField
Definition A2Autils.h:42
FImpl::ComplexField ComplexField
Definition A2Autils.h:41
iSpinColourMatrix< vector_type > SpinColourMatrix_v
Definition A2Autils.h:55
static void ContractFourQuarkColourMix(const PropagatorField &WWVV0, const PropagatorField &WWVV1, const std::vector< Gamma > &gamma0, const std::vector< Gamma > &gamma1, ComplexField &O_trtr, ComplexField &O_fig8)
Definition A2Autils.h:996
static void MesonField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, std::vector< Gamma::Algebra > gammas, const std::vector< ComplexField > &mom, int orthogdim, double *t_kernel=nullptr, double *t_gsum=nullptr)
Definition A2Autils.h:142
vobj::scalar_type scalar_type
Definition A2Autils.h:47
accelerator_inline size_type size(void) const
Definition Coordinate.h:52
void GlobalSumVector(RealF *, int N)
Definition Gamma.h:10
const Coordinate & GlobalDimensions(void)
Coordinate _slice_stride
Coordinate _fdimensions
Coordinate _slice_nblock
Coordinate _slice_block
int oSites(void) const
Coordinate _rdimensions
Coordinate _ostride
Coordinate _ldimensions
void iCoorFromIindex(Coordinate &coor, int lane)
int Nsimd(void) const
int NodeCount(void)
Definition Simd.h:194