37template <
typename FImpl>
45 typedef typename FImpl::SiteSpinor
vobj;
46 typedef typename vobj::scalar_object
sobj;
59 template <
typename TensorType>
63 std::vector<Gamma::Algebra> gammas,
64 const std::vector<ComplexField > &mom,
65 int orthogdim,
double *t_kernel =
nullptr,
double *t_gsum =
nullptr);
67 template <
typename TensorType>
71 const std::vector<ComplexField> &emB0,
72 const std::vector<ComplexField> &emB1,
73 int orthogdim,
double *t_kernel =
nullptr,
double *t_gsum =
nullptr);
75 template <
typename TensorType>
76 typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
77 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
80 const TensorType &WW_sd,
84 template <
typename TensorType>
85 typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
86 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
89 const TensorType &WW_sd,
95 const std::vector<Gamma> &gamma0,
96 const std::vector<Gamma> &gamma1,
102 const std::vector<Gamma> &gamma0,
103 const std::vector<Gamma> &gamma1,
107 static void DeltaFeq2(
int dt_min,
int dt_max,
108 Eigen::Tensor<ComplexD,2> &dF2_fig8,
109 Eigen::Tensor<ComplexD,2> &dF2_trtr,
110 Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,
111 Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,
112 Eigen::Tensor<ComplexD,1> &denom_A0,
113 Eigen::Tensor<ComplexD,1> &denom_P,
114 Eigen::Tensor<ComplexD,3> &WW_sd,
123 const int Ns,
const int ss);
138#define A2A_GPU_KERNELS
139#ifdef A2A_GPU_KERNELS
140template <
class FImpl>
141template <
typename TensorType>
145 std::vector<Gamma::Algebra> gammas,
146 const std::vector<ComplexField > &mom,
147 int orthogdim,
double *t_kernel,
double *t_gsum)
150 typedef typename FImpl::SiteSpinor
vobj;
152 typedef typename vobj::scalar_object
sobj;
156 int Lblock = mat.dimension(3);
157 int Rblock = mat.dimension(4);
165 const int Nsimd = grid->
Nsimd();
168 int Ngamma = gammas.
size();
169 int Nmom = mom.size();
174 std::vector<VecSpinMatrix> sliced;
175 for(
int i=0;i<Lblock;i++){
178 for(
int jo=0;jo<Rblock;jo+=block){
179 for(
int j=jo;j<
MIN(Rblock,jo+block);j++){
187 auto left = conjugate(lhs_v(ss));
188 auto right = rhs_v(ss);
189 auto vv = SpinMat_v(ss);
190 for(int s1=0;s1<Ns;s1++){
191 for(int s2=0;s2<Ns;s2++){
192 vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
193 + left()(s2)(1) * right()(s1)(1)
194 + left()(s2)(2) * right()(s1)(2);
201 for(
int m=0;m<Nmom;m++){
203 MomSpinMat = SpinMat * mom[m];
205 sliceSum(MomSpinMat,sliced,orthogdim);
207 for(
int mu=0;mu<Ngamma;mu++){
208 for(
int t=0;t<sliced.size();t++){
209 for(
int j=jo;j<
MIN(Rblock,jo+block);j++){
213 mat(m,mu,t,i,j) = trSG()();
236template <
class FImpl>
237template <
typename TensorType>
241 const std::vector<ComplexField> &emB0,
242 const std::vector<ComplexField> &emB1,
243 int orthogdim,
double *t_kernel,
double *t_gsum)
246 typedef typename FImpl::SiteSpinor
vobj;
248 typedef typename vobj::scalar_object
sobj;
252 int Lblock = mat.dimension(3);
253 int Rblock = mat.dimension(4);
255 int Nem = emB0.size();
256 assert(emB1.size() == Nem);
264 const int Nsimd = grid->
Nsimd();
270 std::vector<VecComplex> sliced;
272 for(
int i=0;i<Lblock;i++){
276 for(
int jo=0;jo<Rblock;jo+=block){
277 for(
int j=jo;j<
MIN(Rblock,jo+block);j++){
284 auto left = conjugate(lhs_v(ss));
285 auto right = rhs_v(ss);
286 auto vv = SpinMat_v(ss);
287 for(int s1=0;s1<Ns;s1++){
288 for(int s2=0;s2<Ns;s2++){
289 vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
290 + left()(s2)(1) * right()(s1)(1)
291 + left()(s2)(2) * right()(s1)(2);
296 for(
int m=0;m<Nem;m++){
300 auto vv = SpinMat_v(ss);
301 auto b0 = emB0_v(ss);
302 auto b1 = emB1_v(ss);
303 auto cb0 = conjugate(b0);
304 auto cb1 = conjugate(b1);
305 auto asl = Aslash_v(ss);
306 for(int j=jo;j<MIN(Rblock,jo+block);j++){
308 asl(jj)()()=- vv(jj)(3,0)()*b0()()() - vv(jj)(2,0)()*cb1()()()
309 + vv(jj)(3,1)()*b1()()() - vv(jj)(2,1)()*cb0()()()
310 + vv(jj)(0,2)()*b1()()() + vv(jj)(1,2)()*b0()()()
311 + vv(jj)(0,3)()*cb0()()() - vv(jj)(1,3)()*cb1()()();
319 for(
int t=0;t<sliced.size();t++){
320 for(
int j=jo;j<
MIN(Rblock,jo+block);j++){
322 mat(m,0,t,i,j) = sliced[t](jj)()();
331template <
class FImpl>
332template <
typename TensorType>
334 const FermionField *lhs_wi,
335 const FermionField *rhs_vj,
336 std::vector<Gamma::Algebra> gammas,
337 const std::vector<ComplexField > &mom,
338 int orthogdim,
double *t_kernel,
double *t_gsum)
340 typedef typename FImpl::SiteSpinor vobj;
342 typedef typename vobj::scalar_object sobj;
344 typedef typename vobj::vector_type vector_type;
349 int Lblock = mat.dimension(3);
350 int Rblock = mat.dimension(4);
355 const int Nsimd = grid->
Nsimd();
358 int Ngamma = gammas.
size();
359 int Nmom = mom.size();
368 int MFrvol = rd*Lblock*Rblock*Nmom;
369 int MFlvol = ld*Lblock*Rblock*Nmom;
371 std::vector<SpinMatrix_v > lvSum(MFrvol);
372 for(
int r=0;r<MFrvol;r++){
376 std::vector<SpinMatrix_s > lsSum(MFlvol);
377 for(
int r=0;r<MFlvol;r++){
386 if (t_kernel) *t_kernel = -
usecond();
387 for(
int r=0;r<rd;r++) {
391 for(
int n=0;n<e1;n++){
392 for(
int b=0;b<e2;b++){
394 int ss= so+n*stride+b;
396 for(
int i=0;i<Lblock;i++){
401 for(
int j=0;j<Rblock;j++){
406 auto right = rhs_v[ss];
407 for(
int s1=0;s1<
Ns;s1++){
408 for(
int s2=0;s2<
Ns;s2++){
409 vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
410 + left()(s2)(1) * right()(s1)(1)
411 + left()(s2)(2) * right()(s1)(2);
415 int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
416 for (
int m=0;m<Nmom;m++){
419 auto phase = mom_v[ss];
420 mac(&lvSum[idx],&vv,&phase);
429 for(
int rt=0;rt<rd;rt++){
434 for(
int i=0;i<Lblock;i++){
435 for(
int j=0;j<Rblock;j++){
436 for(
int m=0;m<Nmom;m++){
438 int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
440 extract(lvSum[ij_rdx],extracted);
442 for(
int idx=0;idx<Nsimd;idx++){
446 int ldx = rt+icoor[orthogdim]*rd;
448 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
450 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
455 if (t_kernel) *t_kernel +=
usecond();
456 assert(mat.dimension(0) == Nmom);
457 assert(mat.dimension(1) == Ngamma);
458 assert(mat.dimension(2) == Nt);
464 for(
int pt=0;pt<pd;pt++){
467 for(
int i=0;i<Lblock;i++){
468 for(
int j=0;j<Rblock;j++){
469 for(
int m=0;m<Nmom;m++){
470 int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
471 for(
int mu=0;mu<Ngamma;mu++){
473 mat(m,mu,t,i,j) =
trace(lsSum[ij_dx]*
Gamma(gammas[mu]))()()();
480 for(
int i=0;i<Lblock;i++){
481 for(
int j=0;j<Rblock;j++){
482 for(
int mu=0;mu<Ngamma;mu++){
483 for(
int m=0;m<Nmom;m++){
498 if (t_gsum) *t_gsum = -
usecond();
500 if (t_gsum) *t_gsum +=
usecond();
503template <
class FImpl>
504template <
typename TensorType>
506 const FermionField *lhs_wi,
507 const FermionField *rhs_vj,
508 const std::vector<ComplexField> &emB0,
509 const std::vector<ComplexField> &emB1,
510 int orthogdim,
double *t_kernel,
double *t_gsum)
512 typedef typename FermionField::vector_object vobj;
513 typedef typename vobj::scalar_object sobj;
515 typedef typename vobj::vector_type vector_type;
522 int Lblock = mat.dimension(3);
523 int Rblock = mat.dimension(4);
528 const int Nsimd = grid->
Nsimd();
531 int Nem = emB0.
size();
532 assert(emB1.size() == Nem);
541 int MFrvol = rd*Lblock*Rblock*Nem;
542 int MFlvol = ld*Lblock*Rblock*Nem;
544 std::vector<vector_type> lvSum(MFrvol);
550 std::vector<scalar_type> lsSum(MFlvol);
562 if (t_kernel) *t_kernel = -
usecond();
563 for(
int r=0;r<rd;r++)
567 for(
int n=0;n<e1;n++)
568 for(
int b=0;b<e2;b++)
570 int ss= so+n*stride+b;
572 for(
int i=0;i<Lblock;i++)
577 for(
int j=0;j<Rblock;j++)
581 auto right = vj_v[ss];
583 for(
int s1=0;s1<
Ns;s1++)
584 for(
int s2=0;s2<
Ns;s2++)
586 vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
587 + left()(s2)(1) * right()(s1)(1)
588 + left()(s2)(2) * right()(s1)(2);
592 int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
594 for (
int m=0;m<Nem;m++)
599 auto b0 = emB0_v[ss];
600 auto b1 = emB1_v[ss];
604 lvSum[idx] += - vv()(3,0)()*b0()()() - vv()(2,0)()*cb1()()()
605 + vv()(3,1)()*b1()()() - vv()(2,1)()*cb0()()()
606 + vv()(0,2)()*b1()()() + vv()(1,2)()*b0()()()
607 + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();
620 for(
int i=0;i<Lblock;i++)
621 for(
int j=0;j<Rblock;j++)
622 for(
int m=0;m<Nem;m++)
625 int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
628 for(
int idx=0;idx<Nsimd;idx++)
632 int ldx = rt+icoor[orthogdim]*rd;
633 int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
635 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
639 if (t_kernel) *t_kernel +=
usecond();
646 for(
int pt=0;pt<pd;pt++)
651 for(
int i=0;i<Lblock;i++)
652 for(
int j=0;j<Rblock;j++)
653 for(
int m=0;m<Nem;m++)
655 int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
657 mat(m,0,t,i,j) = lsSum[ij_dx];
664 for(
int i=0;i<Lblock;i++)
665 for(
int j=0;j<Rblock;j++)
666 for(
int m=0;m<Nem;m++)
673 if (t_gsum) *t_gsum = -
usecond();
675 if (t_gsum) *t_gsum +=
usecond();
830template <
class FImpl>
831template <
typename TensorType>
832typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
833 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
836 const TensorType &WW_sd,
843 int Nsimd = grid->
Nsimd();
844 int N_t = WW_sd.dimension(0);
845 int N_s = WW_sd.dimension(1);
846 int N_d = WW_sd.dimension(2);
850 for(
int t=0;t<N_t;t++){
855 for(int d_o=0;d_o<N_d;d_o+=d_unroll){
856 for(int t=0;t<N_t;t++){
857 for(int s=0;s<N_s;s++){
858 autoView(vs_v,vs[s],CpuRead);
859 auto tmp1 = vs_v[ss];
862 for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
863 autoView(vd_v,vd[d],CpuRead);
864 Scalar_v coeff = WW_sd(t,s,d);
865 tmp3 = conjugate(vd_v[ss]);
866 mac(&tmp2, &coeff, &tmp3);
872 OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
879template <
class FImpl>
880template <
typename TensorType>
881typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
882 std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
885 const TensorType &WW_sd,
886 const FermionField *vs,
887 const FermionField *vd)
892 int Nsimd = grid->
Nsimd();
893 int N_t = WW_sd.dimensions()[0];
894 int N_s = WW_sd.dimensions()[1];
895 int N_d = WW_sd.dimensions()[2];
899 Eigen::Matrix<
Complex, -1, -1, Eigen::RowMajor> buf;
901 for(
int t=0;t<N_t;t++){
905 for (
int t = 0; t < N_t; t++){
908 for(int d_o=0;d_o<N_d;d_o+=d_unroll){
909 for(int s=0;s<N_s;s++){
910 autoView(vs_v,vs[s],CpuRead);
911 auto tmp1 = vs_v[ss];
914 for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
915 autoView(vd_v,vd[d],CpuRead);
916 Scalar_v coeff = buf(s,d);
917 tmp3 = conjugate(vd_v[ss]);
918 mac(&tmp2, &coeff, &tmp3);
923 OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
929template <
class FImpl>
933 const int Ns,
const int ss)
936 for (
int s1 = 0; s1 <
Ns; s1++){
937 for (
int s2 = 0; s2 <
Ns; s2++){
938 WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
939 WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
940 WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
941 WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
942 WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
943 WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
944 WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
945 WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
946 WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
954 const std::vector<Gamma> &gamma0,
955 const std::vector<Gamma> &gamma1,
959 assert(gamma0.size()==gamma1.size());
960 int Ng = gamma0.size();
970 typedef typename ComplexField::vector_object vobj;
975 auto VV0 = WWVV0_v[ss];
976 auto VV1 = WWVV1_v[ss];
978 for(int g=0;g<Ng;g++){
980 v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
981 v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
984 O_trtr_v[ss] = v_trtr;
985 O_fig8_v[ss] = v_fig8;
987 O_trtr_v[ss]+= v_trtr;
988 O_fig8_v[ss]+= v_fig8;
998 const std::vector<Gamma> &gamma0,
999 const std::vector<Gamma> &gamma1,
1003 assert(gamma0.size()==gamma1.size());
1004 int Ng = gamma0.size();
1015 typedef typename ComplexField::vector_object vobj;
1017 auto VV0 = WWVV0_v[ss];
1018 auto VV1 = WWVV1_v[ss];
1020 for(int g=0;g<Ng;g++){
1022 auto VV0G = VV0 * gamma0[g];
1023 auto VV1G = VV1 * gamma1[g];
1047 for(int a=0;a<Nc;a++){
1048 for(int b=0;b<Nc;b++){
1049 for(int s=0;s<Ns;s++){
1050 for(int t=0;t<Ns;t++){
1052 v_trtr()()() += VV0G()(s,s)(a,b)*VV1G()(t,t)(b,a);
1053 v_fig8()()() += VV0G()(s,t)(a,a)*VV1G()(t,s)(b,b);
1077 O_trtr_v[ss] = v_trtr;
1078 O_fig8_v[ss] = v_fig8;
1080 O_trtr_v[ss]+= v_trtr;
1081 O_fig8_v[ss]+= v_fig8;
1096template<
class FImpl>
1098 Eigen::Tensor<ComplexD,2> &dF2_fig8,
1099 Eigen::Tensor<ComplexD,2> &dF2_trtr,
1100 Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,
1101 Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,
1102 Eigen::Tensor<ComplexD,1> &denom_A0,
1103 Eigen::Tensor<ComplexD,1> &denom_P,
1104 Eigen::Tensor<ComplexD,3> &WW_sd,
1105 const FermionField *vs,
1106 const FermionField *vd,
1111 LOG(Message) <<
"Computing A2A DeltaF=2 graph" << std::endl;
1113 auto G5 =
Gamma(Gamma::Algebra::Gamma5);
1117 int Nsimd = grid->
Nsimd();
1118 int N_t = WW_sd.dimension(0);
1119 int N_s = WW_sd.dimension(1);
1120 int N_d = WW_sd.dimension(2);
1124 for(
int dim=0;dim<nd;dim++){
1129 std::vector<PropagatorField> WWVV (N_t,grid);
1132 ContractWWVV(WWVV,WW_sd,&vs[0],&vd[0]);
1138 for(
int t=0;t<N_t;t++){
1139 WWVV[t] = WWVV[t]* G5 ;
1146 dF2_trtr.resize(N_t,Ng);
1147 dF2_fig8.resize(N_t,Ng);
1148 dF2_trtr_mix.resize(N_t,Ng);
1149 dF2_fig8_mix.resize(N_t,Ng);
1150 denom_A0.resize(N_t);
1151 denom_P.resize(N_t);
1152 for(
int t=0;t<N_t;t++){
1153 for(
int g=0;g<Ng;g++) dF2_trtr(t,g)=
ComplexD(0.0);
1154 for(
int g=0;g<Ng;g++) dF2_fig8(t,g)=
ComplexD(0.0);
1155 for(
int g=0;g<Ng;g++) dF2_trtr_mix(t,g)=
ComplexD(0.0);
1156 for(
int g=0;g<Ng;g++) dF2_fig8_mix(t,g)=
ComplexD(0.0);
1161 ComplexField D0(grid); D0 =
Zero();
1162 ComplexField D1(grid); D1 =
Zero();
1164 ComplexField O1_trtr(grid); O1_trtr =
Zero();
1165 ComplexField O2_trtr(grid); O2_trtr =
Zero();
1166 ComplexField O3_trtr(grid); O3_trtr =
Zero();
1167 ComplexField O4_trtr(grid); O4_trtr =
Zero();
1168 ComplexField O5_trtr(grid); O5_trtr =
Zero();
1170 ComplexField O1_fig8(grid); O1_fig8 =
Zero();
1171 ComplexField O2_fig8(grid); O2_fig8 =
Zero();
1172 ComplexField O3_fig8(grid); O3_fig8 =
Zero();
1173 ComplexField O4_fig8(grid); O4_fig8 =
Zero();
1174 ComplexField O5_fig8(grid); O5_fig8 =
Zero();
1176 ComplexField VV_trtr(grid); VV_trtr =
Zero();
1177 ComplexField AA_trtr(grid); AA_trtr =
Zero();
1178 ComplexField SS_trtr(grid); SS_trtr =
Zero();
1179 ComplexField PP_trtr(grid); PP_trtr =
Zero();
1180 ComplexField TT_trtr(grid); TT_trtr =
Zero();
1182 ComplexField VV_fig8(grid); VV_fig8 =
Zero();
1183 ComplexField AA_fig8(grid); AA_fig8 =
Zero();
1184 ComplexField SS_fig8(grid); SS_fig8 =
Zero();
1185 ComplexField PP_fig8(grid); PP_fig8 =
Zero();
1186 ComplexField TT_fig8(grid); TT_fig8 =
Zero();
1191 std::vector<TComplex> C1;
1192 std::vector<TComplex> C2;
1193 std::vector<TComplex> C3;
1194 std::vector<TComplex> C4;
1195 std::vector<TComplex> C5;
1204 auto VX =
Gamma(Gamma::Algebra::GammaX);
1205 auto VY =
Gamma(Gamma::Algebra::GammaY);
1206 auto VZ =
Gamma(Gamma::Algebra::GammaZ);
1207 auto VT =
Gamma(Gamma::Algebra::GammaT);
1209 auto AX =
Gamma(Gamma::Algebra::GammaXGamma5);
1210 auto AY =
Gamma(Gamma::Algebra::GammaYGamma5);
1211 auto AZ =
Gamma(Gamma::Algebra::GammaZGamma5);
1212 auto AT =
Gamma(Gamma::Algebra::GammaTGamma5);
1214 auto S =
Gamma(Gamma::Algebra::Identity);
1215 auto P =
Gamma(Gamma::Algebra::Gamma5);
1217 auto T0 =
Gamma(Gamma::Algebra::SigmaXY);
1218 auto T1 =
Gamma(Gamma::Algebra::SigmaXZ);
1219 auto T2 =
Gamma(Gamma::Algebra::SigmaXT);
1220 auto T3 =
Gamma(Gamma::Algebra::SigmaYZ);
1221 auto T4 =
Gamma(Gamma::Algebra::SigmaYT);
1222 auto T5 =
Gamma(Gamma::Algebra::SigmaZT);
1224 std::cout <<
GridLogMessage <<
" dt " <<dt_min<<
"..." <<dt_max<<std::endl;
1226 for(
int t0=0;t0<N_t;t0++){
1231 int t1 = (t0+dt)%N_t;
1234 std::vector<Gamma> VV({VX,VY,VZ,VT});
1235 std::vector<Gamma> AA({AX,AY,AZ,AT});
1236 std::vector<Gamma> SS({S});
1237 std::vector<Gamma> PP({P});
1238 std::vector<Gamma> TT({
T0,
T1,
T2,
T3,T4,T5});
1239 std::vector<Gamma> A0({AT});
1241 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],VV,VV,VV_trtr,VV_fig8);
1242 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],AA,AA,AA_trtr,AA_fig8);
1243 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],SS,SS,SS_trtr,SS_fig8);
1244 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],PP,PP,PP_trtr,PP_fig8);
1245 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],TT,TT,TT_trtr,TT_fig8);
1247 O1_trtr = VV_trtr+AA_trtr; O2_trtr = VV_trtr-AA_trtr;
1248 O1_fig8 = VV_fig8+AA_fig8; O2_fig8 = VV_fig8-AA_fig8;
1250 O3_trtr = SS_trtr-PP_trtr; O4_trtr = SS_trtr+PP_trtr;
1251 O3_fig8 = SS_fig8-PP_fig8; O4_fig8 = SS_fig8+PP_fig8;
1256 sliceSum(O1_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1257 sliceSum(O2_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1258 sliceSum(O3_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1259 sliceSum(O4_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1260 sliceSum(O5_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1262 sliceSum(O1_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1263 sliceSum(O2_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1264 sliceSum(O3_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1265 sliceSum(O4_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1266 sliceSum(O5_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1268 ContractFourQuarkColourDiagonal(WWVV[t0], WWVV[t1],A0,A0,AA_trtr,AA_fig8);
1273 for(
int t=0;t<N_t;t++){
1274 denom_A0(t)+=C1[(t+t0)%N_t]()()()/vol;
1275 denom_P(t) +=C2[(t+t0)%N_t]()()()/vol;
1282 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],VV,VV,VV_trtr,VV_fig8);
1283 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],AA,AA,AA_trtr,AA_fig8);
1284 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],SS,SS,SS_trtr,SS_fig8);
1285 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],PP,PP,PP_trtr,PP_fig8);
1286 ContractFourQuarkColourMix(WWVV[t0], WWVV[t1],TT,TT,TT_trtr,TT_fig8);
1288 O1_trtr = VV_trtr+AA_trtr; O2_trtr = VV_trtr-AA_trtr;
1289 O1_fig8 = VV_fig8+AA_fig8; O2_fig8 = VV_fig8-AA_fig8;
1291 O3_trtr = SS_trtr-PP_trtr; O4_trtr = SS_trtr+PP_trtr;
1292 O3_fig8 = SS_fig8-PP_fig8; O4_fig8 = SS_fig8+PP_fig8;
1297 sliceSum(O1_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1298 sliceSum(O2_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1299 sliceSum(O3_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1300 sliceSum(O4_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1301 sliceSum(O5_trtr,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_trtr_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1303 sliceSum(O1_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1304 sliceSum(O2_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1305 sliceSum(O3_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1306 sliceSum(O4_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1307 sliceSum(O5_fig8,C1, orthogdim);
for(
int t=0;t<N_t;t++) dF2_fig8_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
1314 double million=1.0e6;
1315 LOG(Message) <<
"Computing A2A DeltaF=2 graph t_tot " << t_tot /million <<
" s "<< std::endl;
1316 LOG(Message) <<
"Computing A2A DeltaF=2 graph t_outer " << t_outer /million <<
" s "<< std::endl;
1317 LOG(Message) <<
"Computing A2A DeltaF=2 graph t_contr " << t_contr /million <<
" s "<< std::endl;
1513 // This global sum is taking as much as 50% of time on 16 nodes
1514 // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
1515 // Healthy size that should suffice
1517 if (t_gsum) *t_gsum = -usecond();
1518 grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
1519 if (t_gsum) *t_gsum += usecond();
1522template<class FImpl>
1523void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
1524 const FermionField *wi,
1525 const FermionField *vj,
1529 int Lblock = mat.dimension(1);
1530 int Rblock = mat.dimension(2);
1532 GridBase *grid = wi[0].Grid();
1534 const int nd = grid->_ndimension;
1535 const int Nsimd = grid->Nsimd();
1537 int Nt = grid->GlobalDimensions()[orthogdim];
1539 int fd=grid->_fdimensions[orthogdim];
1540 int ld=grid->_ldimensions[orthogdim];
1541 int rd=grid->_rdimensions[orthogdim];
1543 // will locally sum vectors first
1544 // sum across these down to scalars
1545 // splitting the SIMD
1546 int MFrvol = rd*Lblock*Rblock;
1547 int MFlvol = ld*Lblock*Rblock;
1549 std::vector<vector_type > lvSum(MFrvol);
1550 thread_for(r,MFrvol,{
1554 std::vector<scalar_type > lsSum(MFlvol);
1555 thread_for(r,MFlvol,{
1556 lsSum[r]=scalar_type(0.0);
1559 int e1= grid->_slice_nblock[orthogdim];
1560 int e2= grid->_slice_block [orthogdim];
1561 int stride=grid->_slice_stride[orthogdim];
1565 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
1567 for(int n=0;n<e1;n++){
1568 for(int b=0;b<e2;b++){
1570 int ss= so+n*stride+b;
1572 for(int i=0;i<Lblock;i++){
1574 autoView(wi_v,wi[i],CpuRead);
1575 auto w = conjugate(wi_v[ss]);
1577 w()(2)(0) = - w()(2)(0);
1578 w()(2)(1) = - w()(2)(1);
1579 w()(2)(2) = - w()(2)(2);
1580 w()(3)(0) = - w()(3)(0);
1581 w()(3)(1) = - w()(3)(1);
1582 w()(3)(2) = - w()(3)(2);
1584 for(int j=0;j<Rblock;j++){
1586 autoView(vj_v,vj[j],CpuRead);
1588 auto vv = v()(0)(0);
1590 vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
1591 + w()(0)(1) * v()(0)(1)
1592 + w()(0)(2) * v()(0)(2)
1593 + w()(1)(0) * v()(1)(0)
1594 + w()(1)(1) * v()(1)(1)
1595 + w()(1)(2) * v()(1)(2)
1596 + w()(2)(0) * v()(2)(0)
1597 + w()(2)(1) * v()(2)(1)
1598 + w()(2)(2) * v()(2)(2)
1599 + w()(3)(0) * v()(3)(0)
1600 + w()(3)(1) * v()(3)(1)
1601 + w()(3)(2) * v()(3)(2);
1603 int idx = i+Lblock*j+Lblock*Rblock*r;
1604 lvSum[idx] = lvSum[idx]+vv;
1611 // Sum across simd lanes in the plane, breaking out orthog dir.
1614 Coordinate icoor(nd);
1615 iScalar<vector_type> temp;
1616 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);
1618 for(int i=0;i<Lblock;i++){
1619 for(int j=0;j<Rblock;j++){
1621 int ij_rdx = i+Lblock*j+Lblock*Rblock*rt;
1623 temp._internal =lvSum[ij_rdx];
1624 extract(temp,extracted);
1626 for(int idx=0;idx<Nsimd;idx++){
1628 grid->iCoorFromIindex(icoor,idx);
1630 int ldx = rt+icoor[orthogdim]*rd;
1632 int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx;
1634 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
1640 assert(mat.dimension(0) == Nt);
1641 // ld loop and local only??
1642 int pd = grid->_processors[orthogdim];
1643 int pc = grid->_processor_coor[orthogdim];
1644 thread_for_collapse(2,lt,ld,{
1645 for(int pt=0;pt<pd;pt++){
1648 for(int i=0;i<Lblock;i++){
1649 for(int j=0;j<Rblock;j++){
1650 int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
1651 mat(t,i,j) = lsSum[ij_dx];
1655 const scalar_type zz(0.0);
1656 for(int i=0;i<Lblock;i++){
1657 for(int j=0;j<Rblock;j++){
1665 grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock);
1668template<class FImpl>
1669void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
1670 const FermionField *wi,
1671 const FermionField *vj,
1672 const std::vector<ComplexField > &mom,
1675 int Lblock = mat.dimension(2);
1676 int Rblock = mat.dimension(3);
1678 GridBase *grid = wi[0].Grid();
1680 const int nd = grid->_ndimension;
1681 const int Nsimd = grid->Nsimd();
1683 int Nt = grid->GlobalDimensions()[orthogdim];
1684 int Nmom = mom.size();
1686 int fd=grid->_fdimensions[orthogdim];
1687 int ld=grid->_ldimensions[orthogdim];
1688 int rd=grid->_rdimensions[orthogdim];
1690 // will locally sum vectors first
1691 // sum across these down to scalars
1692 // splitting the SIMD
1693 int MFrvol = rd*Lblock*Rblock*Nmom;
1694 int MFlvol = ld*Lblock*Rblock*Nmom;
1696 std::vector<vector_type > lvSum(MFrvol);
1697 thread_for(r,MFrvol,{
1701 std::vector<scalar_type > lsSum(MFlvol);
1702 thread_for(r,MFlvol,{
1703 lsSum[r]=scalar_type(0.0);
1706 int e1= grid->_slice_nblock[orthogdim];
1707 int e2= grid->_slice_block [orthogdim];
1708 int stride=grid->_slice_stride[orthogdim];
1712 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
1714 for(int n=0;n<e1;n++){
1715 for(int b=0;b<e2;b++){
1717 int ss= so+n*stride+b;
1719 for(int i=0;i<Lblock;i++){
1721 autoView(wi_v,wi[i],CpuRead);
1722 auto w = conjugate(wi_v[ss]);
1724 for(int j=0;j<Rblock;j++){
1726 autoView(vj_v,vj[j],CpuRead);
1729 auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
1730 + w()(0)(1) * v()(0)(1)
1731 + w()(0)(2) * v()(0)(2)
1732 + w()(1)(0) * v()(1)(0)
1733 + w()(1)(1) * v()(1)(1)
1734 + w()(1)(2) * v()(1)(2)
1735 - w()(2)(0) * v()(2)(0)
1736 - w()(2)(1) * v()(2)(1)
1737 - w()(2)(2) * v()(2)(2)
1738 - w()(3)(0) * v()(3)(0)
1739 - w()(3)(1) * v()(3)(1)
1740 - w()(3)(2) * v()(3)(2);
1743 // After getting the sitewise product do the mom phase loop
1744 int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
1745 for ( int m=0;m<Nmom;m++){
1747 autoView(mom_v,mom[m],CpuRead);
1748 auto phase = mom_v[ss];
1749 mac(&lvSum[idx],&vv,&phase()()());
1758 // Sum across simd lanes in the plane, breaking out orthog dir.
1761 Coordinate icoor(nd);
1762 iScalar<vector_type> temp;
1763 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);
1765 for(int i=0;i<Lblock;i++){
1766 for(int j=0;j<Rblock;j++){
1767 for(int m=0;m<Nmom;m++){
1769 int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
1771 temp._internal = lvSum[ij_rdx];
1772 extract(temp,extracted);
1774 for(int idx=0;idx<Nsimd;idx++){
1776 grid->iCoorFromIindex(icoor,idx);
1778 int ldx = rt+icoor[orthogdim]*rd;
1780 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
1782 lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
1788 assert(mat.dimension(0) == Nmom);
1789 assert(mat.dimension(1) == Nt);
1791 int pd = grid->_processors[orthogdim];
1792 int pc = grid->_processor_coor[orthogdim];
1793 thread_for_collapse(2,lt,ld,{
1794 for(int pt=0;pt<pd;pt++){
1797 for(int i=0;i<Lblock;i++){
1798 for(int j=0;j<Rblock;j++){
1799 for(int m=0;m<Nmom;m++){
1800 int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
1801 mat(m,t,i,j) = lsSum[ij_dx];
1806 const scalar_type zz(0.0);
1807 for(int i=0;i<Lblock;i++){
1808 for(int j=0;j<Rblock;j++){
1809 for(int m=0;m<Nmom;m++){
1818 grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock);
1821template<class FImpl>
1822void A2Autils<FImpl>::PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
1823 const FermionField *wi,
1824 const FermionField *vj,
1828 PionFieldXX(mat,wi,vj,orthogdim,g5);
1830template<class FImpl>
1831void A2Autils<FImpl>::PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
1832 const FermionField *wi,
1833 const FermionField *wj,
1837 PionFieldXX(mat,wi,wj,orthogdim,nog5);
1839template<class FImpl>
1840void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
1841 const FermionField *vi,
1842 const FermionField *vj,
1846 PionFieldXX(mat,vi,vj,orthogdim,nog5);
Lattice< vVecSpinMatrix > LatticeVecSpinMatrix
iVector< iMatrix< iScalar< vtype >, Ns >, A2Ablocking > iVecSpinMatrix
iVecComplex< vComplex > vVecComplex
iVecComplex< Complex > VecComplex
Lattice< vVecComplex > LatticeVecComplex
iVecSpinMatrix< vComplex > vVecSpinMatrix
iVector< iScalar< iScalar< vtype > >, A2Ablocking > iVecComplex
iVecSpinMatrix< Complex > VecSpinMatrix
#define accelerator_for(iterator, num, nsimd,...)
AcceleratorVector< int, MaxDims > Coordinate
accelerator_inline Grid_simd2< S, V > trace(const Grid_simd2< S, V > &arg)
void mac(Lattice< obj1 > &ret, const Lattice< obj2 > &lhs, const Lattice< obj3 > &rhs)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
void sliceSum(const Lattice< vobj > &Data, std::vector< typename vobj::scalar_object > &result, int orthogdim)
#define autoView(l_v, l, mode)
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL")
#define NAMESPACE_BEGIN(A)
iScalar< iMatrix< iMatrix< vtype, Nc >, Ns > > iSpinColourMatrix
iScalar< iScalar< iScalar< vtype > > > iSinglet
iScalar< iMatrix< iScalar< vtype >, Ns > > iSpinMatrix
std::complex< RealD > ComplexD
std::complex< Real > Complex
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline auto peekIndex(const vtype &arg, int i) -> RemoveCRV(TensorIndexRecursion< Level >::peekIndex(arg, 0))
#define thread_for_collapse(N, i, num,...)
#define thread_for(i, num,...)
iSpinMatrix< scalar_type > SpinMatrix_s
FImpl::PropagatorField PropagatorField
static void ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0, const PropagatorField &WWVV1, const std::vector< Gamma > &gamma0, const std::vector< Gamma > &gamma1, ComplexField &O_trtr, ComplexField &O_fig8)
iSpinMatrix< vector_type > SpinMatrix_v
iSinglet< scalar_type > Scalar_s
vobj::vector_type vector_type
iSinglet< vector_type > Scalar_v
static void OuterProductWWVV(PropagatorField &WWVV, const vobj &lhs, const vobj &rhs, const int Ns, const int ss)
static std::enable_if<(std::is_same< Eigen::Tensor< ComplexD, 3 >, TensorType >::value||std::is_same< Eigen::TensorMap< Eigen::Tensor< Complex, 3, Eigen::RowMajor > >, TensorType >::value), void >::type ContractWWVV(std::vector< PropagatorField > &WWVV, const TensorType &WW_sd, const FermionField *vs, const FermionField *vd)
static std::enable_if<!(std::is_same< Eigen::Tensor< ComplexD, 3 >, TensorType >::value||std::is_same< Eigen::TensorMap< Eigen::Tensor< Complex, 3, Eigen::RowMajor > >, TensorType >::value), void >::type ContractWWVV(std::vector< PropagatorField > &WWVV, const TensorType &WW_sd, const FermionField *vs, const FermionField *vd)
static void AslashField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, const std::vector< ComplexField > &emB0, const std::vector< ComplexField > &emB1, int orthogdim, double *t_kernel=nullptr, double *t_gsum=nullptr)
FImpl::FermionField FermionField
FImpl::ComplexField ComplexField
iSpinColourMatrix< vector_type > SpinColourMatrix_v
static void ContractFourQuarkColourMix(const PropagatorField &WWVV0, const PropagatorField &WWVV1, const std::vector< Gamma > &gamma0, const std::vector< Gamma > &gamma1, ComplexField &O_trtr, ComplexField &O_fig8)
static void MesonField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, std::vector< Gamma::Algebra > gammas, const std::vector< ComplexField > &mom, int orthogdim, double *t_kernel=nullptr, double *t_gsum=nullptr)
vobj::scalar_type scalar_type
accelerator_inline size_type size(void) const
Coordinate _processor_coor
void GlobalSumVector(RealF *, int N)
unsigned long _ndimension
const Coordinate & GlobalDimensions(void)
void iCoorFromIindex(Coordinate &coor, int lane)