Grid 0.7.0
WilsonCloverHelpers.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
6
7 Copyright (C) 2021 - 2022
8
9 Author: Daniel Richtmann <daniel.richtmann@gmail.com>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28
29#pragma once
30
31// Helper routines that implement common clover functionality
32
34
35template<class Impl> class WilsonCloverHelpers {
36public:
37
40
41 // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
42 static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
43 {
44 conformable(lambda.Grid(), U[0].Grid());
45 GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
46 // insertion in upper staple
47 // please check redundancy of shift operations
48
49 // C1+
50 tmp = lambda * U[nu];
51 out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
52
53 // C2+
54 tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
55 out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
56
57 // C3+
58 tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
59 out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
60
61 // C4+
62 out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
63
64 // insertion in lower staple
65 // C1-
66 out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
67
68 // C2-
69 tmp = adj(lambda) * U[nu];
70 out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
71
72 // C3-
73 tmp = lambda * U[nu];
74 out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
75
76 // C4-
77 out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
78
79 return out;
80 }
81
82 static CloverField fillCloverYZ(const GaugeLinkField &F)
83 {
84 CloverField T(F.Grid());
85 T = Zero();
88 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
89 {
90 coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
91 coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
92 coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
93 coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
94 });
95
96 return T;
97 }
98
99 static CloverField fillCloverXZ(const GaugeLinkField &F)
100 {
101 CloverField T(F.Grid());
102 T = Zero();
103
106 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
107 {
108 coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
109 coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
110 coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
111 coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
112 });
113
114 return T;
115 }
116
117 static CloverField fillCloverXY(const GaugeLinkField &F)
118 {
119 CloverField T(F.Grid());
120 T = Zero();
121
124 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
125 {
126 coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
127 coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
128 coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
129 coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
130 });
131
132 return T;
133 }
134
135 static CloverField fillCloverXT(const GaugeLinkField &F)
136 {
137 CloverField T(F.Grid());
138 T = Zero();
139
140 autoView( T_v , T, AcceleratorWrite);
141 autoView( F_v , F, AcceleratorRead);
142 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
143 {
144 coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
145 coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
146 coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
147 coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
148 });
149
150 return T;
151 }
152
153 static CloverField fillCloverYT(const GaugeLinkField &F)
154 {
155 CloverField T(F.Grid());
156 T = Zero();
157
158 autoView( T_v ,T,AcceleratorWrite);
160 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
161 {
162 coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
163 coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
164 coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
165 coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
166 });
167
168 return T;
169 }
170
171 static CloverField fillCloverZT(const GaugeLinkField &F)
172 {
173 CloverField T(F.Grid());
174
175 T = Zero();
176
177 autoView( T_v , T,AcceleratorWrite);
178 autoView( F_v , F,AcceleratorRead);
179 accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
180 {
181 coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
182 coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
183 coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
184 coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
185 });
186
187 return T;
188 }
189
190 template<class _Spinor>
191 static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
192 auto CC = coalescedRead(C);
193 mult(&phi, &CC, &chi);
194 }
195
196 template<class _SpinorField>
197 inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
198 const int Nsimd = SiteSpinor::Nsimd();
199 autoView(out_v, out, AcceleratorWrite);
200 autoView(phi_v, phi, AcceleratorRead);
201 autoView(C_v, C, AcceleratorRead);
202 typedef decltype(coalescedRead(out_v[0])) calcSpinor;
203 accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
204 calcSpinor tmp;
205 multClover(tmp,C_v[sss],phi_v(sss));
206 coalescedWrite(out_v[sss],tmp);
207 });
208 }
209};
210
211
213
214template<class Impl> class CompactWilsonCloverHelpers {
215public:
216
218
222
223 #if 0
224 static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
225 assert(i != j);
226 if(i < j) {
227 return triangle()(block)(triangle_index(i, j));
228 } else { // i > j
229 return conjugate(triangle()(block)(triangle_index(i, j)));
230 }
231 }
232 #else
233 template<typename vobj>
234 static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
235 assert(i != j);
236 if(i < j) {
237 return triangle()(block)(triangle_index(i, j));
238 } else { // i > j
239 return conjugate(triangle()(block)(triangle_index(i, j)));
240 }
241 }
242 #endif
243
244 static accelerator_inline int triangle_index(int i, int j) {
245 if(i == j)
246 return 0;
247 else if(i < j)
248 return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
249 else // i > j
250 return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
251 }
252
253 static void MooeeKernel_gpu(int Nsite,
254 int Ls,
255 const FermionField& in,
256 FermionField& out,
257 const CloverDiagonalField& diagonal,
258 const CloverTriangleField& triangle) {
259 autoView(diagonal_v, diagonal, AcceleratorRead);
260 autoView(triangle_v, triangle, AcceleratorRead);
261 autoView(in_v, in, AcceleratorRead);
262 autoView(out_v, out, AcceleratorWrite);
263
264 typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
265
266 const uint64_t NN = Nsite * Ls;
267
268 accelerator_for(ss, NN, Simd::Nsimd(), {
269 int sF = ss;
270 int sU = ss/Ls;
271 CalcSpinor res;
272 CalcSpinor in_t = in_v(sF);
273 auto diagonal_t = diagonal_v(sU);
274 auto triangle_t = triangle_v(sU);
275 for(int block=0; block<Nhs; block++) {
276 int s_start = block*Nhs;
277 for(int i=0; i<Nred; i++) {
278 int si = s_start + i/Nc, ci = i%Nc;
279 res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
280 for(int j=0; j<Nred; j++) {
281 if (j == i) continue;
282 int sj = s_start + j/Nc, cj = j%Nc;
283 res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
284 };
285 };
286 };
287 coalescedWrite(out_v[sF], res);
288 });
289 }
290
291 static void MooeeKernel_cpu(int Nsite,
292 int Ls,
293 const FermionField& in,
294 FermionField& out,
295 const CloverDiagonalField& diagonal,
296 const CloverTriangleField& triangle) {
297 autoView(diagonal_v, diagonal, CpuRead);
298 autoView(triangle_v, triangle, CpuRead);
299 autoView(in_v, in, CpuRead);
300 autoView(out_v, out, CpuWrite);
301
302 typedef SiteSpinor CalcSpinor;
303
304#if defined(A64FX) || defined(A64FXFIXEDSIZE)
305#define PREFETCH_CLOVER(BASE) { \
306 uint64_t base; \
307 int pf_dist_L1 = 1; \
308 int pf_dist_L2 = -5; /* -> penalty -> disable */ \
309 \
310 if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
311 base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
312 svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
313 svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
314 svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
315 svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
316 svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
317 svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
318 } \
319 \
320 if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
321 base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
322 svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
323 svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
324 svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
325 svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
326 svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
327 svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
328 } \
329 }
330// TODO: Implement/generalize this for other architectures
331// I played around a bit on KNL (see below) but didn't bring anything
332// #elif defined(AVX512)
333// #define PREFETCH_CLOVER(BASE) { \
334// uint64_t base; \
335// int pf_dist_L1 = 1; \
336// int pf_dist_L2 = +4; \
337// \
338// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
339// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
340// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \
341// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \
342// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \
343// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \
344// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \
345// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \
346// } \
347// \
348// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
349// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
350// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \
351// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \
352// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \
353// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \
354// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \
355// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \
356// } \
357// }
358#else
359#define PREFETCH_CLOVER(BASE)
360#endif
361
362 const uint64_t NN = Nsite * Ls;
363
364 thread_for(ss, NN, {
365 int sF = ss;
366 int sU = ss/Ls;
367 CalcSpinor res;
368 CalcSpinor in_t = in_v[sF];
369 auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
370 auto triangle_t = triangle_v[sU];
371
372 // upper half
374
375 auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
376 auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
377 auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
378 auto in_cc_1_0 = conjugate(in_t()(1)(0));
379 auto in_cc_1_1 = conjugate(in_t()(1)(1));
380
381 res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
382 + triangle_t()(0)( 0) * in_t()(0)(1)
383 + triangle_t()(0)( 1) * in_t()(0)(2)
384 + triangle_t()(0)( 2) * in_t()(1)(0)
385 + triangle_t()(0)( 3) * in_t()(1)(1)
386 + triangle_t()(0)( 4) * in_t()(1)(2);
387
388 res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
389 res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
390 + triangle_t()(0)( 5) * in_t()(0)(2)
391 + triangle_t()(0)( 6) * in_t()(1)(0)
392 + triangle_t()(0)( 7) * in_t()(1)(1)
393 + triangle_t()(0)( 8) * in_t()(1)(2)
394 + conjugate( res()(0)( 1));
395
396 res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
397 + triangle_t()(0)( 5) * in_cc_0_1;
398 res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
399 + triangle_t()(0)( 9) * in_t()(1)(0)
400 + triangle_t()(0)(10) * in_t()(1)(1)
401 + triangle_t()(0)(11) * in_t()(1)(2)
402 + conjugate( res()(0)( 2));
403
404 res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
405 + triangle_t()(0)( 6) * in_cc_0_1
406 + triangle_t()(0)( 9) * in_cc_0_2;
407 res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
408 + triangle_t()(0)(12) * in_t()(1)(1)
409 + triangle_t()(0)(13) * in_t()(1)(2)
410 + conjugate( res()(1)( 0));
411
412 res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
413 + triangle_t()(0)( 7) * in_cc_0_1
414 + triangle_t()(0)(10) * in_cc_0_2
415 + triangle_t()(0)(12) * in_cc_1_0;
416 res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
417 + triangle_t()(0)(14) * in_t()(1)(2)
418 + conjugate( res()(1)( 1));
419
420 res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
421 + triangle_t()(0)( 8) * in_cc_0_1
422 + triangle_t()(0)(11) * in_cc_0_2
423 + triangle_t()(0)(13) * in_cc_1_0
424 + triangle_t()(0)(14) * in_cc_1_1;
425 res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
426 + conjugate( res()(1)( 2));
427
428 vstream(out_v[sF]()(0)(0), res()(0)(0));
429 vstream(out_v[sF]()(0)(1), res()(0)(1));
430 vstream(out_v[sF]()(0)(2), res()(0)(2));
431 vstream(out_v[sF]()(1)(0), res()(1)(0));
432 vstream(out_v[sF]()(1)(1), res()(1)(1));
433 vstream(out_v[sF]()(1)(2), res()(1)(2));
434
435 // lower half
437
438 auto in_cc_2_0 = conjugate(in_t()(2)(0));
439 auto in_cc_2_1 = conjugate(in_t()(2)(1));
440 auto in_cc_2_2 = conjugate(in_t()(2)(2));
441 auto in_cc_3_0 = conjugate(in_t()(3)(0));
442 auto in_cc_3_1 = conjugate(in_t()(3)(1));
443
444 res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
445 + triangle_t()(1)( 0) * in_t()(2)(1)
446 + triangle_t()(1)( 1) * in_t()(2)(2)
447 + triangle_t()(1)( 2) * in_t()(3)(0)
448 + triangle_t()(1)( 3) * in_t()(3)(1)
449 + triangle_t()(1)( 4) * in_t()(3)(2);
450
451 res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
452 res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
453 + triangle_t()(1)( 5) * in_t()(2)(2)
454 + triangle_t()(1)( 6) * in_t()(3)(0)
455 + triangle_t()(1)( 7) * in_t()(3)(1)
456 + triangle_t()(1)( 8) * in_t()(3)(2)
457 + conjugate( res()(2)( 1));
458
459 res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
460 + triangle_t()(1)( 5) * in_cc_2_1;
461 res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
462 + triangle_t()(1)( 9) * in_t()(3)(0)
463 + triangle_t()(1)(10) * in_t()(3)(1)
464 + triangle_t()(1)(11) * in_t()(3)(2)
465 + conjugate( res()(2)( 2));
466
467 res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
468 + triangle_t()(1)( 6) * in_cc_2_1
469 + triangle_t()(1)( 9) * in_cc_2_2;
470 res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
471 + triangle_t()(1)(12) * in_t()(3)(1)
472 + triangle_t()(1)(13) * in_t()(3)(2)
473 + conjugate( res()(3)( 0));
474
475 res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
476 + triangle_t()(1)( 7) * in_cc_2_1
477 + triangle_t()(1)(10) * in_cc_2_2
478 + triangle_t()(1)(12) * in_cc_3_0;
479 res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
480 + triangle_t()(1)(14) * in_t()(3)(2)
481 + conjugate( res()(3)( 1));
482
483 res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
484 + triangle_t()(1)( 8) * in_cc_2_1
485 + triangle_t()(1)(11) * in_cc_2_2
486 + triangle_t()(1)(13) * in_cc_3_0
487 + triangle_t()(1)(14) * in_cc_3_1;
488 res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
489 + conjugate( res()(3)( 2));
490
491 vstream(out_v[sF]()(2)(0), res()(2)(0));
492 vstream(out_v[sF]()(2)(1), res()(2)(1));
493 vstream(out_v[sF]()(2)(2), res()(2)(2));
494 vstream(out_v[sF]()(3)(0), res()(3)(0));
495 vstream(out_v[sF]()(3)(1), res()(3)(1));
496 vstream(out_v[sF]()(3)(2), res()(3)(2));
497 });
498 }
499
500 static void MooeeKernel(int Nsite,
501 int Ls,
502 const FermionField& in,
503 FermionField& out,
504 const CloverDiagonalField& diagonal,
505 const CloverTriangleField& triangle) {
506#if defined(GRID_CUDA) || defined(GRID_HIP)
507 MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
508#else
509 MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
510#endif
511 }
512
513 static void Invert(const CloverDiagonalField& diagonal,
514 const CloverTriangleField& triangle,
515 CloverDiagonalField& diagonalInv,
516 CloverTriangleField& triangleInv) {
517 conformable(diagonal, diagonalInv);
518 conformable(triangle, triangleInv);
519 conformable(diagonal, triangle);
520
521 diagonalInv.Checkerboard() = diagonal.Checkerboard();
522 triangleInv.Checkerboard() = triangle.Checkerboard();
523
524 GridBase* grid = diagonal.Grid();
525
526 long lsites = grid->lSites();
527
528 typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
529 typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
530
531 autoView(diagonal_v, diagonal, CpuRead);
532 autoView(triangle_v, triangle, CpuRead);
533 autoView(diagonalInv_v, diagonalInv, CpuWrite);
534 autoView(triangleInv_v, triangleInv, CpuWrite);
535
536 thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
537 Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
538 Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
539
540 scalar_object_diagonal diagonal_tmp = Zero();
541 scalar_object_diagonal diagonal_inv_tmp = Zero();
542 scalar_object_triangle triangle_tmp = Zero();
543 scalar_object_triangle triangle_inv_tmp = Zero();
544
545 Coordinate lcoor;
546 grid->LocalIndexToLocalCoor(site, lcoor);
547
548 peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
549 peekLocalSite(triangle_tmp, triangle_v, lcoor);
550
551 // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
552 for (long s_row=0;s_row<Ns;s_row++) {
553 for (long s_col=0;s_col<Ns;s_col++) {
554 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
555 int block = s_row / Nhs;
556 int s_row_block = s_row % Nhs;
557 int s_col_block = s_col % Nhs;
558 for (long c_row=0;c_row<Nc;c_row++) {
559 for (long c_col=0;c_col<Nc;c_col++) {
560 int i = s_row_block * Nc + c_row;
561 int j = s_col_block * Nc + c_col;
562 if(i == j)
563 clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
564 else
565 clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
566 }
567 }
568 }
569 }
570
571 clover_inv_eigen = clover_eigen.inverse();
572
573 for (long s_row=0;s_row<Ns;s_row++) {
574 for (long s_col=0;s_col<Ns;s_col++) {
575 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
576 int block = s_row / Nhs;
577 int s_row_block = s_row % Nhs;
578 int s_col_block = s_col % Nhs;
579 for (long c_row=0;c_row<Nc;c_row++) {
580 for (long c_col=0;c_col<Nc;c_col++) {
581 int i = s_row_block * Nc + c_row;
582 int j = s_col_block * Nc + c_col;
583 if(i == j)
584 diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
585 else if(i < j)
586 triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
587 else
588 continue;
589 }
590 }
591 }
592 }
593
594 pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
595 pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
596 });
597 }
598
599 static void ConvertLayout(const CloverField& full,
600 CloverDiagonalField& diagonal,
601 CloverTriangleField& triangle) {
602 conformable(full, diagonal);
603 conformable(full, triangle);
604
605 diagonal.Checkerboard() = full.Checkerboard();
606 triangle.Checkerboard() = full.Checkerboard();
607
608 autoView(full_v, full, AcceleratorRead);
609 autoView(diagonal_v, diagonal, AcceleratorWrite);
610 autoView(triangle_v, triangle, AcceleratorWrite);
611
612 // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
613 accelerator_for(ss, full.Grid()->oSites(), 1, {
614 for(int s_row = 0; s_row < Ns; s_row++) {
615 for(int s_col = 0; s_col < Ns; s_col++) {
616 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
617 int block = s_row / Nhs;
618 int s_row_block = s_row % Nhs;
619 int s_col_block = s_col % Nhs;
620 for(int c_row = 0; c_row < Nc; c_row++) {
621 for(int c_col = 0; c_col < Nc; c_col++) {
622 int i = s_row_block * Nc + c_row;
623 int j = s_col_block * Nc + c_col;
624 if(i == j)
625 diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
626 else if(i < j)
627 triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
628 else
629 continue;
630 }
631 }
632 }
633 }
634 });
635 }
636
637
638 static void ConvertLayout(const CloverDiagonalField& diagonal,
639 const CloverTriangleField& triangle,
640 CloverField& full) {
641 conformable(full, diagonal);
642 conformable(full, triangle);
643
644 full.Checkerboard() = diagonal.Checkerboard();
645
646 full = Zero();
647
648 autoView(diagonal_v, diagonal, AcceleratorRead);
649 autoView(triangle_v, triangle, AcceleratorRead);
650 autoView(full_v, full, AcceleratorWrite);
651
652 // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
653 accelerator_for(ss, full.Grid()->oSites(), 1, {
654 for(int s_row = 0; s_row < Ns; s_row++) {
655 for(int s_col = 0; s_col < Ns; s_col++) {
656 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
657 int block = s_row / Nhs;
658 int s_row_block = s_row % Nhs;
659 int s_col_block = s_col % Nhs;
660 for(int c_row = 0; c_row < Nc; c_row++) {
661 for(int c_col = 0; c_col < Nc; c_col++) {
662 int i = s_row_block * Nc + c_row;
663 int j = s_col_block * Nc + c_col;
664 if(i == j)
665 full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
666 else
667 full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
668 }
669 }
670 }
671 }
672 });
673 }
674
675 static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
676 // Checks/grid
677 double t0 = usecond();
678 conformable(diagonal, triangle);
679 GridBase* grid = diagonal.Grid();
680
681 // Determine the boundary coordinates/sites
682 double t1 = usecond();
683 int t_dir = Nd - 1;
684 Lattice<iScalar<vInteger>> t_coor(grid);
685 LatticeCoordinate(t_coor, t_dir);
686 int T = grid->GlobalDimensions()[t_dir];
687
688 // Set off-diagonal parts at boundary to zero -- OK
689 double t2 = usecond();
690 CloverTriangleField zeroTriangle(grid);
691 zeroTriangle.Checkerboard() = triangle.Checkerboard();
692 zeroTriangle = Zero();
693 triangle = where(t_coor == 0, zeroTriangle, triangle);
694 triangle = where(t_coor == T-1, zeroTriangle, triangle);
695
696 // Set diagonal to unity (scaled correctly) -- OK
697 double t3 = usecond();
698 CloverDiagonalField tmp(grid);
699 tmp.Checkerboard() = diagonal.Checkerboard();
700 tmp = -1.0 * csw_t + diag_mass;
701 diagonal = where(t_coor == 0, tmp, diagonal);
702 diagonal = where(t_coor == T-1, tmp, diagonal);
703
704 // Correct values next to boundary
705 double t4 = usecond();
706 if(cF != 1.0) {
707 tmp = cF - 1.0;
708 tmp += diagonal;
709 diagonal = where(t_coor == 1, tmp, diagonal);
710 diagonal = where(t_coor == T-2, tmp, diagonal);
711 }
712
713 // Report timings
714 double t5 = usecond();
715#if 0
716 std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
717 << " checks = " << (t1 - t0) / 1e6
718 << ", coordinate = " << (t2 - t1) / 1e6
719 << ", off-diag zero = " << (t3 - t2) / 1e6
720 << ", diagonal unity = " << (t4 - t3) / 1e6
721 << ", near-boundary = " << (t5 - t4) / 1e6
722 << ", total = " << (t5 - t0) / 1e6
723 << std::endl;
724#endif
725 }
726
727 template<class Field, class Mask>
728 static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
729 conformable(f, m);
730 auto grid = f.Grid();
731 const uint32_t Nsite = grid->oSites();
732 const uint32_t Nsimd = grid->Nsimd();
733 autoView(f_v, f, AcceleratorWrite);
734 autoView(m_v, m, AcceleratorRead);
735 // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
736 accelerator_for(ss, Nsite, Nsimd, {
737 coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
738 });
739 }
740
741 template<class MaskField>
742 static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
743 assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
744 assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd);
745 assert(!full.Grid()->_isCheckerBoarded);
746
747 GridBase* grid = full.Grid();
748 int t_dir = Nd-1;
749 Lattice<iScalar<vInteger>> t_coor(grid);
750 LatticeCoordinate(t_coor, t_dir);
751 int T = grid->GlobalDimensions()[t_dir];
752
753 MaskField zeroMask(grid); zeroMask = Zero();
754 full = 1.0;
755 full = where(t_coor == 0, zeroMask, full);
756 full = where(t_coor == T-1, zeroMask, full);
757
758 pickCheckerboard(Even, even, full);
759 pickCheckerboard(Odd, odd, full);
760 }
761};
762
#define accelerator_inline
#define accelerator_for(iterator, num, nsimd,...)
static const int Even
static const int Odd
AcceleratorVector< int, MaxDims > Coordinate
Definition Coordinate.h:95
accelerator_inline void vstream(Grid_simd2< S, V > &out, const Grid_simd2< S, V > &in)
accelerator_inline Grid_simd< S, V > abs(const Grid_simd< S, V > &r)
void mult(Lattice< obj1 > &ret, const Lattice< obj2 > &lhs, const Lattice< obj3 > &rhs)
void conformable(const Lattice< obj1 > &lhs, const Lattice< obj2 > &rhs)
void LatticeCoordinate(Lattice< iobj > &l, int mu)
void peekLocalSite(sobj &s, const LatticeView< vobj > &l, Coordinate &site)
void pokeLocalSite(const sobj &s, LatticeView< vobj > &l, Coordinate &site)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
Lattice< vobj > adj(const Lattice< vobj > &lhs)
void pickCheckerboard(int cb, Lattice< vobj > &half, const Lattice< vobj > &full)
#define autoView(l_v, l, mode)
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL")
@ AcceleratorRead
@ CpuRead
@ AcceleratorWrite
@ CpuWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static constexpr int Nhs
Definition QCD.h:53
static constexpr int Ns
Definition QCD.h:51
static constexpr int Nd
Definition QCD.h:52
static constexpr int Nc
Definition QCD.h:50
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
accelerator_inline std::enable_if<!isGridTensor< T >::value, T >::type TensorRemove(T arg)
#define thread_for(i, num,...)
Definition Threads.h:60
#define strong_inline
Definition Threads.h:36
double usecond(void)
Definition Timer.h:50
#define PREFETCH_CLOVER(BASE)
static INTERNAL_PRECISION U
Definition Zolotarev.cc:230
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
static void ConvertLayout(const CloverDiagonalField &diagonal, const CloverTriangleField &triangle, CloverField &full)
static void MooeeKernel_cpu(int Nsite, int Ls, const FermionField &in, FermionField &out, const CloverDiagonalField &diagonal, const CloverTriangleField &triangle)
static accelerator_inline vobj triangle_elem(const iImplCloverTriangle< vobj > &triangle, int block, int i, int j)
static void Invert(const CloverDiagonalField &diagonal, const CloverTriangleField &triangle, CloverDiagonalField &diagonalInv, CloverTriangleField &triangleInv)
static void MooeeKernel_gpu(int Nsite, int Ls, const FermionField &in, FermionField &out, const CloverDiagonalField &diagonal, const CloverTriangleField &triangle)
static void ModifyBoundaries(CloverDiagonalField &diagonal, CloverTriangleField &triangle, RealD csw_t, RealD cF, RealD diag_mass)
static strong_inline void ApplyBoundaryMask(Field &f, const Mask &m)
static void MooeeKernel(int Nsite, int Ls, const FermionField &in, FermionField &out, const CloverDiagonalField &diagonal, const CloverTriangleField &triangle)
static accelerator_inline int triangle_index(int i, int j)
static void ConvertLayout(const CloverField &full, CloverDiagonalField &diagonal, CloverTriangleField &triangle)
static void SetupMasks(MaskField &full, MaskField &even, MaskField &odd)
void LocalIndexToLocalCoor(int lidx, Coordinate &lcoor)
const Coordinate & GlobalDimensions(void)
int lSites(void) const
static CloverField fillCloverYZ(const GaugeLinkField &F)
void multCloverField(_SpinorField &out, const CloverField &C, const _SpinorField &phi)
static CloverField fillCloverXT(const GaugeLinkField &F)
static CloverField fillCloverXZ(const GaugeLinkField &F)
static CloverField fillCloverXY(const GaugeLinkField &F)
static CloverField fillCloverZT(const GaugeLinkField &F)
static accelerator_inline void multClover(_Spinor &phi, const SiteClover &C, const _Spinor &chi)
static GaugeLinkField Cmunu(std::vector< GaugeLinkField > &U, GaugeLinkField &lambda, int mu, int nu)
static CloverField fillCloverYT(const GaugeLinkField &F)
Definition Simd.h:194