Grid 0.7.0
WilsonKernelsAsmBodyA64FX.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: WilsonKernelsAsmBodyA64FX.h
6
7 Copyright (C) 2020
8
9Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28
29// GCC 10 schedules SVE instructions poorly at plain -O3, but compiling with
30// -O3 -fno-schedule-insns -fno-schedule-insns2 fixes this;
31// with those flags, performance is now better than armclang 20.2.
32
// Map the generic per-leg names DIR0..DIR7 (_PROJ and _RECON) onto the
// direction-specific macros. With KERNEL_DAG defined, legs 0-3 use the
// XP/YP/ZP/TP macros and legs 4-7 use XM/YM/ZM/TM; without it the two groups
// are swapped. In both cases DIR0_RECON is the plain *_RECON macro while
// DIR1..DIR7 use the *_RECON_ACCUM variants (presumably the first leg
// initialises the result and the rest accumulate into it -- confirm against
// the macro definitions, which are not in this file).
33#ifdef KERNEL_DAG
34#define DIR0_PROJ XP_PROJ
35#define DIR1_PROJ YP_PROJ
36#define DIR2_PROJ ZP_PROJ
37#define DIR3_PROJ TP_PROJ
38#define DIR4_PROJ XM_PROJ
39#define DIR5_PROJ YM_PROJ
40#define DIR6_PROJ ZM_PROJ
41#define DIR7_PROJ TM_PROJ
42#define DIR0_RECON XP_RECON
43#define DIR1_RECON YP_RECON_ACCUM
44#define DIR2_RECON ZP_RECON_ACCUM
45#define DIR3_RECON TP_RECON_ACCUM
46#define DIR4_RECON XM_RECON_ACCUM
47#define DIR5_RECON YM_RECON_ACCUM
48#define DIR6_RECON ZM_RECON_ACCUM
49#define DIR7_RECON TM_RECON_ACCUM
// Non-dagger mapping: same scheme with the P and M groups exchanged.
50#else
51#define DIR0_PROJ XM_PROJ
52#define DIR1_PROJ YM_PROJ
53#define DIR2_PROJ ZM_PROJ
54#define DIR3_PROJ TM_PROJ
55#define DIR4_PROJ XP_PROJ
56#define DIR5_PROJ YP_PROJ
57#define DIR6_PROJ ZP_PROJ
58#define DIR7_PROJ TP_PROJ
59#define DIR0_RECON XM_RECON
60#define DIR1_RECON YM_RECON_ACCUM
61#define DIR2_RECON ZM_RECON_ACCUM
62#define DIR3_RECON TM_RECON_ACCUM
63#define DIR4_RECON XP_RECON_ACCUM
64#define DIR5_RECON YP_RECON_ACCUM
65#define DIR6_RECON ZP_RECON_ACCUM
66#define DIR7_RECON TP_RECON_ACCUM
67#endif
68
69//using namespace std;
70
// Debug tracing switch: SHOW is forced off here; re-enable the commented
// #define below to compile in the std::cout diagnostics of the kernel body.
71#undef SHOW
72//#define SHOW
73
// WHERE is a string label naming the kernel variant being compiled; it is
// only used by the SHOW diagnostics. It is set from whichever of
// INTERIOR_AND_EXTERIOR / INTERIOR / EXTERIOR is defined by the including
// file (the includer is expected to define exactly one -- if several are
// defined, WHERE would be redefined by the later blocks).
74#undef WHERE
75
76#ifdef INTERIOR_AND_EXTERIOR
77#define WHERE "INT_AND_EXT"
78#endif
79
80#ifdef INTERIOR
81#define WHERE "INT"
82#endif
83
84#ifdef EXTERIOR
85#define WHERE "EXT"
86#endif
87
88//#pragma message("here")
89
90
91
93// Comms then compute kernel
95#ifdef INTERIOR_AND_EXTERIOR
96
// ASM_LEG (INTERIOR_AND_EXTERIOR variant): process one stencil leg `Dir`.
//   Dir         - direction index of this leg (passed to MULT_2SPIN_1 and,
//                 for Dir 0 or 4 when s == 0, to PREFETCH_GAUGE_L2)
//   NxtDir      - direction whose stencil entry is looked up for the next leg
//   PERMUTE_DIR - argument forwarded to LOAD_TABLE and MAYBEPERM
//   PROJ, RECON - projection / reconstruction macros expanded in place
// Sequence: fetch the prefetch address `basep` (advancing `nent`); if `local`,
// load the input at `base`, project and possibly permute it, otherwise load
// it with LOAD_CHI; then look up `base` for NxtDir (advancing `ent`),
// and interleave MULT_2SPIN_1 / prefetches / MULT_2SPIN_2 before RECON.
// Relies on st, base, basep, ent, nent, local, perm, ptype, plocal and s
// being in scope at the expansion site.
97#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
98 basep = st.GetPFInfo(nent,plocal); nent++; \
99 if ( local ) { \
100 LOAD_CHIMU(base); \
101 LOAD_TABLE(PERMUTE_DIR); \
102 PROJ; \
103 MAYBEPERM(PERMUTE_DIR,perm); \
104 } else { \
105 LOAD_CHI(base); \
106 } \
107 base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
108 MULT_2SPIN_1(Dir); \
109 PREFETCH_CHIMU(base); \
110 PREFETCH_CHIMU_L2(basep); \
111 /* PREFETCH_GAUGE_L1(NxtDir); */ \
112 MULT_2SPIN_2; \
113 if (s == 0) { \
114 if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
115 } \
116 RECON; \
117
118/*
119NB: using PREFETCH_GAUGE_L2(Dir+4) here instead results in a performance penalty,
120 although I expected it to improve performance.
121*/
122
// ASM_LEG_XP (INTERIOR_AND_EXTERIOR variant): entry leg. Unlike ASM_LEG it
// first looks up `base` for `Dir` itself (advancing `ent`) and issues
// PREFETCH1_CHIMU on it, then expands ASM_LEG with the same arguments.
123#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
124 base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
125 PREFETCH1_CHIMU(base); \
126 ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
127
// RESULT (INTERIOR_AND_EXTERIOR variant): forwards both addresses to
// SAVE_RESULT unconditionally.
128#define RESULT(base,basep) SAVE_RESULT(base,basep);
129
130#endif
131
133// Pre comms kernel -- prefetch like normal because it is mostly right
135#ifdef INTERIOR
136
// ASM_LEG (INTERIOR variant): process one stencil leg `Dir`, but only when the
// neighbour is `local` or st.same_node[Dir] is set; otherwise nothing is
// loaded, multiplied or reconstructed for this leg.
// The lookup of `base` for NxtDir (advancing `ent`) and the two prefetches
// are performed in every case, after the optional compute.
// Relies on st, base, basep, ent, nent, local, perm, ptype and plocal being
// in scope at the expansion site.
137#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
138 basep = st.GetPFInfo(nent,plocal); nent++; \
139 if ( local ) { \
140 LOAD_CHIMU(base); \
141 LOAD_TABLE(PERMUTE_DIR); \
142 PROJ; \
143 MAYBEPERM(PERMUTE_DIR,perm); \
144 }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
145 if ( local || st.same_node[Dir] ) { \
146 MULT_2SPIN_1(Dir); \
147 MULT_2SPIN_2; \
148 RECON; \
149 } \
150 base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
151 PREFETCH_CHIMU(base); \
153
// ASM_LEG_XP (INTERIOR variant): entry leg. Looks up `base` for `Dir`
// (advancing `ent`), prefetches it, expands ZERO_PSI, then expands ASM_LEG.
// ZERO_PSI is needed here presumably because individual legs may be skipped
// in this variant -- confirm against the ZERO_PSI / RECON definitions.
154#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
155 base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
156 PREFETCH1_CHIMU(base); \
157 { ZERO_PSI; } \
158 ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
159
// RESULT (INTERIOR variant): forwards both addresses to SAVE_RESULT
// unconditionally.
160#define RESULT(base,basep) SAVE_RESULT(base,basep);
161
162#endif
163
165// Post comms kernel
167#ifdef EXTERIOR
168
// ASM_LEG (EXTERIOR variant): looks up `base` for `Dir` itself (advancing
// `ent`) and computes the leg only when the neighbour is neither `local` nor
// on the same node (st.same_node[Dir] is false). Each leg that is computed
// increments `nmu`. NxtDir, PERMUTE_DIR and PROJ are accepted but unused here.
169#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
170 base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
171 if((!local)&&(!st.same_node[Dir]) ) { \
172 LOAD_CHI(base); \
173 MULT_2SPIN_1(Dir); \
174 MULT_2SPIN_2; \
175 RECON; \
176 nmu++; \
177 }
178
// ASM_LEG_XP (EXTERIOR variant): entry leg. Resets the contribution counter
// `nmu` to zero and expands ZERO_PSI, then performs the same
// lookup / conditional compute as the EXTERIOR ASM_LEG (body is duplicated
// rather than delegating to ASM_LEG).
179#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
180 nmu=0; \
181 { ZERO_PSI;} \
182 base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
183 if((!local)&&(!st.same_node[Dir]) ) { \
184 LOAD_CHI(base); \
185 MULT_2SPIN_1(Dir); \
186 MULT_2SPIN_2; \
187 RECON; \
188 nmu++; \
189 }
190
// RESULT (EXTERIOR variant): expands ADD_RESULT only if at least one leg
// contributed (nmu != 0).
// NOTE(review): `base` is passed for both ADD_RESULT arguments and `basep` is
// ignored -- looks intentional (no prefetch target used), but confirm against
// the ADD_RESULT definition.
191#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
192
193#endif
194
195
// Kernel body: this brace block is textually included into a function that
// must already provide st, in, out, U, ss, ssU, Ls and Ns. It walks `Ns`
// consecutive outer sites starting at ssU and, for each of the Ls inner
// indices, processes the stencil legs and writes the result through RESULT.
// NOTE(review): the embedded listing numbers skip (e.g. 197->199, 202->204,
// 224->226, 244->246, ...), so some original statements -- most likely the
// register setup, the declarations of local/perm/ptype and the
// ASM_LEG_XP / ASM_LEG invocations -- are not visible in this listing.
196{
// Number of legs that contributed; only maintained by the EXTERIOR macros.
197 int nmu;
// Address used for the current leg's input, and address used for prefetching.
199 uint64_t base;
200 uint64_t basep;
// Address of the first input element, passed to the stencil lookups.
201 const uint64_t plocal =(uint64_t) & in[0];
202
// Number of outer sites; used to wrap the "next site" index back to 0.
204 int nmax=U.oSites();
205 for(int site=0;site<Ns;site++) {
// Current outer site (sU) and the following one (sUn, wrapping at nmax).
// Both preprocessor branches are identical apart from the commented-out
// lo.Reorder() calls.
206#ifndef EXTERIOR
207 // int sU =lo.Reorder(ssU);
208 int sU =ssU;
209 int ssn=ssU+1; if(ssn>=nmax) ssn=0;
210 // int sUn=lo.Reorder(ssn);
211 int sUn=ssn;
212#else
213 int sU =ssU;
214 int ssn=ssU+1; if(ssn>=nmax) ssn=0;
215 int sUn=ssn;
216#endif
217 for(int s=0;s<Ls;s++) {
// Flattened (outer site, s) indices for the current and the next site;
// note ssn is reused here with a new meaning (full index, not outer site).
218 ss =sU*Ls+s;
219 ssn=sUn*Ls+s;
// Stencil entry cursors: 8 entries per flattened site.
220 int ent=ss*8;// 2*Ndim
221 int nent=ssn*8;
222
// NOTE(review): not referenced anywhere in the visible code.
223 uint64_t delta_base, delta_base_p;
224
226
// Diagnostics, compiled only with SHOW defined: print the site indices and,
// after each leg, the stencil state and the base/basep offsets relative to
// plocal, scaled by `rescale`.
227#ifdef SHOW
228 float rescale = 64. * 12.;
229 std::cout << "=================================================================" << std::endl;
230 std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
231 std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
232 std::cout << " " << std::endl;
233
234
235 std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
236
237 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
238 std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
239 std::cout << "base = " << (base - plocal)/rescale << std::endl;
240 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
241 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
242 std::cout << "----------------------------------------------------" << std::endl;
243#endif
244
246
247#ifdef SHOW
248 std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
249
250 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
251 std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
252 std::cout << "base = " << (base - plocal)/rescale << std::endl;
253 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
254 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
255 std::cout << "----------------------------------------------------" << std::endl;
256#endif
257
259
260#ifdef SHOW
261 std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
262
263 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
264 std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
265 std::cout << "base = " << (base - plocal)/rescale << std::endl;
266 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
267 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
268 std::cout << "----------------------------------------------------" << std::endl;
269#endif
270
272
273#ifdef SHOW
274 std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
275
276 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
277 std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
278 std::cout << "base = " << (base - plocal)/rescale << std::endl;
279 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
280 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
281 std::cout << "----------------------------------------------------" << std::endl;
282#endif
283
285
286#ifdef SHOW
287 std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
288
289 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
290 std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
291 std::cout << "base = " << (base - plocal)/rescale << std::endl;
292 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
293 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
294 std::cout << "----------------------------------------------------" << std::endl;
295#endif
296
297 // DC ZVA test
298 // { uint64_t basestore = (uint64_t)&out[ss];
299 // PREFETCH_RESULT_L2_STORE(basestore); }
300
301
303
304#ifdef SHOW
305 std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
306
307 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
308 std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
309 std::cout << "base = " << (base - plocal)/rescale << std::endl;
310 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
311 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
312 std::cout << "----------------------------------------------------" << std::endl;
313#endif
314
315 // DC ZVA test
316 //{ uint64_t basestore = (uint64_t)&out[ss];
317 // PREFETCH_RESULT_L2_STORE(basestore); }
318
319
321
322#ifdef SHOW
323 std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
324
325 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
326 std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
327 std::cout << "base = " << (base - plocal)/rescale << std::endl;
328 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
329 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
330 std::cout << "----------------------------------------------------" << std::endl;
331#endif
332
333 // DC ZVA test
334 //{ uint64_t basestore = (uint64_t)&out[ss];
335 // PREFETCH_RESULT_L2_STORE(basestore); }
336
337
339
340#ifdef SHOW
341 std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
342
343 std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
344 std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
345 std::cout << "base = " << (base - plocal)/rescale << std::endl;
346 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
347 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
348 std::cout << "----------------------------------------------------" << std::endl;
349#endif
350
// EXTERIOR only: if no leg contributed for this s, leave the inner s-loop
// entirely (break, not continue), skipping the remaining s values.
351#ifdef EXTERIOR
352 if (nmu==0) break;
353 // if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
354#endif
// Store/accumulate the result for the current site; basep ends up pointing
// at the next site's output element.
// NOTE(review): the GetPFInfo() value is overwritten on the following line,
// and the statement increments `ent` although it indexes with `nent`
// (the ASM_LEG macros pair GetPFInfo with nent++). `ent` is re-initialised
// at the top of each s iteration, so this looks harmless -- confirm.
355 base = (uint64_t) &out[ss];
356 basep= st.GetPFInfo(nent,plocal); ent++;
357 basep = (uint64_t) &out[ssn];
358 //PREFETCH_RESULT_L1_STORE(base);
359 RESULT(base,basep);
360
361#ifdef SHOW
362 std::cout << "Dir = FINAL " << WHERE<< std::endl;;
363
// NOTE(review): base_ss is not declared anywhere in the visible code; this
// would only matter when SHOW is enabled.
364 base_ss = base;
365 std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
366 std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
367 //printf("U = %llu\n", (uint64_t)&[sU](Dir));
368 std::cout << "----------------------------------------------------" << std::endl;
369#endif
370
371 }
// Advance to the next outer site; ssU is owned by the enclosing scope, so the
// increment is visible to the includer.
372 ssU++;
373 UNLOCK_GAUGE(0);
374 }
375}
376
// Remove every helper macro defined by this file so that it can be included
// again with a different KERNEL_DAG / INTERIOR / EXTERIOR configuration.
377#undef DIR0_PROJ
378#undef DIR1_PROJ
379#undef DIR2_PROJ
380#undef DIR3_PROJ
381#undef DIR4_PROJ
382#undef DIR5_PROJ
383#undef DIR6_PROJ
384#undef DIR7_PROJ
385#undef DIR0_RECON
386#undef DIR1_RECON
387#undef DIR2_RECON
388#undef DIR3_RECON
389#undef DIR4_RECON
390#undef DIR5_RECON
391#undef DIR6_RECON
392#undef DIR7_RECON
393#undef ASM_LEG
394#undef ASM_LEG_XP
395#undef RESULT
#define UNLOCK_GAUGE(dir)
Definition BGQQPX.h:138
#define MASK_REGS
Definition BGQQPX.h:65
#define perm(a, b, n, w)
static constexpr int Xm
Definition QCD.h:45
static constexpr int Tm
Definition QCD.h:48
static constexpr int Ns
Definition QCD.h:51
static constexpr int Tp
Definition QCD.h:44
static constexpr int Zp
Definition QCD.h:43
static constexpr int Zm
Definition QCD.h:47
static constexpr int Xp
Definition QCD.h:41
static constexpr int Yp
Definition QCD.h:42
static constexpr int Ym
Definition QCD.h:46
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0
#define PERMUTE_DIR3
#define DIR0_PROJ
#define DIR3_PROJ
#define DIR2_PROJ
#define DIR6_PROJ
#define DIR5_PROJ
#define DIR7_PROJ
#define DIR1_PROJ
#define DIR4_PROJ
const uint64_t plocal
uint64_t basep
#define DIR1_RECON
#define DIR3_RECON
#define DIR7_RECON
#define DIR6_RECON
#define DIR2_RECON
#define DIR5_RECON
#define DIR4_RECON
uint64_t base
#define DIR0_RECON
static INTERNAL_PRECISION U
Definition Zolotarev.cc:230