Grid
0.7.0
WilsonKernelsAsmBodyA64FX.h
Go to the documentation of this file.
1
/*************************************************************************************
2
3
Grid physics library, www.github.com/paboyle/Grid
4
5
Source file: WilsonKernelsAsmBodyA64FX.h
6
7
Copyright (C) 2020
8
9
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
10
11
This program is free software; you can redistribute it and/or modify
12
it under the terms of the GNU General Public License as published by
13
the Free Software Foundation; either version 2 of the License, or
14
(at your option) any later version.
15
16
This program is distributed in the hope that it will be useful,
17
but WITHOUT ANY WARRANTY; without even the implied warranty of
18
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
GNU General Public License for more details.
20
21
You should have received a copy of the GNU General Public License along
22
with this program; if not, write to the Free Software Foundation, Inc.,
23
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25
See the full license in the file "LICENSE" in the top level distribution directory
26
*************************************************************************************/
27
/* END LEGAL */
28
29
// GCC 10 messes up SVE instruction scheduling using -O3, but
30
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
31
// performance now is better than armclang 20.2
32
33
#ifdef KERNEL_DAG
34
#define DIR0_PROJ XP_PROJ
35
#define DIR1_PROJ YP_PROJ
36
#define DIR2_PROJ ZP_PROJ
37
#define DIR3_PROJ TP_PROJ
38
#define DIR4_PROJ XM_PROJ
39
#define DIR5_PROJ YM_PROJ
40
#define DIR6_PROJ ZM_PROJ
41
#define DIR7_PROJ TM_PROJ
42
#define DIR0_RECON XP_RECON
43
#define DIR1_RECON YP_RECON_ACCUM
44
#define DIR2_RECON ZP_RECON_ACCUM
45
#define DIR3_RECON TP_RECON_ACCUM
46
#define DIR4_RECON XM_RECON_ACCUM
47
#define DIR5_RECON YM_RECON_ACCUM
48
#define DIR6_RECON ZM_RECON_ACCUM
49
#define DIR7_RECON TM_RECON_ACCUM
50
#else
51
#define DIR0_PROJ XM_PROJ
52
#define DIR1_PROJ YM_PROJ
53
#define DIR2_PROJ ZM_PROJ
54
#define DIR3_PROJ TM_PROJ
55
#define DIR4_PROJ XP_PROJ
56
#define DIR5_PROJ YP_PROJ
57
#define DIR6_PROJ ZP_PROJ
58
#define DIR7_PROJ TP_PROJ
59
#define DIR0_RECON XM_RECON
60
#define DIR1_RECON YM_RECON_ACCUM
61
#define DIR2_RECON ZM_RECON_ACCUM
62
#define DIR3_RECON TM_RECON_ACCUM
63
#define DIR4_RECON XP_RECON_ACCUM
64
#define DIR5_RECON YP_RECON_ACCUM
65
#define DIR6_RECON ZP_RECON_ACCUM
66
#define DIR7_RECON TP_RECON_ACCUM
67
#endif
68
69
//using namespace std;
70
71
#undef SHOW
72
//#define SHOW
73
74
#undef WHERE
75
76
#ifdef INTERIOR_AND_EXTERIOR
77
#define WHERE "INT_AND_EXT"
78
#endif
79
80
#ifdef INTERIOR
81
#define WHERE "INT"
82
#endif
83
84
#ifdef EXTERIOR
85
#define WHERE "EXT"
86
#endif
87
88
//#pragma message("here")
89
90
91
93
// Comms then compute kernel
95
#ifdef INTERIOR_AND_EXTERIOR
96
97
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
98
basep = st.GetPFInfo(nent,plocal); nent++; \
99
if ( local ) { \
100
LOAD_CHIMU(base); \
101
LOAD_TABLE(PERMUTE_DIR); \
102
PROJ; \
103
MAYBEPERM(PERMUTE_DIR,perm); \
104
} else { \
105
LOAD_CHI(base); \
106
} \
107
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
108
MULT_2SPIN_1(Dir); \
109
PREFETCH_CHIMU(base); \
110
PREFETCH_CHIMU_L2(basep); \
111
/* PREFETCH_GAUGE_L1(NxtDir); */
\
112
MULT_2SPIN_2; \
113
if (s == 0) { \
114
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
115
} \
116
RECON; \
117
118
/*
119
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
120
though I expected that it would improve on performance
121
*/
122
123
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
124
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
125
PREFETCH1_CHIMU(base); \
126
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
127
128
#define RESULT(base,basep) SAVE_RESULT(base,basep);
129
130
#endif
131
133
// Pre comms kernel -- prefetch like normal because it is mostly right
135
#ifdef INTERIOR
136
137
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
138
basep = st.GetPFInfo(nent,plocal); nent++; \
139
if ( local ) { \
140
LOAD_CHIMU(base); \
141
LOAD_TABLE(PERMUTE_DIR); \
142
PROJ; \
143
MAYBEPERM(PERMUTE_DIR,perm); \
144
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
145
if ( local || st.same_node[Dir] ) { \
146
MULT_2SPIN_1(Dir); \
147
MULT_2SPIN_2; \
148
RECON; \
149
} \
150
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
151
PREFETCH_CHIMU(base); \
152
PREFETCH_CHIMU_L2(basep); \
153
154
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
155
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
156
PREFETCH1_CHIMU(base); \
157
{ ZERO_PSI; } \
158
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
159
160
#define RESULT(base,basep) SAVE_RESULT(base,basep);
161
162
#endif
163
165
// Post comms kernel
167
#ifdef EXTERIOR
168
169
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
170
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
171
if((!local)&&(!st.same_node[Dir]) ) { \
172
LOAD_CHI(base); \
173
MULT_2SPIN_1(Dir); \
174
MULT_2SPIN_2; \
175
RECON; \
176
nmu++; \
177
}
178
179
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
180
nmu=0; \
181
{ ZERO_PSI;} \
182
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
183
if((!local)&&(!st.same_node[Dir]) ) { \
184
LOAD_CHI(base); \
185
MULT_2SPIN_1(Dir); \
186
MULT_2SPIN_2; \
187
RECON; \
188
nmu++; \
189
}
190
191
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
192
193
#endif
194
195
196
{
197
int
nmu;
198
int
local
,
perm
,
ptype
;
199
uint64_t
base
;
200
uint64_t
basep
;
201
const
uint64_t
plocal
=(uint64_t) & in[0];
202
203
MASK_REGS
;
204
int
nmax
=
U
.oSites();
205
for
(
int
site=0;site<
Ns
;site++) {
206
#ifndef EXTERIOR
207
// int sU =lo.Reorder(ssU);
208
int
sU =ssU;
209
int
ssn=ssU+1;
if
(ssn>=
nmax
) ssn=0;
210
// int sUn=lo.Reorder(ssn);
211
int
sUn=ssn;
212
#else
213
int
sU =ssU;
214
int
ssn=ssU+1;
if
(ssn>=
nmax
) ssn=0;
215
int
sUn=ssn;
216
#endif
217
for
(
int
s=0;s<Ls;s++) {
218
ss =sU*Ls+s;
219
ssn=sUn*Ls+s;
220
int
ent=ss*8;
// 2*Ndim
221
int
nent=ssn*8;
222
223
uint64_t delta_base, delta_base_p;
224
225
ASM_LEG_XP(
Xp
,
Yp
,
PERMUTE_DIR3
,
DIR0_PROJ
,
DIR0_RECON
);
226
227
#ifdef SHOW
228
float
rescale = 64. * 12.;
229
std::cout <<
"================================================================="
<< std::endl;
230
std::cout <<
"ss = "
<< ss <<
" ssn = "
<< ssn << std::endl;
231
std::cout <<
"sU = "
<< sU <<
" ssU = "
<< ssU << std::endl;
232
std::cout <<
" "
<< std::endl;
233
234
235
std::cout <<
"Dir = "
<<
Xp
<<
" "
<< WHERE<< std::endl;
236
237
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
238
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Xp
] << std::endl;
239
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
240
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
241
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
242
std::cout <<
"----------------------------------------------------"
<< std::endl;
243
#endif
244
245
ASM_LEG(
Yp
,
Zp
,
PERMUTE_DIR2
,
DIR1_PROJ
,
DIR1_RECON
);
246
247
#ifdef SHOW
248
std::cout <<
"Dir = "
<<
Yp
<<
" "
<< WHERE<< std::endl;
249
250
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
251
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Yp
] << std::endl;
252
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
253
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
254
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
255
std::cout <<
"----------------------------------------------------"
<< std::endl;
256
#endif
257
258
ASM_LEG(
Zp
,
Tp
,
PERMUTE_DIR1
,
DIR2_PROJ
,
DIR2_RECON
);
259
260
#ifdef SHOW
261
std::cout <<
"Dir = "
<<
Zp
<<
" "
<< WHERE<< std::endl;
262
263
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
264
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Zp
] << std::endl;
265
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
266
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
267
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
268
std::cout <<
"----------------------------------------------------"
<< std::endl;
269
#endif
270
271
ASM_LEG(
Tp
,
Xm
,
PERMUTE_DIR0
,
DIR3_PROJ
,
DIR3_RECON
);
272
273
#ifdef SHOW
274
std::cout <<
"Dir = "
<<
Tp
<<
" "
<< WHERE<< std::endl;
275
276
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
277
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Tp
] << std::endl;
278
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
279
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
280
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
281
std::cout <<
"----------------------------------------------------"
<< std::endl;
282
#endif
283
284
ASM_LEG(
Xm
,
Ym
,
PERMUTE_DIR3
,
DIR4_PROJ
,
DIR4_RECON
);
285
286
#ifdef SHOW
287
std::cout <<
"Dir = "
<<
Xm
<<
" "
<< WHERE<< std::endl;
288
289
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
290
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Xm
] << std::endl;
291
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
292
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
293
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
294
std::cout <<
"----------------------------------------------------"
<< std::endl;
295
#endif
296
297
// DC ZVA test
298
// { uint64_t basestore = (uint64_t)&out[ss];
299
// PREFETCH_RESULT_L2_STORE(basestore); }
300
301
302
ASM_LEG(
Ym
,
Zm
,
PERMUTE_DIR2
,
DIR5_PROJ
,
DIR5_RECON
);
303
304
#ifdef SHOW
305
std::cout <<
"Dir = "
<<
Ym
<<
" "
<< WHERE<< std::endl;
306
307
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
308
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Ym
] << std::endl;
309
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
310
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
311
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
312
std::cout <<
"----------------------------------------------------"
<< std::endl;
313
#endif
314
315
// DC ZVA test
316
//{ uint64_t basestore = (uint64_t)&out[ss];
317
// PREFETCH_RESULT_L2_STORE(basestore); }
318
319
320
ASM_LEG(
Zm
,
Tm
,
PERMUTE_DIR1
,
DIR6_PROJ
,
DIR6_RECON
);
321
322
#ifdef SHOW
323
std::cout <<
"Dir = "
<<
Zm
<<
" "
<< WHERE<< std::endl;
324
325
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
326
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Zm
] << std::endl;
327
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
328
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
329
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
330
std::cout <<
"----------------------------------------------------"
<< std::endl;
331
#endif
332
333
// DC ZVA test
334
//{ uint64_t basestore = (uint64_t)&out[ss];
335
// PREFETCH_RESULT_L2_STORE(basestore); }
336
337
338
ASM_LEG(
Tm
,
Xp
,
PERMUTE_DIR0
,
DIR7_PROJ
,
DIR7_RECON
);
339
340
#ifdef SHOW
341
std::cout <<
"Dir = "
<<
Tm
<<
" "
<< WHERE<< std::endl;
342
343
std::cout <<
"ent nent local perm = "
<< ent <<
" "
<< nent <<
" "
<<
local
<<
" "
<<
perm
<< std::endl;
344
std::cout <<
"st.same_node[Dir] = "
<< st.same_node[
Tm
] << std::endl;
345
std::cout <<
"base = "
<< (
base
-
plocal
)/rescale << std::endl;
346
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
347
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
348
std::cout <<
"----------------------------------------------------"
<< std::endl;
349
#endif
350
351
#ifdef EXTERIOR
352
if
(nmu==0)
break
;
353
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
354
#endif
355
base
= (uint64_t) &out[ss];
356
basep
= st.GetPFInfo(nent,
plocal
); ent++;
357
basep
= (uint64_t) &out[ssn];
358
//PREFETCH_RESULT_L1_STORE(base);
359
RESULT(
base
,
basep
);
360
361
#ifdef SHOW
362
std::cout <<
"Dir = FINAL "
<< WHERE<< std::endl;;
363
364
base_ss =
base
;
365
std::cout <<
"base = "
<< (
base
- (uint64_t) &out[0])/rescale << std::endl;
366
std::cout <<
"Basep = "
<< (
basep
-
plocal
)/rescale << std::endl;
367
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
368
std::cout <<
"----------------------------------------------------"
<< std::endl;
369
#endif
370
371
}
372
ssU++;
373
UNLOCK_GAUGE
(0);
374
}
375
}
376
377
#undef DIR0_PROJ
378
#undef DIR1_PROJ
379
#undef DIR2_PROJ
380
#undef DIR3_PROJ
381
#undef DIR4_PROJ
382
#undef DIR5_PROJ
383
#undef DIR6_PROJ
384
#undef DIR7_PROJ
385
#undef DIR0_RECON
386
#undef DIR1_RECON
387
#undef DIR2_RECON
388
#undef DIR3_RECON
389
#undef DIR4_RECON
390
#undef DIR5_RECON
391
#undef DIR6_RECON
392
#undef DIR7_RECON
393
#undef ASM_LEG
394
#undef ASM_LEG_XP
395
#undef RESULT
UNLOCK_GAUGE
#define UNLOCK_GAUGE(dir)
Definition
BGQQPX.h:138
MASK_REGS
#define MASK_REGS
Definition
BGQQPX.h:65
perm
#define perm(a, b, n, w)
Definition
Grid_generic.h:379
Xm
static constexpr int Xm
Definition
QCD.h:45
Tm
static constexpr int Tm
Definition
QCD.h:48
Ns
static constexpr int Ns
Definition
QCD.h:51
Tp
static constexpr int Tp
Definition
QCD.h:44
Zp
static constexpr int Zp
Definition
QCD.h:43
Zm
static constexpr int Zm
Definition
QCD.h:47
Xp
static constexpr int Xp
Definition
QCD.h:41
Yp
static constexpr int Yp
Definition
QCD.h:42
Ym
static constexpr int Ym
Definition
QCD.h:46
PERMUTE_DIR2
#define PERMUTE_DIR2
Definition
StaggeredKernelsAsm.h:802
PERMUTE_DIR1
#define PERMUTE_DIR1
Definition
StaggeredKernelsAsm.h:807
PERMUTE_DIR0
#define PERMUTE_DIR0
Definition
StaggeredKernelsAsm.h:812
PERMUTE_DIR3
#define PERMUTE_DIR3
Definition
StaggeredKernelsAsm.h:797
DIR0_PROJ
#define DIR0_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:51
DIR3_PROJ
#define DIR3_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:54
DIR2_PROJ
#define DIR2_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:53
DIR6_PROJ
#define DIR6_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:57
DIR5_PROJ
#define DIR5_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:56
DIR7_PROJ
#define DIR7_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:58
DIR1_PROJ
#define DIR1_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:52
DIR4_PROJ
#define DIR4_PROJ
Definition
WilsonKernelsAsmBodyA64FX.h:55
plocal
const uint64_t plocal
Definition
WilsonKernelsAsmBody.h:133
ptype
int ptype
Definition
WilsonKernelsAsmBody.h:130
basep
uint64_t basep
Definition
WilsonKernelsAsmBody.h:132
local
int local
Definition
WilsonKernelsAsmBody.h:130
DIR1_RECON
#define DIR1_RECON
Definition
WilsonKernelsAsmBody.h:28
nmax
int nmax
Definition
WilsonKernelsAsmBody.h:137
DIR3_RECON
#define DIR3_RECON
Definition
WilsonKernelsAsmBody.h:30
DIR7_RECON
#define DIR7_RECON
Definition
WilsonKernelsAsmBody.h:34
DIR6_RECON
#define DIR6_RECON
Definition
WilsonKernelsAsmBody.h:33
DIR2_RECON
#define DIR2_RECON
Definition
WilsonKernelsAsmBody.h:29
DIR5_RECON
#define DIR5_RECON
Definition
WilsonKernelsAsmBody.h:32
DIR4_RECON
#define DIR4_RECON
Definition
WilsonKernelsAsmBody.h:31
base
uint64_t base
Definition
WilsonKernelsAsmBody.h:131
DIR0_RECON
#define DIR0_RECON
Definition
WilsonKernelsAsmBody.h:27
U
static INTERNAL_PRECISION U
Definition
Zolotarev.cc:230
Grid
qcd
action
fermion
implementation
WilsonKernelsAsmBodyA64FX.h
Generated by
1.16.1