Grid 0.7.0
BGQQPX.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/BGQQPX.h
6
7 Copyright (C) 2015
8
9Author: paboyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#ifndef GRID_ASM_BGQ_QPX_H
29#define GRID_ASM_BGQ_QPX_H
30
31#include <stddint.h>
32
/*********************************************************
 * Architectural macros
 *********************************************************/
/* Update-form indexed vector loads/stores: EA = OFF + PTR, and PTR is
 * updated to EA.  f = single precision, d = double precision.
 * Review: the original expansions ended in ") ;\n" -- the stray close
 * parenthesis would have been emitted into the assembly stream; removed. */
#define VLOADf(OFF,PTR,DEST)       "qvlfsux " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)       "qvlfdux " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)       "qvstfsux " #SRC "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)       "qvstfdux " #SRC "," #OFF "," #PTR " ;\n"
/* Complex load-and-splat.  qvlfcsxa is the single-precision form and
 * qvlfcdxa the double-precision form; the original had the two mnemonics
 * swapped between the f/d variants -- confirm against the QPX ISA. */
#define VSPLATf(A,B,DEST)          "qvlfcsxa " #A "," #B "," #DEST ";\n"
#define VSPLATd(A,B,DEST)          "qvlfcdxa " #A "," #B "," #DEST ";\n"

#define LOAD64(A,ptr)  /* no-op placeholder on BG/Q */
#define VZERO(DEST)    "qvfclr " #DEST "; \n"
/* Review: the space between the macro name and '(' made VONE and VNEG
 * object-like macros whose expansion began with the parameter list; the
 * space has been removed so they are function-like as every use expects. */
#define VONE(DEST)     "qvfset " #DEST "; \n"
#define VNEG(SRC,DEST) "qvfneg " #DEST "," #SRC "; \n"
/* Review: a stray comma after #DEST split the expansion into two
 * comma-separated string literals; replaced with a concatenated ",". */
#define VMOV(A,DEST)   "qvfmr " #DEST "," #A ";\n"
48
// QPX floating-point arithmetic; every macro emits one instruction of the
// form  DEST = op(A,B[,C]).  The _RR_RI / _MII_IR / _II_MIR suffixes name the
// role each cross-multiply plays in complex arithmetic with interleaved
// re/im lanes (real*real & real*imag, minus-imag*imag & imag*real, ...).
49#define VADD(A,B,DEST) "qvfadd " #DEST "," #A "," #B ";\n"
50#define VSUB(A,B,DEST) "qvfsub " #DEST "," #A "," #B ";\n"
51#define VMUL(A,B,DEST) "qvfmul " #DEST "," #A "," #B ";\n"
52#define VMUL_RR_RI(A,B,DEST) "qvfxmul " #DEST "," #A "," #B ";\n"
53#define VMADD(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
54#define VMADD_RR_RI(A,B,C,DEST) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
55#define VMADD_MII_IR(A,B,C,DEST) "qvfxxnpmadd " #DEST "," #A "," #B ","#C ";\n"
// NOTE(review): VMADD_II_MIR expands to plain qvfmadd, byte-identical to
// VMADD, while its MII_IR counterpart uses the cross form qvfxxnpmadd.  This
// looks like it should be one of the qvfxx* cross madds (qvfxxcpnmadd?) --
// confirm against the QPX ISA before relying on it.
56#define VMADD_II_MIR(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
57
/* Data-cache control primitives.  "%%r0" in the RA slot of a PowerPC
 * cache op means "no base register", so the effective address is simply
 * the PTR operand supplied through the "r" constraint.
 * Review: CACHE_LOCK, CACHE_FLUSH and CACHE_TOUCH had a space between the
 * macro name and the parameter list, which made them OBJECT-like macros
 * whose expansion began with "(PTR)" -- any use such as CACHE_LOCK(addr)
 * would not have substituted the argument.  The space has been removed so
 * all four are function-like, matching CACHE_UNLOCK. */
#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt %%r0, %0 \n" : : "r" (PTR) );
62
63// Gauge field locking 2 x 9 complex == 18*8 / 16 bytes per link
64// This is 144/288 bytes == 4.5; 9 lines
// Hooks required by the generic kernel driver; BG/Q needs none of them,
// so they expand to nothing.
65#define MASK_REGS /*NOOP ON BGQ*/
66#define PF_GAUGE(A) /*NOOP ON BGQ*/
67#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
68#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
69
/*********************************************************
 * Register definitions
 *********************************************************/
/* QPX vector-register allocation for the Wilson dslash kernel:
 *   psi_sc  : result four-spinor, spin s (0-3), colour c (0-2) -> regs 0-11
 *   Chi_sc  : spin-projected two-spinor input                  -> regs 12-17
 *   UChi_sc : gauge link applied to the two-spinor             -> regs 18-23
 *   U0-U2   : one column of the SU(3) link at a time           -> regs 24-26
 *   one     : constant 1.0 used by the cross-madd tricks       -> reg  27 */
#define psi_00 0
#define psi_01 1
#define psi_02 2

#define psi_10 3
#define psi_11 4
#define psi_12 5

#define psi_20 6
#define psi_21 7
#define psi_22 8

#define psi_30 9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27

/* Scratch GPRs referenced from inline-asm strings only. */
#define REP %%r16
#define IMM %%r17

/*Alias regs*/
/* The full four-spinor Chimu overlays Chi and UChi, which are not live at
 * the same time.
 * Review: the original aliased Chimu_12 to Chi_02 and Chimu_32 to UChi_02 --
 * copy-paste typos that made two pairs of logically distinct registers share
 * storage (14 and 20) instead of following the _10/_11/_12 pattern.
 * Corrected to Chi_12 (17) and UChi_12 (23). */
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
126
127/*********************************************************
128 * Macro sequences encoding QCD
129 *********************************************************/
// Pin (dcbtls, via CACHE_LOCK) one direction of the gauge link into L1,
// walking the link in 32-byte cache-line steps.
// NOTE(review): relies on U, sU and BYTES_PER_WORD being in scope at the
// expansion site -- none of them is defined in this header; confirm the
// caller provides them and that U[sU](dir) is the intended accessor.
130#define LOCK_GAUGE(dir) \
131 { \
132 uint8_t *byte_addr = (uint8_t *)&U[sU](dir); \
133 for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
134 CACHE_LOCK(&byte_addr[i]); \
135 } \
136 }
137
// Companion to LOCK_GAUGE: releases the same address range with dcblc.
138#define UNLOCK_GAUGE(dir) \
139 { \
140 uint8_t *byte_addr = (uint8_t *)&U[sU](dir); \
141 for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
142 CACHE_UNLOCK(&byte_addr[i]); \
143 } \
144 }
145
// Lane permutes are unnecessary on BG/Q (complex pairs already interleaved),
// so the permute hooks expand to nothing.
146#define MAYBEPERM(A,B)
147
148#define PERMUTE_DIR3
149#define PERMUTE_DIR2
150#define PERMUTE_DIR1
151#define PERMUTE_DIR0
152
// Direction-specific gauge multiplies: all eight map onto the same SU(3)
// multiply with the link for direction A; the PF (prefetch) hint is ignored.
// NOTE(review): these expand to MULT_2SPIN, but this header only defines
// MULT_SPIN (below) -- one of the two names is wrong; confirm which was
// intended before using this header.
153#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U[sU](A),p)
154#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U[sU](A),p)
155#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U[sU](A),p)
156#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U[sU](A),p)
157#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U[sU](A),p)
158#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U[sU](A),p)
159#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U[sU](A),p)
160#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U[sU](A),p)
161
// 3x3 complex (SU(3)) matrix times the two-spinor Chi, one column per asm
// block: UChi_sc = sum_c' U[c][c'] * Chi_sc'.  Each block loads one column
// of U (operands %0-%2 are byte offsets of the column elements, %3 the base
// address), does the real*{re,im} products, then folds in the imaginary
// parts with the cross madd.
// NOTE(review): the `ptr` parameter is never used -- the macro reads `base`
// from the expansion site instead; and it uses the suffix-less VLOAD, which
// is not defined in this header (presumably mapped to VLOADf/VLOADd by the
// precision-selecting includer).  Confirm both before use.
162#define MULT_SPIN(ptr,p) { \
163 uint64_t ub = ((uint64_t)base); \
164 asm ( \
165 VLOAD(%0,%3,U0) \
166 VLOAD(%1,%3,U1) \
167 VLOAD(%2,%3,U2) \
168 VMUL_RR_RI(U0,Chi_00,UChi_00) \
169 VMUL_RR_RI(U1,Chi_00,UChi_01) \
170 VMUL_RR_RI(U2,Chi_00,UChi_02) \
171 VMUL_RR_RI(U0,Chi_10,UChi_10) \
172 VMUL_RR_RI(U1,Chi_10,UChi_11) \
173 VMUL_RR_RI(U2,Chi_10,UChi_12) \
174 VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \
175 VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \
176 VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \
177 VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
178 VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
179 VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
180 : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \
181 asm ( \
182 VLOAD(%0,%3,U0) \
183 VLOAD(%1,%3,U1) \
184 VLOAD(%2,%3,U2) \
185 VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
186 VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
187 VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
188 VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
189 VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
190 VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
191 VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
192 VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
193 VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
194 VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
195 VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
196 VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
197 : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \
198 asm ( \
199 VLOAD(%0,%3,U0) \
200 VLOAD(%1,%3,U1) \
201 VLOAD(%2,%3,U2) \
202 VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
203 VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
204 VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
205 VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
206 VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
207 VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
208 VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
209 VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
210 VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
211 VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
212 VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
213 VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
214 : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \
215 }
216
// Store the twelve psi result registers to `base`, using update-form stores
// (VSTORE pre-increments the address register by IMM=32 each time, hence the
// initial base-32).  `basep` is accepted for interface compatibility only.
// NOTE(review): as written this cannot work -- the asm references %0 but
// passes no operand list (so `ub` is never bound and is unused), "mr %0,"REP
// appears operand-reversed for setting up REP, and the suffix-less VSTORE is
// not defined in this header.  Recover a working version before use.
217#define SAVE_RESULT(base,basep) { \
218 uint64_t ub = ((uint64_t)base) - 32; \
219 asm("mr %0,"REP";\n\t" \
220 "li " IMM ",32;\n\t" \
221 VSTORE(IMM,REP,psi_00) \
222 VSTORE(IMM,REP,psi_01) \
223 VSTORE(IMM,REP,psi_02) \
224 VSTORE(IMM,REP,psi_10) \
225 VSTORE(IMM,REP,psi_11) \
226 VSTORE(IMM,REP,psi_12) \
227 VSTORE(IMM,REP,psi_20) \
228 VSTORE(IMM,REP,psi_21) \
229 VSTORE(IMM,REP,psi_22) \
230 VSTORE(IMM,REP,psi_30) \
231 VSTORE(IMM,REP,psi_31) \
232 VSTORE(IMM,REP,psi_32) \
233 ); \
234 }
235
236/*
237 *Annoying BG/Q loads with no immediate indexing and big performance hit
238 *when second miss to a L1 line occurs
239 */
// Load the six two-spinor registers in two passes of stride 64, the second
// pass offset by +32, so that consecutive loads never touch the same L1 line
// twice in a row (see the comment above).
// NOTE(review): the second asm writes IMM literally ("li IMM,64") instead of
// stringising the register macro as the first does, and "mr %0,"REP looks
// operand-reversed (it would move REP into the input register rather than
// initialise REP).  Confirm against a working QPX kernel before use.
240#define LOAD_CHI(base) { \
241 uint64_t ub = ((uint64_t)base) - 64; \
242 asm("mr %0,"REP";\n\t" \
243 "li " IMM ",64;\n\t" \
244 VLOAD(IMM,REP,Chi_00) \
245 VLOAD(IMM,REP,Chi_02) \
246 VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
247 ub = ((uint64_t)base) - 32; \
248 asm("mr %0,"REP";\n\t" \
249 "li IMM,64;\n\t" \
250 VLOAD(IMM,REP,Chimu_01) \
251 VLOAD(IMM,REP,Chimu_10) \
252 VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
253 }
254
// Load the full twelve-register four-spinor Chimu, interleaved in two
// stride-64 passes (second pass offset +32) to avoid back-to-back misses on
// the same L1 line.
// NOTE(review): shares the defects flagged on LOAD_CHI -- "li IMM,64" is not
// stringised and "mr %0,"REP looks operand-reversed; confirm before use.
255#define LOAD_CHIMU(base) { \
256 uint64_t ub = ((uint64_t)base) - 64; \
257 asm("mr %0,"REP";\n\t" \
258 "li IMM,64;\n\t" \
259 VLOAD(IMM,REP,Chimu_00) \
260 VLOAD(IMM,REP,Chimu_02) \
261 VLOAD(IMM,REP,Chimu_11) \
262 VLOAD(IMM,REP,Chimu_20) \
263 VLOAD(IMM,REP,Chimu_22) \
264 VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
265 ub = ((uint64_t)base) - 32; \
266 asm("mr %0,"REP";\n\t" \
267 "li IMM,64;\n\t" \
268 VLOAD(IMM,REP,Chimu_01) \
269 VLOAD(IMM,REP,Chimu_10) \
270 VLOAD(IMM,REP,Chimu_12) \
271 VLOAD(IMM,REP,Chimu_21) \
272 VLOAD(IMM,REP,Chimu_30) \
273 VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
274 }
275
276// hspin(0)=fspin(0)+timesI(fspin(3));
277// hspin(1)=fspin(1)+timesI(fspin(2));
// Gamma-x plus projector: loads Chimu then forms Chi_sc = Chimu_sc +/- i *
// (partner spin component), using the `one` register with the cross madd to
// realise the multiply-by-i.
278#define XP_PROJMEM(base) { \
279 LOAD_CHIMU(base); \
280 asm ( \
281 VONE(one) \
282 VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \
283 VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \
284 VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \
285 VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \
286 VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \
287 VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \
288 ); \
289 }
290
// Gamma-x minus projector: as XP_PROJMEM with the opposite sign of i
// (II_MIR instead of MII_IR).
291#define XM_PROJMEM(base) { \
292 LOAD_CHIMU(base); \
293 asm ( \
294 VONE(one) \
295 VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \
296 VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \
297 VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \
298 VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \
299 VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \
300 VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \
301 ); \
302 }
303
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
/* Gamma-y projectors.
 * Review: the original operand order was scrambled -- with VADD/VSUB(A,B,DEST)
 * meaning DEST = A op B, it wrote into the nonexistent registers Chi_2x/Chi_3x
 * and computed Chimu_s0 - Chimu_s0.  Rewritten to match the projection stated
 * in the comment above: Chi_0c = Chimu_0c -/+ Chimu_3c and
 * Chi_1c = Chimu_1c +/- Chimu_2c.  -- confirm against a working QPX kernel. */
#define YP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VSUB(Chimu_00,Chimu_30,Chi_00) \
	 VSUB(Chimu_01,Chimu_31,Chi_01) \
	 VSUB(Chimu_02,Chimu_32,Chi_02) \
	 VADD(Chimu_10,Chimu_20,Chi_10) \
	 VADD(Chimu_11,Chimu_21,Chi_11) \
	 VADD(Chimu_12,Chimu_22,Chi_12) \
	 ); \
  }

/* Gamma-y minus projector: signs swapped relative to YP_PROJMEM. */
#define YM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VADD(Chimu_00,Chimu_30,Chi_00) \
	 VADD(Chimu_01,Chimu_31,Chi_01) \
	 VADD(Chimu_02,Chimu_32,Chi_02) \
	 VSUB(Chimu_10,Chimu_20,Chi_10) \
	 VSUB(Chimu_11,Chimu_21,Chi_11) \
	 VSUB(Chimu_12,Chimu_22,Chi_12) \
	 ); \
  }
329
330/*Gz
331 * 0 0 i 0 [0]+-i[2]
332 * 0 0 0 -i [1]-+i[3]
333 * -i 0 0 0
334 * 0 i 0 0
335 */
// Gamma-z plus projector: Chi_0c gets +i*Chimu_2c, Chi_1c gets -i*Chimu_3c,
// per the matrix above; the opposite-sign cross madds encode the +i / -i.
336#define ZP_PROJMEM(base) { \
337 LOAD_CHIMU(base); \
338 asm ( \
339 VONE(one) \
340 VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \
341 VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \
342 VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \
343 VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \
344 VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \
345 VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \
346 ); \
347 }
348
// Gamma-z minus projector: same structure with both i-signs flipped.
349#define ZM_PROJMEM(base) { \
350 LOAD_CHIMU(base); \
351 asm ( \
352 VONE(one) \
353 VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \
354 VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \
355 VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \
356 VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \
357 VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \
358 VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \
359 ); \
360 }
/*Gt
 * 0 0 1 0 [0]+-[2]
 * 0 0 0 1 [1]+-[3]
 * 1 0 0 0
 * 0 1 0 0
 */
/* Gamma-t projectors.
 * Review: the original operand order was scrambled -- with VADD/VSUB(A,B,DEST)
 * meaning DEST = A op B, it wrote into the nonexistent registers Chi_2x/Chi_3x
 * and combined Chimu_s0 with itself.  Rewritten to match the Gt matrix above:
 * Chi_0c = Chimu_0c +/- Chimu_2c and Chi_1c = Chimu_1c +/- Chimu_3c.
 * -- confirm against a working QPX kernel. */
#define TP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VADD(Chimu_00,Chimu_20,Chi_00) \
	 VADD(Chimu_01,Chimu_21,Chi_01) \
	 VADD(Chimu_02,Chimu_22,Chi_02) \
	 VADD(Chimu_10,Chimu_30,Chi_10) \
	 VADD(Chimu_11,Chimu_31,Chi_11) \
	 VADD(Chimu_12,Chimu_32,Chi_12) \
	 ); \
  }

/* Gamma-t minus projector: differences instead of sums. */
#define TM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VSUB(Chimu_00,Chimu_20,Chi_00) \
	 VSUB(Chimu_01,Chimu_21,Chi_01) \
	 VSUB(Chimu_02,Chimu_22,Chi_02) \
	 VSUB(Chimu_10,Chimu_30,Chi_10) \
	 VSUB(Chimu_11,Chimu_31,Chi_11) \
	 VSUB(Chimu_12,Chimu_32,Chi_12) \
	 ); \
  }
390
/*
 fspin(0)=hspin(0);
 fspin(1)=hspin(1);
 fspin(2)=timesMinusI(hspin(1));
 fspin(3)=timesMinusI(hspin(0));

 fspin(0)+=hspin(0);
 fspin(1)+=hspin(1);
 fspin(2)-=timesI(hspin(1));
 fspin(3)-=timesI(hspin(0));
*/
/* X-direction reconstruct (non-accumulating first hit): psi = hspin in the
 * upper two spins, +/- i * hspin in the lower two.
 * Review: VMOV(A,DEST) expands to "qvfmr DEST,A" (copy A into DEST), so the
 * spinor copy psi = UChi must read VMOV(UChi_xx,psi_xx).  The original had
 * the operands reversed, overwriting UChi with the uninitialised psi
 * registers.  -- confirm against a working QPX kernel. */
#define XP_RECON { \
  asm( \
      VONE(one) \
      VMOV(UChi_00,psi_00) VMOV(UChi_01,psi_01) VMOV(UChi_02,psi_02) \
      VMOV(UChi_10,psi_10) VMOV(UChi_11,psi_11) VMOV(UChi_12,psi_12) \
      VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
      VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
      VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
      VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
      VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
      VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
      VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
      VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
      ); \
  }

/* X-minus reconstruct: same copy/zero preamble, opposite sign of i via the
 * MII_IR cross madd. */
#define XM_RECON { \
  asm( \
      VONE(one) \
      VMOV(UChi_00,psi_00) VMOV(UChi_01,psi_01) VMOV(UChi_02,psi_02) \
      VMOV(UChi_10,psi_10) VMOV(UChi_11,psi_11) VMOV(UChi_12,psi_12) \
      VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
      VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
      VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
      VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
      VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
      VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
      VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
      VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
      ); \
  }
433
// Accumulating X-direction reconstructs: psi_0c/psi_1c += UChi, and the
// lower spins fold in +/- i * UChi via the cross madds (VADD(A,B,DEST) is
// DEST = A + B, so VADD(psi,UChi,psi) accumulates in place).
434#define XP_RECON_ACCUM { \
435 asm( \
436 VONE(one) \
437 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
438 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
439 VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
440 VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
441 VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
442 VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
443 VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
444 VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
445 ); \
446 }
447
// As XP_RECON_ACCUM with the opposite sign of i (MII_IR form).
448#define XM_RECON_ACCUM { \
449 asm( \
450 VONE(one) \
451 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
452 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
453 VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
454 VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
455 VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
456 VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
457 VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
458 VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
459 ); \
460 }
461
462// fspin(2)+=hspin(1);
463// fspin(3)-=hspin(0);
// Accumulating Y-direction reconstructs: purely real +/- accumulation, no
// multiply-by-i needed for gamma-y.
464#define YP_RECON_ACCUM { \
465 asm( \
466 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
467 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
468 VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
469 VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
470 ); \
471 }
// As YP_RECON_ACCUM with the lower-spin signs flipped.
472#define YM_RECON_ACCUM { \
473 asm( \
474 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
475 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
476 VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
477 VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
478 ); \
479 }
480
481// fspin(2)-=timesI(hspin(0));
482// fspin(3)+=timesI(hspin(1));
// Accumulating Z-direction reconstructs: the upper spins accumulate as-is,
// the lower spins fold in -/+ i * UChi via the two cross-madd forms.
483#define ZP_RECON_ACCUM { \
484 asm( \
485 VONE(one) \
486 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
487 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
488 VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \
489 VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \
490 VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \
491 VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \
492 VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \
493 VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \
494 ); \
495 }
496
// As ZP_RECON_ACCUM with both i-signs flipped.
497#define ZM_RECON_ACCUM { \
498 asm( \
499 VONE(one) \
500 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
501 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
502 VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \
503 VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \
504 VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \
505 VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \
506 VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \
507 VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \
508 ); \
509 }
510
511// fspin(2)+=hspin(0);
512// fspin(3)+=hspin(1);
// Accumulating T-direction reconstructs: purely real accumulation.
513#define TP_RECON_ACCUM { \
514 asm( \
515 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
516 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
517 VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
518 VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
519 ); \
520 }
521
// As TP_RECON_ACCUM with the lower spins subtracted.
// NOTE(review): the VONE(one) here is dead -- no instruction in this macro
// uses `one`; harmless but presumably a copy-paste leftover.
522#define TM_RECON_ACCUM { \
523 asm( \
524 VONE(one) \
525 VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
526 VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
527 VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
528 VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
529 ); \
530 }
531
/* Resolve neighbour/prefetch addresses for a given direction entry;
 * defined elsewhere. */
uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);

/* Precision-dependent scalar type for the sign tables (stands in for
 * vComplexF/vComplexD in this reference driver).
 * Review: the original read "#define COMPLEX_TYPE int;" -- the trailing
 * semicolon made every use such as "COMPLEX_TYPE *isigns" expand to the
 * invalid "int; *isigns".  Removed. */
#define COMPLEX_TYPE int
/* NOTE(review): this DEFINES (not declares) signs[] in a header; including
 * the header from more than one translation unit would produce duplicate
 * symbols.  Consider extern here plus one definition in a .c file. */
int signs[4];
537
// Reference driver: walks all eight directions (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm)
// of the Wilson dslash for `osites` outer sites, issuing per direction a
// neighbour lookup (GetInfo), a prefetch-address lookup (GetPFInfo), then
// either a spin projection from local memory or a LOAD_CHI of a received
// halo, followed by the gauge multiply and reconstruct.
// NOTE(review): this rendering is mangled -- the non-contiguous original
// line numbers show that many statements are missing from this view (the
// *_PROJMEM / MULT_2SPIN / *_RECON_ACCUM / SAVE_RESULT invocations inside
// the #ifdef KERNEL_DAG arms, and the declarations of ss, Ls, in and out).
// Do not edit this body here; recover the full function from the original
// source before changing any logic.
538void testme(int osites,int ssU)
539{
540 int local,perm, ptype;
541 uint64_t base;
542 uint64_t basep;
543 const uint64_t plocal =(uint64_t) & in[0];
544
545 // vComplexF isigns[2] = { signs[0], signs[1] };
546 //COMPLEX_TYPE is vComplexF of vComplexD depending
547 //on the chosen precision
548 COMPLEX_TYPE *isigns = &signs[0];
549
550 MASK_REGS;
551 int nmax=osites;
552 for(int site=0;site<Ns;site++) {
553 int sU =ssU;
554 int ssn=ssU+1;
555 if(ssn>=nmax) ssn=0;
556 int sUn=ssn;
557 for(int s=0;s<Ls;s++) {
558 ss =sU*Ls+s;
559 ssn=sUn*Ls+s;
561 // Xp
563 int ent=ss*8;// 2*Ndim
564 int nent=ssn*8;
565
566 PF_GAUGE(Xp);
567 base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
569
570 basep = GetPFInfo(nent,plocal); nent++;
571 if ( local ) {
572 LOAD64(%r10,isigns);
573#ifdef KERNEL_DAG
575#else
577#endif
579 } else {
580 LOAD_CHI(base);
581 }
582 base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
584 {
586 }
587 LOAD64(%r10,isigns);
588#ifdef KERNEL_DAG
589 XP_RECON;
590#else
591 XM_RECON;
592#endif
594 // Yp
596 basep = GetPFInfo(nent,plocal); nent++;
597 if ( local ) {
598 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
599#ifdef KERNEL_DAG
601#else
603#endif
605 } else {
606 LOAD_CHI(base);
607 }
608 base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
610 {
612 }
613 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
614#ifdef KERNEL_DAG
616#else
618#endif
619
621 // Zp
623 basep = GetPFInfo(nent,plocal); nent++;
624 if ( local ) {
625 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
626#ifdef KERNEL_DAG
628#else
630#endif
632 } else {
633 LOAD_CHI(base);
634 }
635 base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
637 {
639 }
640 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
641#ifdef KERNEL_DAG
643#else
645#endif
646
648 // Tp
650 basep = GetPFInfo(nent,plocal); nent++;
651 if ( local ) {
652 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
653#ifdef KERNEL_DAG
655#else
657#endif
659 } else {
660 LOAD_CHI(base);
661 }
662 base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
664 {
666 }
667 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
668#ifdef KERNEL_DAG
670#else
672#endif
673
675 // Xm
677#ifndef STREAM_STORE
678 basep= (uint64_t) &out[ss];
679#endif
680 // basep= GetPFInfo(nent,plocal); nent++;
681 if ( local ) {
682 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
683#ifdef KERNEL_DAG
685#else
687#endif
689 } else {
690 LOAD_CHI(base);
691 }
692 base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
694 {
696 }
697 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
698#ifdef KERNEL_DAG
700#else
702#endif
703
705 // Ym
707 basep= GetPFInfo(nent,plocal); nent++;
708 if ( local ) {
709 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
710#ifdef KERNEL_DAG
712#else
714#endif
716 } else {
717 LOAD_CHI(base);
718 }
719 base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
721 {
723 }
724 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
725#ifdef KERNEL_DAG
727#else
729#endif
730
732 // Zm
734 basep= GetPFInfo(nent,plocal); nent++;
735 if ( local ) {
736 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
737#ifdef KERNEL_DAG
739#else
741#endif
743 } else {
744 LOAD_CHI(base);
745 }
746 base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
748 {
750 }
751 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
752#ifdef KERNEL_DAG
754#else
756#endif
757
759 // Tm
761 basep= GetPFInfo(nent,plocal); nent++;
762 if ( local ) {
763 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
764#ifdef KERNEL_DAG
766#else
768#endif
770 } else {
771 LOAD_CHI(base);
772 }
773 base= (uint64_t) &out[ss];
774#ifndef STREAM_STORE
776#endif
777 {
779 }
780 LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
781#ifdef KERNEL_DAG
783#else
785#endif
786
787 basep= GetPFInfo(nent,plocal); nent++;
789
790 }
791 ssU++;
792 }
793}
794
795
796#endif
int signs[4]
Definition BGQQPX.h:536
uint64_t GetPFInfo(int nent, int plocal)
#define MULT_2SPIN_DIR_PFTP(A, p)
Definition BGQQPX.h:156
#define TM_PROJMEM(base)
Definition BGQQPX.h:379
#define LOAD64(A, ptr)
Definition BGQQPX.h:43
#define XP_PROJMEM(base)
Definition BGQQPX.h:278
#define ZP_PROJMEM(base)
Definition BGQQPX.h:336
#define YP_PROJMEM(base)
Definition BGQQPX.h:306
#define MAYBEPERM(A, B)
Definition BGQQPX.h:146
#define SAVE_RESULT(base, basep)
Definition BGQQPX.h:217
#define YM_PROJMEM(base)
Definition BGQQPX.h:318
#define MULT_2SPIN_DIR_PFXM(A, p)
Definition BGQQPX.h:157
#define MULT_2SPIN_DIR_PFYM(A, p)
Definition BGQQPX.h:158
#define XM_PROJMEM(base)
Definition BGQQPX.h:291
#define MULT_2SPIN_DIR_PFZM(A, p)
Definition BGQQPX.h:159
#define MULT_2SPIN_DIR_PFZP(A, p)
Definition BGQQPX.h:155
#define PREFETCH_CHIMU(base)
Definition BGQQPX.h:68
#define PREFETCH1_CHIMU(base)
Definition BGQQPX.h:67
#define COMPLEX_TYPE
Definition BGQQPX.h:535
#define MULT_2SPIN_DIR_PFTM(A, p)
Definition BGQQPX.h:160
#define ZM_PROJMEM(base)
Definition BGQQPX.h:349
#define MASK_REGS
Definition BGQQPX.h:65
#define MULT_2SPIN_DIR_PFXP(A, p)
Definition BGQQPX.h:153
#define TP_PROJMEM(base)
Definition BGQQPX.h:367
#define MULT_2SPIN_DIR_PFYP(A, p)
Definition BGQQPX.h:154
#define LOAD_CHI(base)
Definition BGQQPX.h:240
uint64_t GetInfo(int ptype, int local, int perm, int Xp, int ent, int plocal)
void testme(int osites, int ssU)
Definition BGQQPX.h:538
#define PF_GAUGE(A)
Definition BGQQPX.h:66
#define perm(a, b, n, w)
static constexpr int Xm
Definition QCD.h:45
static constexpr int Tm
Definition QCD.h:48
static constexpr int Ns
Definition QCD.h:51
static constexpr int Tp
Definition QCD.h:44
static constexpr int Zp
Definition QCD.h:43
static constexpr int Zm
Definition QCD.h:47
static constexpr int Xp
Definition QCD.h:41
static constexpr int Yp
Definition QCD.h:42
static constexpr int Ym
Definition QCD.h:46
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0
#define PERMUTE_DIR3
const uint64_t plocal
uint64_t basep
uint64_t base