/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H

#include <stdint.h>

/*********************************************************
 * Register definitions
 *********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2

#define psi_10 3
#define psi_11 4
#define psi_12 5

#define psi_20 6
#define psi_21 7
#define psi_22 8

#define psi_30 9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28

#define REP  %%r16
#define IMM  %%r17
#define pREP %r16
#define pIMM %r17

#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC  0x7c00030c
#define __PPC_CT(t)  (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)

#define LOCK_SET   ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC)  "|" HASH(___PPC_RB(16)) ")\n"
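
/* Worked encoding example: the .long in LOCK_SET is the dcbtls opcode
 * template OR'd with the RB field naming r16 (REP):
 *   ___PPC_RB(16) = (16 & 0x1f) << 11 = 0x8000
 *   0x7c00014c | 0x8000 = 0x7c00814c, i.e. "dcbtls 0,0,r16"
 * LOCK_CLEAR builds the matching dcblc the same way.
 */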

/* Alias registers for the incoming four-spinor on the neighbouring site */
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12

/*********************************************************
 * Architectural macros
 *********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
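
/* Two-level stringification: HASH expands its argument first, then HASHit
 * stringizes the result. For example HASH(___PPC_RB(16)) yields the string
 * "(((16) & 0x1f) << 11)", which the assembler evaluates inside the .long
 * expressions above, and HASH(REP) yields "%%r16".
 */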
#define LOAD64(A,ptr)         /*NOOP ON BGQ*/


#define MASK_REGS             /*NOOP ON BGQ*/
#define PF_GAUGE(A)           /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base)  /*NOOP ON BGQ*/

#define VLOADf(OFF,PTR,DEST)  "qvlfsx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC)  "qvstfsx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST)     "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)

#define VPERMIi(p)  "qvgpci " #p ", 1217;\n"
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p)   VPERMIi(p)
#define VPERM(A,p)  VPERMi(A,p)
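
/* The qvgpci immediate 1217 is 02301 in octal, one selector digit per lane,
 * i.e. the permutation (2,3,0,1): qvfperm with this control swaps the two
 * halves of the 4-wide QPX vector. Reading inferred from the bit pattern;
 * see the QPX ISA manual for the authoritative immediate format.
 */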

#define VLOADd(OFF,PTR,DEST)  "qvlfdx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC)  "qvstfdx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST)     "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)

// QPX manual operand ordering: QRT (the destination) comes first
#define VZEROi(DEST) "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST)  "qvfset " #DEST "; \n"
#define VMOVi(DEST,A)   "qvfmr " #DEST "," #A ";\n"
#define VADDi(DEST,A,B) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B)     "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C)        "qvfmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C)  "qvfxmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A "," #C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A "," #C ";\n"

#define VZERO(C)  VZEROi(C)
#define VONE(C)   VONEi(C)
#define VMOV(C,A) VMOVi(C,A)
#define VADD(A,B,C) VADDi(A,B,C)
#define VSUB(A,B,C) VSUBi(A,B,C)
#define VMUL(A,B,C) VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C)     VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D)        VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D)  VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
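
/* How the paired ops assemble a complex multiply (scalar sketch of one
 * complex lane, assuming QPX packs two complex numbers per vector as
 * re,im,re,im). VMUL_RR_RI(out,u,c) gives (u.re*c.re, u.re*c.im); a
 * following VMADD_MII_IR(out,u,c,out) accumulates (-u.im*c.im, +u.im*c.re):
 *   out.re = u.re*c.re - u.im*c.im;
 *   out.im = u.re*c.im + u.im*c.re;   // out = u*c
 * VMADD_II_MIR accumulates (+a.im*b.im, -a.im*b.re) instead; with a = one
 * (all lanes 1.0, via VONE) these two act as timesI and timesMinusI in the
 * projectors below.
 */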

/*********************************************************
 * Macro sequences encoding QCD
 *********************************************************/
/* Lock the gauge links for site sU into the L1 cache, one 64-byte line at a
   time (U and sU are supplied by the invoking scope). */
#define LOCK_GAUGE(dir)                                         \
  {                                                             \
    uint64_t byte_addr = (uint64_t)&U[sU];                      \
    int count = (sizeof(U[0])+63)/64;                           \
    asm (" mtctr %0 \n"                                         \
         " mr " HASH(REP) ", %1\n"                              \
         " li " HASH(IMM) ", 64\n"                              \
         "0:\n"                                                 \
         LOCK_SET                                               \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n"     \
         " bdnz 0b\n"                                           \
         : : "b" (count), "b" (byte_addr) );                    \
  }
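
/* count rounds the link storage up to whole 64-byte L1 lines and the bdnz
 * loop issues one dcbtls per line, stepping REP by 64. E.g. for a
 * hypothetical 288-byte U[0], count = (288+63)/64 = 5 locked lines (the
 * size is illustrative only, not Grid's actual gauge layout).
 */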

#define UNLOCK_GAUGE(dir)                                       \
  {                                                             \
    uint64_t byte_addr = (uint64_t)&U[sU];                      \
    int count = (sizeof(U[0])+63)/64;                           \
    asm (" mtctr %0 \n"                                         \
         " mr " HASH(REP) ", %1\n"                              \
         " li " HASH(IMM) ", 64\n"                              \
         "0:\n"                                                 \
         LOCK_CLEAR                                             \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n"     \
         " bdnz 0b\n"                                           \
         : : "b" (count), "b" (byte_addr) );                    \
  }

#define ZERO_PSI        \
  VZERO(psi_00)         \
  VZERO(psi_01)         \
  VZERO(psi_02)         \
  VZERO(psi_10)         \
  VZERO(psi_11)         \
  VZERO(psi_12)         \
  VZERO(psi_20)         \
  VZERO(psi_21)         \
  VZERO(psi_22)         \
  VZERO(psi_30)         \
  VZERO(psi_31)         \
  VZERO(psi_32)

#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)

#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) {            \
    uint64_t ub = ((uint64_t)ptr);                              \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMUL_RR_RI(UChi_00,U0,Chi_00)                          \
         VMUL_RR_RI(UChi_01,U1,Chi_00)                          \
         VMUL_RR_RI(UChi_02,U2,Chi_00)                          \
         VMUL_RR_RI(UChi_10,U0,Chi_10)                          \
         VMUL_RR_RI(UChi_11,U1,Chi_10)                          \
         VMUL_RR_RI(UChi_12,U2,Chi_10)                          \
         VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12)                \
         : : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00)                 \
         VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01)                 \
         VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02)                 \
         VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10)                 \
         VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11)                 \
         VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12)                 \
         VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12)                \
         : : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00)                 \
         VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01)                 \
         VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02)                 \
         VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10)                 \
         VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11)                 \
         VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12)                 \
         VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12)                \
         : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
  }
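
/* Reference semantics of MULT_2SPIN_QPX_INTERNAL (scalar sketch): the three
 * asm blocks walk the columns j of the 3x3 colour matrix (element offsets
 * USKIP*{0,3,6}, {1,4,7}, {2,5,8}, assuming row-major storage), so for both
 * spin components s of the half spinor:
 *
 *   for (int j = 0; j < 3; j++)
 *     for (int c = 0; c < 3; c++)
 *       UChi[s][c] += U[c][j] * Chi[s][j]; // complex mul: RR_RI + MII_IR
 */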


/* The unsuffixed names used below (VLOAD, VSTOREu, VSPLAT, VSIZE,
   MULT_2SPIN) are expected to be bound to the single- or double-precision
   variants above by the including translation unit; the pf/p prefetch
   arguments are ignored on BGQ. */
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)

#define SAVE_RESULT(base,basep) {                       \
    uint64_t ub = ((uint64_t)base) - (VSIZE);           \
    asm("mr " HASH(REP) ", %0;\n"                       \
        "li " HASH(IMM) "," HASH(VSIZE) " ;\n"          \
        VSTOREu(IMM,REP,psi_00)                         \
        VSTOREu(IMM,REP,psi_01)                         \
        VSTOREu(IMM,REP,psi_02)                         \
        VSTOREu(IMM,REP,psi_10)                         \
        VSTOREu(IMM,REP,psi_11)                         \
        VSTOREu(IMM,REP,psi_12)                         \
        VSTOREu(IMM,REP,psi_20)                         \
        VSTOREu(IMM,REP,psi_21)                         \
        VSTOREu(IMM,REP,psi_22)                         \
        VSTOREu(IMM,REP,psi_30)                         \
        VSTOREu(IMM,REP,psi_31)                         \
        VSTOREu(IMM,REP,psi_32)                         \
        : : "b" (ub) : HASH(pIMM), HASH(pREP) );        \
  }
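
/* The update-form stores (VSTOREu -> qvstfdux/qvstfsux) compute the
 * effective address REP+IMM and write it back to REP, so priming
 * REP = base - VSIZE lands the twelve stores at base, base+VSIZE, ...,
 * base+11*VSIZE; in double precision (VSIZE=32) the result spinor covers
 * 12*32 = 384 contiguous bytes.
 */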


/*
 * Annoying BG/Q loads with no immediate-offset addressing, and a big
 * performance hit when a second miss to an L1 line occurs
 */
#define LOAD_CHI(base) {                                        \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);                 \
    asm("mr " HASH(REP) ",%0 ;\n"                               \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"               \
        VLOADu(IMM,REP,Chi_00)                                  \
        VLOADu(IMM,REP,Chi_02)                                  \
        VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                              \
    asm("mr " HASH(REP) ", %0;\n"                               \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"               \
        VLOADu(IMM,REP,Chi_01)                                  \
        VLOADu(IMM,REP,Chi_10)                                  \
        VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }

#define LOAD_CHIMU(base) {                                      \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);                 \
    asm("mr " HASH(REP) ",%0;\n"                                \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"               \
        VLOADu(IMM,REP,Chi_00)                                  \
        VLOADu(IMM,REP,Chi_02)                                  \
        VLOADu(IMM,REP,Chi_11)                                  \
        VLOADu(IMM,REP,Chi_20)                                  \
        VLOADu(IMM,REP,Chi_22)                                  \
        VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                              \
    asm("mr " HASH(REP) ", %0;\n"                               \
        "li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n"              \
        VLOADu(IMM,REP,Chi_01)                                  \
        VLOADu(IMM,REP,Chi_10)                                  \
        VLOADu(IMM,REP,Chi_12)                                  \
        VLOADu(IMM,REP,Chi_21)                                  \
        VLOADu(IMM,REP,Chi_30)                                  \
        VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }
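
/* Both load chains stride by 2*VSIZE, which in double precision (VSIZE=32)
 * is exactly one 64-byte L1 line: the first chain gathers the even vectors
 * of the spinor (offsets 0,2,4,... -> Chi_00,Chi_02,Chi_11,...), the second
 * the odd ones, so neither chain touches the same L1 line twice and the
 * second-miss penalty noted above is avoided.
 */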

// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
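/*Gx (same convention as the Gz/Gt comments below; upper sign is the P
 * projector, lower the M)
 *  0  0  0  i   [0]+-i[3]
 *  0  0  i  0   [1]+-i[2]
 *  0 -i  0  0
 * -i  0  0  0
 */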
#define XP_PROJMEM(base) {                              \
    LOAD_CHIMU(base);                                   \
    asm (                                               \
         VONE(one)                                      \
         VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00)         \
         VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01)         \
         VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02)         \
         VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10)         \
         VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11)         \
         VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12)         \
         );                                             \
  }

#define XM_PROJMEM(base) {                              \
    LOAD_CHIMU(base);                                   \
    asm (                                               \
         VONE(one)                                      \
         VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00)         \
         VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01)         \
         VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02)         \
         VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10)         \
         VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11)         \
         VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12)         \
         );                                             \
  }

// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
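/*Gy (same convention)
 *  0  0  0 -1   [0]-+[3]
 *  0  0  1  0   [1]+-[2]
 *  0  1  0  0
 * -1  0  0  0
 */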
#define YP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
         VSUB(Chi_00,Chi_00,Chi_30)             \
         VSUB(Chi_01,Chi_01,Chi_31)             \
         VSUB(Chi_02,Chi_02,Chi_32)             \
         VADD(Chi_10,Chi_10,Chi_20)             \
         VADD(Chi_11,Chi_11,Chi_21)             \
         VADD(Chi_12,Chi_12,Chi_22)             \
         );                                     \
  }

#define YM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
         VADD(Chi_00,Chi_00,Chi_30)             \
         VADD(Chi_01,Chi_01,Chi_31)             \
         VADD(Chi_02,Chi_02,Chi_32)             \
         VSUB(Chi_10,Chi_10,Chi_20)             \
         VSUB(Chi_11,Chi_11,Chi_21)             \
         VSUB(Chi_12,Chi_12,Chi_22) );          \
  }

/*Gz
 *  0  0  i  0   [0]+-i[2]
 *  0  0  0 -i   [1]-+i[3]
 * -i  0  0  0
 *  0  i  0  0
 */
#define ZP_PROJMEM(base) {                              \
    LOAD_CHIMU(base);                                   \
    asm (                                               \
         VONE(one)                                      \
         VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00)         \
         VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01)         \
         VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02)         \
         VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10)         \
         VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11)         \
         VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12)         \
         );                                             \
  }

#define ZM_PROJMEM(base) {                              \
    LOAD_CHIMU(base);                                   \
    asm (                                               \
         VONE(one)                                      \
         VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00)         \
         VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01)         \
         VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02)         \
         VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10)         \
         VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11)         \
         VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12)         \
         );                                             \
  }
/*Gt
 *  0  0  1  0   [0]+-[2]
 *  0  0  0  1   [1]+-[3]
 *  1  0  0  0
 *  0  1  0  0
 */
#define TP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
         VADD(Chi_00,Chi_00,Chi_20)             \
         VADD(Chi_01,Chi_01,Chi_21)             \
         VADD(Chi_02,Chi_02,Chi_22)             \
         VADD(Chi_10,Chi_10,Chi_30)             \
         VADD(Chi_11,Chi_11,Chi_31)             \
         VADD(Chi_12,Chi_12,Chi_32)             \
         );                                     \
  }

#define TM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
         VSUB(Chi_00,Chi_00,Chi_20)             \
         VSUB(Chi_01,Chi_01,Chi_21)             \
         VSUB(Chi_02,Chi_02,Chi_22)             \
         VSUB(Chi_10,Chi_10,Chi_30)             \
         VSUB(Chi_11,Chi_11,Chi_31)             \
         VSUB(Chi_12,Chi_12,Chi_32)             \
         );                                     \
  }

/*
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
  fspin(3)=timesMinusI(hspin(0));

  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
  fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON {                                                      \
    asm(                                                                \
        VONE(one)                                                       \
        VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)  \
        VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)  \
        VZERO(psi_20) VZERO(psi_21) VZERO(psi_22)                       \
        VZERO(psi_30) VZERO(psi_31) VZERO(psi_32)                       \
        VMADD_II_MIR(psi_20,one,UChi_10,psi_20)                         \
        VMADD_II_MIR(psi_21,one,UChi_11,psi_21)                         \
        VMADD_II_MIR(psi_22,one,UChi_12,psi_22)                         \
        VMADD_II_MIR(psi_30,one,UChi_00,psi_30)                         \
        VMADD_II_MIR(psi_31,one,UChi_01,psi_31)                         \
        VMADD_II_MIR(psi_32,one,UChi_02,psi_32)                         \
        );                                                              \
  }

#define XM_RECON {                                                      \
    asm(                                                                \
        VONE(one)                                                       \
        VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)  \
        VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)  \
        VZERO(psi_20) VZERO(psi_21) VZERO(psi_22)                       \
        VZERO(psi_30) VZERO(psi_31) VZERO(psi_32)                       \
        VMADD_MII_IR(psi_20,one,UChi_10,psi_20)                         \
        VMADD_MII_IR(psi_21,one,UChi_11,psi_21)                         \
        VMADD_MII_IR(psi_22,one,UChi_12,psi_22)                         \
        VMADD_MII_IR(psi_30,one,UChi_00,psi_30)                         \
        VMADD_MII_IR(psi_31,one,UChi_01,psi_31)                         \
        VMADD_MII_IR(psi_32,one,UChi_02,psi_32)                         \
        );                                                              \
  }

#define XP_RECON_ACCUM {                                                \
    asm(                                                                \
        VONE(one)                                                       \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VMADD_II_MIR(psi_20,one,UChi_10,psi_20)                         \
        VMADD_II_MIR(psi_21,one,UChi_11,psi_21)                         \
        VMADD_II_MIR(psi_22,one,UChi_12,psi_22)                         \
        VMADD_II_MIR(psi_30,one,UChi_00,psi_30)                         \
        VMADD_II_MIR(psi_31,one,UChi_01,psi_31)                         \
        VMADD_II_MIR(psi_32,one,UChi_02,psi_32)                         \
        );                                                              \
  }

#define XM_RECON_ACCUM {                                                \
    asm(                                                                \
        VONE(one)                                                       \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VMADD_MII_IR(psi_20,one,UChi_10,psi_20)                         \
        VMADD_MII_IR(psi_21,one,UChi_11,psi_21)                         \
        VMADD_MII_IR(psi_22,one,UChi_12,psi_22)                         \
        VMADD_MII_IR(psi_30,one,UChi_00,psi_30)                         \
        VMADD_MII_IR(psi_31,one,UChi_01,psi_31)                         \
        VMADD_MII_IR(psi_32,one,UChi_02,psi_32)                         \
        );                                                              \
  }

// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {                                                \
    asm(                                                                \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
        VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
        );                                                              \
  }
#define YM_RECON_ACCUM {                                                \
    asm(                                                                \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
        VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
        );                                                              \
  }

// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {                                                \
    asm(                                                                \
        VONE(one)                                                       \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VMADD_II_MIR(psi_20,one,UChi_00,psi_20)                         \
        VMADD_II_MIR(psi_21,one,UChi_01,psi_21)                         \
        VMADD_II_MIR(psi_22,one,UChi_02,psi_22)                         \
        VMADD_MII_IR(psi_30,one,UChi_10,psi_30)                         \
        VMADD_MII_IR(psi_31,one,UChi_11,psi_31)                         \
        VMADD_MII_IR(psi_32,one,UChi_12,psi_32)                         \
        );                                                              \
  }

#define ZM_RECON_ACCUM {                                                \
    asm(                                                                \
        VONE(one)                                                       \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VMADD_MII_IR(psi_20,one,UChi_00,psi_20)                         \
        VMADD_MII_IR(psi_21,one,UChi_01,psi_21)                         \
        VMADD_MII_IR(psi_22,one,UChi_02,psi_22)                         \
        VMADD_II_MIR(psi_30,one,UChi_10,psi_30)                         \
        VMADD_II_MIR(psi_31,one,UChi_11,psi_31)                         \
        VMADD_II_MIR(psi_32,one,UChi_12,psi_32)                         \
        );                                                              \
  }

// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {                                                \
    asm(                                                                \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
        VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
        );                                                              \
  }

#define TM_RECON_ACCUM {                                                \
    asm(                                                                \
        VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
        VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
        VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
        );                                                              \
  }



/* Accumulate the four-spinor at PTR onto psi and store the sum back */
#define ADD_RESULTi(PTR,pf)                                             \
  LOAD_CHIMU(PTR)                                                       \
  asm(                                                                  \
      VADD(psi_00,Chi_00,psi_00) VADD(psi_01,Chi_01,psi_01) VADD(psi_02,Chi_02,psi_02) \
      VADD(psi_10,Chi_10,psi_10) VADD(psi_11,Chi_11,psi_11) VADD(psi_12,Chi_12,psi_12) \
      VADD(psi_20,Chi_20,psi_20) VADD(psi_21,Chi_21,psi_21) VADD(psi_22,Chi_22,psi_22) \
      VADD(psi_30,Chi_30,psi_30) VADD(psi_31,Chi_31,psi_31) VADD(psi_32,Chi_32,psi_32) ); \
  SAVE_RESULT(PTR,pf);


#define PERMUTE_DIR3 /*NOOP ON BGQ*/
#define PERMUTE_DIR2 /*NOOP ON BGQ*/
#define PERMUTE_DIR1 /*NOOP ON BGQ*/

#define PERMUTE_DIR0 {                                                  \
    asm(                                                                \
        VPERMI(perm_reg)                                                \
        VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \
        VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \
  }
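
/* PERMUTE_DIR0 applies the (2,3,0,1) half-swap from VPERMIi to all six Chi
 * registers, exchanging the two complex numbers packed in each 4-wide
 * vector. That only direction 0 needs a lane interchange (hence the DIR1-3
 * no-ops) is inferred from the empty macros, not stated in this file.
 */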

#endif