/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512wilson.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H

//////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2

#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5

#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8

#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14

#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17

#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20

#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23

#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25

#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31

#define TMP Chi_00

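// NB: the Chimu_* names below are register aliases, not extra registers.
// Spin components 0,1 of the full spinor overlap Chi_*, and components 2,3
// overlap UChi_* (likewise Uir/T1 and Uri/T2 share zmm24/zmm25). This is
// safe because the projectors fold Chimu into Chi before either alias is
// rewritten, and UChi is only written later, by the gauge-link multiply.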
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

#include "Intel512common.h"
#include "Intel512avx.h"

//////////////////////////////////////////////////////////////////
// Macros used to build the Wilson kernel -- these could be rationalised
// and simplified a little, as some duplication crept in while trying
// different variants during optimisation; could cut back to only those used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)

// const SiteSpinor * ptr = & in[offset];
#define LOAD_CHIMU(PTR)   LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR)     LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)    SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)     SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R)  ADD_RESULTi(PT,R)
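// Calling convention for the wrappers above: LOAD64 moves the 64-bit pointer
// argument into %r8 (or %r9 for a prefetch pointer), and the subsequent
// __asm__ body addresses the data as SIMD-word offsets from that register.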

#define ZERO_PSI \
  __asm__ ( VZERO(psi_00) \
  VZERO(psi_01) \
  VZERO(psi_02) \
  VZERO(psi_10) \
  VZERO(psi_11) \
  VZERO(psi_12) \
  VZERO(psi_20) \
  VZERO(psi_21) \
  VZERO(psi_22) \
  VZERO(psi_30) \
  VZERO(psi_31) \
  VZERO(psi_32));

#define LOAD_CHIMUi \
  LOAD_CHIMU01i     \
  LOAD_CHIMU23i

#define LOAD_CHIMU01i \
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i \
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)
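// Layout implied by the offsets above: a site spinor is 12 consecutive SIMD
// complex words, spin-major and colour-minor, so word n holds component
// (spin n/3, colour n%3); VLOAD(n,%r8,reg) reads word n from %r8 into reg.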

#define SHUF_CHIMU23i \
  VSHUFMEM(6,%r8,Chimu_20) \
  VSHUFMEM(7,%r8,Chimu_21) \
  VSHUFMEM(8,%r8,Chimu_22) \
  VSHUFMEM(9,%r8,Chimu_30) \
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

#define LOAD_CHIi \
  VLOAD(0,%r8,Chi_00) \
  VLOAD(1,%r8,Chi_01) \
  VLOAD(2,%r8,Chi_02) \
  VLOAD(3,%r8,Chi_10) \
  VLOAD(4,%r8,Chi_11) \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  VSTORE(0,%r8,UChi_00) \
  VSTORE(1,%r8,UChi_01) \
  VSTORE(2,%r8,UChi_02) \
  VSTORE(3,%r8,UChi_10) \
  VSTORE(4,%r8,UChi_11) \
  VSTORE(5,%r8,UChi_12) );

#define SAVE_CHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  VSTORE(0,%r8,Chi_00) \
  VSTORE(1,%r8,Chi_01) \
  VSTORE(2,%r8,Chi_02) \
  VSTORE(3,%r8,Chi_10) \
  VSTORE(4,%r8,Chi_11) \
  VSTORE(5,%r8,Chi_12) );

#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)

//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  LOAD_CHIi \
  SHUF_CHIMU23i \
  VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
  VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
  VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
  VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
  VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
  VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
  VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
  VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
  VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
  VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
  VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
  VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
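// Scalar reference for XP_PROJMEM (colour c = 0,1,2):
//   Chi(0,c) = Chimu(0,c) + i*Chimu(3,c)
//   Chi(1,c) = Chimu(1,c) + i*Chimu(2,c)
// SHUF_CHIMU23i pre-swaps re/im of the upper spin components; the paired
// VACCTIMESI1/VACCTIMESI2 stages then complete each times(+i) accumulate,
// split in two so that independent instructions can interleave.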


#define YP_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  LOAD_CHIMU01i \
  VSUBMEM(9,%r8,Chimu_00,Chi_00) \
  VSUBMEM(10,%r8,Chimu_01,Chi_01) \
  VSUBMEM(11,%r8,Chimu_02,Chi_02) \
  VADDMEM(6,%r8,Chimu_10,Chi_10) \
  VADDMEM(7,%r8,Chimu_11,Chi_11) \
  VADDMEM(8,%r8,Chimu_12,Chi_12) );

#define ZP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  LOAD_CHIi \
  SHUF_CHIMU23i \
  VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
  VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
  VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
  VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
  VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
  VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
  VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
  VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
  VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
  VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
  VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
  VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );


#define TP_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  LOAD_CHIMU01i \
  VADDMEM(6,%r8,Chimu_00,Chi_00) \
  VADDMEM(7,%r8,Chimu_01,Chi_01) \
  VADDMEM(8,%r8,Chimu_02,Chi_02) \
  VADDMEM(9,%r8,Chimu_10,Chi_10) \
  VADDMEM(10,%r8,Chimu_11,Chi_11) \
  VADDMEM(11,%r8,Chimu_12,Chi_12) );

// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  LOAD_CHIi \
  SHUF_CHIMU23i \
  VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30) \
  VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31) \
  VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32) \
  VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20) \
  VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21) \
  VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22) \
  VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30) \
  VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31) \
  VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32) \
  VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20) \
  VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21) \
  VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );

#define YM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  LOAD_CHIMU01i \
  VADDMEM(9,%r8,Chimu_00,Chi_00) \
  VADDMEM(10,%r8,Chimu_01,Chi_01) \
  VADDMEM(11,%r8,Chimu_02,Chi_02) \
  VSUBMEM(6,%r8,Chimu_10,Chi_10) \
  VSUBMEM(7,%r8,Chimu_11,Chi_11) \
  VSUBMEM(8,%r8,Chimu_12,Chi_12) );

#define ZM_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  LOAD_CHIi \
  SHUF_CHIMU23i \
  VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \
  VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \
  VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \
  VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \
  VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \
  VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \
  VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \
  VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \
  VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \
  VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \
  VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \
  VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );

#define TM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  LOAD_CHIMU01i \
  VSUBMEM(6,%r8,Chimu_00,Chi_00) \
  VSUBMEM(7,%r8,Chimu_01,Chi_01) \
  VSUBMEM(8,%r8,Chimu_02,Chi_02) \
  VSUBMEM(9,%r8,Chimu_10,Chi_10) \
  VSUBMEM(10,%r8,Chimu_11,Chi_11) \
  VSUBMEM(11,%r8,Chimu_12,Chi_12) );
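// Summary of the eight projectors, read off the macros above (per colour):
//   XP: Chi0 = f0 + i f3 , Chi1 = f1 + i f2    XM: Chi0 = f0 - i f3 , Chi1 = f1 - i f2
//   YP: Chi0 = f0 -   f3 , Chi1 = f1 +   f2    YM: Chi0 = f0 +   f3 , Chi1 = f1 -   f2
//   ZP: Chi0 = f0 + i f2 , Chi1 = f1 - i f3    ZM: Chi0 = f0 - i f2 , Chi1 = f1 + i f3
//   TP: Chi0 = f0 +   f2 , Chi1 = f1 +   f3    TM: Chi0 = f0 -   f2 , Chi1 = f1 -   f3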

// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESMINUSI0(UChi_00,psi_30,TMP) \
  VTIMESMINUSI0(UChi_10,psi_20,TMP) \
  VTIMESMINUSI0(UChi_01,psi_31,TMP) \
  VTIMESMINUSI0(UChi_11,psi_21,TMP) \
  VTIMESMINUSI0(UChi_02,psi_32,TMP) \
  VTIMESMINUSI0(UChi_12,psi_22,TMP) \
  VMOV(UChi_00,psi_00) \
  VMOV(UChi_10,psi_10) \
  VMOV(UChi_01,psi_01) \
  VMOV(UChi_11,psi_11) \
  VMOV(UChi_02,psi_02) \
  VMOV(UChi_12,psi_12) \
  VTIMESMINUSI1(UChi_10,psi_20,TMP) \
  VTIMESMINUSI1(UChi_11,psi_21,TMP) \
  VTIMESMINUSI1(UChi_12,psi_22,TMP) \
  VTIMESMINUSI1(UChi_00,psi_30,TMP) \
  VTIMESMINUSI1(UChi_01,psi_31,TMP) \
  VTIMESMINUSI1(UChi_02,psi_32,TMP) \
  VTIMESMINUSI2(UChi_10,psi_20,TMP) \
  VTIMESMINUSI2(UChi_11,psi_21,TMP) \
  VTIMESMINUSI2(UChi_12,psi_22,TMP) \
  VTIMESMINUSI2(UChi_00,psi_30,TMP) \
  VTIMESMINUSI2(UChi_01,psi_31,TMP) \
  VTIMESMINUSI2(UChi_02,psi_32,TMP) \
  );
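// Scalar reference for XP_RECON:
//   psi(0) = h(0)      psi(1) = h(1)
//   psi(2) = -i h(1)   psi(3) = -i h(0)
// with h = UChi; the VTIMESMINUSI0/1/2 stages split each times(-i) into
// shuffle and add/sub steps so the six streams can be software-pipelined.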
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
  VZERO(TMP) \
  VACCTIMESMINUSI0(UChi_00,psi_30,Z3) \
  VACCTIMESMINUSI0(UChi_10,psi_20,Z0) \
  VACCTIMESMINUSI0(UChi_01,psi_31,Z4) \
  VACCTIMESMINUSI0(UChi_11,psi_21,Z1) \
  VACCTIMESMINUSI0(UChi_02,psi_32,Z5) \
  VACCTIMESMINUSI0(UChi_12,psi_22,Z2) \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VACCTIMESMINUSI1(UChi_00,psi_30,Z3) \
  VACCTIMESMINUSI1(UChi_10,psi_20,Z0) \
  VACCTIMESMINUSI1(UChi_01,psi_31,Z4) \
  VACCTIMESMINUSI1(UChi_11,psi_21,Z1) \
  VACCTIMESMINUSI1(UChi_02,psi_32,Z5) \
  VACCTIMESMINUSI1(UChi_12,psi_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,psi_20,Z0) \
  VACCTIMESMINUSI2(UChi_11,psi_21,Z1) \
  VACCTIMESMINUSI2(UChi_12,psi_22,Z2) \
  VACCTIMESMINUSI2(UChi_00,psi_30,Z3) \
  VACCTIMESMINUSI2(UChi_01,psi_31,Z4) \
  VACCTIMESMINUSI2(UChi_02,psi_32,Z5) \
  );

#define XM_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESI0(UChi_00,psi_30,TMP) \
  VTIMESI0(UChi_10,psi_20,TMP) \
  VTIMESI0(UChi_01,psi_31,TMP) \
  VTIMESI0(UChi_11,psi_21,TMP) \
  VTIMESI0(UChi_02,psi_32,TMP) \
  VTIMESI0(UChi_12,psi_22,TMP) \
  VMOV(UChi_00,psi_00) \
  VMOV(UChi_10,psi_10) \
  VMOV(UChi_01,psi_01) \
  VMOV(UChi_11,psi_11) \
  VMOV(UChi_02,psi_02) \
  VMOV(UChi_12,psi_12) \
  VTIMESI1(UChi_00,psi_30,TMP) \
  VTIMESI1(UChi_10,psi_20,TMP) \
  VTIMESI1(UChi_01,psi_31,TMP) \
  VTIMESI1(UChi_11,psi_21,TMP) \
  VTIMESI1(UChi_02,psi_32,TMP) \
  VTIMESI1(UChi_12,psi_22,TMP) \
  VTIMESI2(UChi_10,psi_20,TMP) \
  VTIMESI2(UChi_11,psi_21,TMP) \
  VTIMESI2(UChi_12,psi_22,TMP) \
  VTIMESI2(UChi_00,psi_30,TMP) \
  VTIMESI2(UChi_01,psi_31,TMP) \
  VTIMESI2(UChi_02,psi_32,TMP) \
  );

#define XM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_10,psi_20,Z0) \
  VACCTIMESI0(UChi_00,psi_30,Z3) \
  VACCTIMESI0(UChi_11,psi_21,Z1) \
  VACCTIMESI0(UChi_01,psi_31,Z4) \
  VACCTIMESI0(UChi_12,psi_22,Z2) \
  VACCTIMESI0(UChi_02,psi_32,Z5) \
  \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_12,psi_12,psi_12) \
  VADD(UChi_02,psi_02,psi_02) \
  \
  VACCTIMESI1(UChi_10,psi_20,Z0) \
  VACCTIMESI1(UChi_00,psi_30,Z3) \
  VACCTIMESI1(UChi_11,psi_21,Z1) \
  VACCTIMESI1(UChi_01,psi_31,Z4) \
  VACCTIMESI1(UChi_12,psi_22,Z2) \
  VACCTIMESI1(UChi_02,psi_32,Z5) \
  VACCTIMESI2(UChi_10,psi_20,Z0) \
  VACCTIMESI2(UChi_11,psi_21,Z1) \
  VACCTIMESI2(UChi_12,psi_22,Z2) \
  VACCTIMESI2(UChi_00,psi_30,Z3) \
  VACCTIMESI2(UChi_01,psi_31,Z4) \
  VACCTIMESI2(UChi_02,psi_32,Z5) \
  );

#define YP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VADD(UChi_10,psi_20,psi_20) \
  VADD(UChi_11,psi_21,psi_21) \
  VADD(UChi_12,psi_22,psi_22) \
  VSUB(UChi_00,psi_30,psi_30) \
  VSUB(UChi_01,psi_31,psi_31) \
  VSUB(UChi_02,psi_32,psi_32) );

#define YM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VSUB(UChi_10,psi_20,psi_20) \
  VSUB(UChi_11,psi_21,psi_21) \
  VSUB(UChi_12,psi_22,psi_22) \
  VADD(UChi_00,psi_30,psi_30) \
  VADD(UChi_01,psi_31,psi_31) \
  VADD(UChi_02,psi_32,psi_32) );

#define ZP_RECON_ACCUM __asm__ ( \
  VACCTIMESMINUSI0(UChi_00,psi_20,Z0) \
  VACCTIMESI0(UChi_10,psi_30,Z3) \
  VACCTIMESMINUSI0(UChi_01,psi_21,Z1) \
  VACCTIMESI0(UChi_11,psi_31,Z4) \
  VACCTIMESMINUSI0(UChi_02,psi_22,Z2) \
  VACCTIMESI0(UChi_12,psi_32,Z5) \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VACCTIMESMINUSI1(UChi_00,psi_20,Z0) \
  VACCTIMESI1(UChi_10,psi_30,Z3) \
  VACCTIMESMINUSI1(UChi_01,psi_21,Z1) \
  VACCTIMESI1(UChi_11,psi_31,Z4) \
  VACCTIMESMINUSI1(UChi_02,psi_22,Z2) \
  VACCTIMESI1(UChi_12,psi_32,Z5) \
  VACCTIMESMINUSI2(UChi_00,psi_20,Z0) \
  VACCTIMESMINUSI2(UChi_01,psi_21,Z1) \
  VACCTIMESMINUSI2(UChi_02,psi_22,Z2) \
  VACCTIMESI2(UChi_10,psi_30,Z3) \
  VACCTIMESI2(UChi_11,psi_31,Z4) \
  VACCTIMESI2(UChi_12,psi_32,Z5) \
  );

#define ZM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_00,psi_20,Z0) \
  VACCTIMESMINUSI0(UChi_10,psi_30,Z3) \
  VACCTIMESI0(UChi_01,psi_21,Z1) \
  VACCTIMESMINUSI0(UChi_11,psi_31,Z4) \
  VACCTIMESI0(UChi_02,psi_22,Z2) \
  VACCTIMESMINUSI0(UChi_12,psi_32,Z5) \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VACCTIMESI1(UChi_00,psi_20,Z0) \
  VACCTIMESMINUSI1(UChi_10,psi_30,Z3) \
  VACCTIMESI1(UChi_01,psi_21,Z1) \
  VACCTIMESMINUSI1(UChi_11,psi_31,Z4) \
  VACCTIMESI1(UChi_02,psi_22,Z2) \
  VACCTIMESMINUSI1(UChi_12,psi_32,Z5) \
  VACCTIMESI2(UChi_00,psi_20,Z0) \
  VACCTIMESI2(UChi_01,psi_21,Z1) \
  VACCTIMESI2(UChi_02,psi_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,psi_30,Z3) \
  VACCTIMESMINUSI2(UChi_11,psi_31,Z4) \
  VACCTIMESMINUSI2(UChi_12,psi_32,Z5) \
  );

#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VADD(UChi_00,psi_20,psi_20) \
  VADD(UChi_10,psi_30,psi_30) \
  VADD(UChi_01,psi_21,psi_21) \
  VADD(UChi_11,psi_31,psi_31) \
  VADD(UChi_02,psi_22,psi_22) \
  VADD(UChi_12,psi_32,psi_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00) \
  VADD(UChi_10,psi_10,psi_10) \
  VADD(UChi_01,psi_01,psi_01) \
  VADD(UChi_11,psi_11,psi_11) \
  VADD(UChi_02,psi_02,psi_02) \
  VADD(UChi_12,psi_12,psi_12) \
  VSUB(UChi_00,psi_20,psi_20) \
  VSUB(UChi_10,psi_30,psi_30) \
  VSUB(UChi_01,psi_21,psi_21) \
  VSUB(UChi_11,psi_31,psi_31) \
  VSUB(UChi_02,psi_22,psi_22) \
  VSUB(UChi_12,psi_32,psi_32) );

#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
#define AVX512_PF_L2_TABLE
#undef  AVX512_PF_L2_LINEAR

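// Prefetch policy selection: define exactly one of AVX512_PF_L2_TABLE
// (P1 prefetches stencil-table entries into L1, M1 pushes the next site
// into L2) or AVX512_PF_L2_LINEAR (M1/M2 prefetch assuming a linear walk
// through memory). Each branch supplies the VPREFETCH_P*/VPREFETCH_M*
// macros and there is no empty fallback, so leaving both undefined breaks
// the kernels below. AVX512_PF_L2_GAUGE independently enables the
// VPREFETCH_G1/G2 gauge-link prefetches.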
#ifdef AVX512_PF_L2_TABLE
// P1 fetches the base pointer for the next link into L1
// M1 fetches the next site pointer into L2
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_P2(A,B)
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
#define VPREFETCH_M2(A,B)
#endif

#ifdef AVX512_PF_L2_LINEAR
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
#define VPREFETCH_P1(A,B)
#define VPREFETCH_P2(A,B)
#endif

#ifdef AVX512_PF_L2_GAUGE
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
#endif

#define PF_GAUGE(A) \
  LOAD64(%r8,&U[sU](A)) \
  __asm__ ( \
  VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
  VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
  );

#define SAVE_RESULTi(PTR,pf) \
  LOAD64(%r8,PTR) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSTORE(0,%r8,psi_00)  VPREFETCH_M1(0,%r9) \
  VSTORE(1,%r8,psi_01)  VPREFETCH_M1(1,%r9) \
  VSTORE(2,%r8,psi_02)  VPREFETCH_M1(2,%r9) \
  VSTORE(3,%r8,psi_10)  VPREFETCH_M1(3,%r9) \
  VSTORE(4,%r8,psi_11)  VPREFETCH_M1(4,%r9) \
  VSTORE(5,%r8,psi_12)  VPREFETCH_M1(5,%r9) \
  VSTORE(6,%r8,psi_20)  VPREFETCH_M1(6,%r9) \
  VSTORE(7,%r8,psi_21)  VPREFETCH_M1(7,%r9) \
  VSTORE(8,%r8,psi_22)  VPREFETCH_M1(8,%r9) \
  VSTORE(9,%r8,psi_30)  VPREFETCH_M1(9,%r9) \
  VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
  VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
  );

#define ADD_RESULTi(PTR,pf) \
  LOAD_CHIMU(PTR); \
  __asm__ ( VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
            VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
            VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
            VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
  SAVE_RESULT(PTR,pf);


#define ADD_RESULTia(PTR,pf) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  VADDMEM(0,%r8,psi_00,psi_00) \
  VADDMEM(1,%r8,psi_01,psi_01) \
  VADDMEM(2,%r8,psi_02,psi_02) \
  VADDMEM(3,%r8,psi_10,psi_10) \
  VADDMEM(4,%r8,psi_11,psi_11) \
  VADDMEM(5,%r8,psi_12,psi_12) \
  VADDMEM(6,%r8,psi_20,psi_20) \
  VADDMEM(7,%r8,psi_21,psi_21) \
  VADDMEM(8,%r8,psi_22,psi_22) \
  VADDMEM(9,%r8,psi_30,psi_30) \
  VADDMEM(10,%r8,psi_31,psi_31) \
  VADDMEM(11,%r8,psi_32,psi_32) \
  VSTORE(0,%r8,psi_00) \
  VSTORE(1,%r8,psi_01) \
  VSTORE(2,%r8,psi_02) \
  VSTORE(3,%r8,psi_10) \
  VSTORE(4,%r8,psi_11) \
  VSTORE(5,%r8,psi_12) \
  VSTORE(6,%r8,psi_20) \
  VSTORE(7,%r8,psi_21) \
  VSTORE(8,%r8,psi_22) \
  VSTORE(9,%r8,psi_30) \
  VSTORE(10,%r8,psi_31) \
  VSTORE(11,%r8,psi_32) \
  );

#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCH_P1(0,%r9) \
  VPREFETCH_P1(1,%r9) \
  VPREFETCH_P1(2,%r9) \
  VPREFETCH_P1(3,%r9) \
  VPREFETCH_P1(4,%r9) \
  VPREFETCH_P1(5,%r9) \
  VPREFETCH_P1(6,%r9) \
  VPREFETCH_P1(7,%r9) \
  VPREFETCH_P1(8,%r9) \
  VPREFETCH_P1(9,%r9) \
  VPREFETCH_P1(10,%r9) \
  VPREFETCH_P1(11,%r9));

#else
#define PREFETCH_CHIMU(A)
#endif

#define PREFETCH1_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCH_P1(0,%r9) \
  VPREFETCH_P1(1,%r9) \
  VPREFETCH_P1(2,%r9) \
  VPREFETCH_P1(3,%r9) \
  VPREFETCH_P1(4,%r9) \
  VPREFETCH_P1(5,%r9) \
  VPREFETCH_P1(6,%r9) \
  VPREFETCH_P1(7,%r9) \
  VPREFETCH_P1(8,%r9) \
  VPREFETCH_P1(9,%r9) \
  VPREFETCH_P1(10,%r9) \
  VPREFETCH_P1(11,%r9));

#define PERMUTE_DIR0 __asm__ ( \
  VPERM0(Chi_00,Chi_00) \
  VPERM0(Chi_01,Chi_01) \
  VPERM0(Chi_02,Chi_02) \
  VPERM0(Chi_10,Chi_10) \
  VPERM0(Chi_11,Chi_11) \
  VPERM0(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ ( \
  VPERM1(Chi_00,Chi_00) \
  VPERM1(Chi_01,Chi_01) \
  VPERM1(Chi_02,Chi_02) \
  VPERM1(Chi_10,Chi_10) \
  VPERM1(Chi_11,Chi_11) \
  VPERM1(Chi_12,Chi_12) );

#define PERMUTE_DIR2 __asm__ ( \
  VPERM2(Chi_00,Chi_00) \
  VPERM2(Chi_01,Chi_01) \
  VPERM2(Chi_02,Chi_02) \
  VPERM2(Chi_10,Chi_10) \
  VPERM2(Chi_11,Chi_11) \
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR3 __asm__ ( \
  VPERM3(Chi_00,Chi_00) \
  VPERM3(Chi_01,Chi_01) \
  VPERM3(Chi_02,Chi_02) \
  VPERM3(Chi_10,Chi_10) \
  VPERM3(Chi_11,Chi_11) \
  VPERM3(Chi_12,Chi_12) );
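// The PERMUTE_DIR blocks apply the lane permutation VPERM<n> to the half
// spinor; they handle the stencil "permute" case, where the neighbour in a
// given direction lives in another lane of the same SIMD vector rather
// than in another site's data.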


#define MULT_ADDSUB_2SPIN(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VPREFETCH_G2(9,%r8) \
  VPREFETCH_G2(10,%r8) \
  VPREFETCH_G2(11,%r8) \
  VPREFETCH_G2(12,%r8) \
  VPREFETCH_G2(13,%r8) \
  VPREFETCH_G2(14,%r8) \
  VPREFETCH_G2(15,%r8) \
  VPREFETCH_G2(16,%r8) \
  VPREFETCH_G2(17,%r8) \
  VSHUF(Chi_00,T1) \
  VMOVIDUP(0,%r8,Z0 ) \
  VMOVIDUP(3,%r8,Z1 ) \
  VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
  /*6*/ \
  VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
  VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
  VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
  VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
  VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
  VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
  VPREFETCH_M1(0,%r9) \
  VPREFETCH_M1(1,%r9) \
  VPREFETCH_M1(2,%r9) \
  VPREFETCH_M1(3,%r9) \
  /*18*/ \
  VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
  VMADDSUB(Z3,Chi_10,UChi_10) \
  VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
  VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
  VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
  VMADDSUB(Z5,Chi_10,UChi_12) \
  VPREFETCH_M1(4,%r9) \
  VPREFETCH_M1(5,%r9) \
  VPREFETCH_M1(6,%r9) \
  VPREFETCH_M1(7,%r9) \
  /*28*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
  VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  /*38*/ \
  VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
  VMADDSUB(Z3,Chi_11,UChi_10) \
  VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
  VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
  VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
  VMADDSUB(Z5,Chi_11,UChi_12) \
  VPREFETCH_M1(9,%r8) \
  VPREFETCH_M1(10,%r8) \
  VPREFETCH_M1(11,%r8) \
  VPREFETCH_M1(12,%r8) \
  VPREFETCH_M1(13,%r8) \
  VPREFETCH_M1(14,%r8) \
  VPREFETCH_M1(15,%r8) \
  VPREFETCH_M1(16,%r8) \
  VPREFETCH_M1(17,%r8) \
  /*48*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) \
  VMADDSUB(Z2,T2,UChi_12) \
  VPREFETCH_M1(8,%r9) \
  VPREFETCH_M1(9,%r9) \
  VPREFETCH_M1(10,%r9) \
  VPREFETCH_M1(11,%r9) \
  /*55*/ \
  VMADDSUB(Z3,Chi_02,UChi_00) \
  VMADDSUB(Z3,Chi_12,UChi_10) \
  VMADDSUB(Z4,Chi_02,UChi_01) \
  VMADDSUB(Z4,Chi_12,UChi_11) \
  VMADDSUB(Z5,Chi_02,UChi_02) \
  VMADDSUB(Z5,Chi_12,UChi_12) \
  /*61 insns*/ );
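// Scalar reference for the gauge-link multiply performed above: both
// half-spinor colour vectors are multiplied by the same 3x3 complex link U,
// stored row-major as SIMD words 0..8 at %r8 (U(i,j) at word 3*i+j):
//   UChi(s,i) = sum_j U(i,j) * Chi(s,j)   for s = 0,1 and i = 0,1,2
// Each complex product is built as a madd-sub of (re U)*x and (im U)*shuf(x):
// VSHUF swaps the re/im halves, VMOVRDUP/VMOVIDUP broadcast the real and
// imaginary parts of U, and VMADDSUB supplies the sign flip, with neighbour
// (%r9) and link (%r8) prefetches interleaved to hide latency.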


#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  VPREFETCH_M1(0,%r9) \
  VPREFETCH_M1(1,%r9) \
  VPREFETCH_M1(2,%r9) \
  VPREFETCH_M1(3,%r9) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  VPREFETCH_M1(4,%r9) \
  VPREFETCH_M1(5,%r9) \
  VPREFETCH_M1(6,%r9) \
  VPREFETCH_M1(7,%r9) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  VPREFETCH_M1(8,%r9) \
  VPREFETCH_M1(9,%r9) \
  VPREFETCH_M1(10,%r9) \
  VPREFETCH_M1(11,%r9) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  VPREFETCH_M2(12,%r9) \
  VPREFETCH_M2(13,%r9) \
  VPREFETCH_M2(14,%r9) \
  VPREFETCH_M2(15,%r9) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VPREFETCH_M2(16,%r9) \
  VPREFETCH_M2(17,%r9) \
  VPREFETCH_M2(18,%r9) \
  VPREFETCH_M2(19,%r9) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  VPREFETCH_M2(20,%r9) \
  VPREFETCH_M2(21,%r9) \
  VPREFETCH_M2(22,%r9) \
  VPREFETCH_M2(23,%r9) \
  VPREFETCH_G1(2,%r8) \
  VPREFETCH_G1(3,%r8) \
  VPREFETCH_G2(4,%r8) \
  VPREFETCH_G2(5,%r8) \
  VPREFETCH_G2(6,%r8) \
  VPREFETCH_G2(7,%r8) \
  /*42 insns*/ );

#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  /* VPREFETCH1(2,%r8)*/ \
  /* VPREFETCH1(3,%r8)*/ \
  /*42 insns*/ );


#define Z6 Chi_00
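// Z6 recycles Chi_00 as a seventh accumulator: in MULT_ADDSUB_2SPIN_NEW the
// contents of Chi_00 (VIDUP'd in place at the top of the macro) are read for
// the last time immediately before Z6 is first written, so the register is
// dead at that point and can be reused.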
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  VSHUFMEM(0,%r8,Z0) \
  VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
  VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
  VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
  VSHUFMEM(3,%r8,Z0) \
  VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
  VSHUFMEM(6,%r8,Z0) \
  VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
  VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
  VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
  VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
  /*11 cycles*/ \
  VSHUFMEM(1,%r8,Z0) \
  VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
  VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
  VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
  VSHUFMEM(4,%r8,Z0) \
  VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
  VSHUFMEM(7,%r8,Z0) \
  VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
  VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
  VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
  VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
  /*22 cycles*/ \
  VSHUFMEM(2,%r8,Z0) \
  VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
  VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
  VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
  VSHUFMEM(5,%r8,Z0) \
  VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
  VSHUFMEM(8,%r8,Z0) \
  VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
  /*33 cycles*/ \
  VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
  VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
  VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
  /*stall*/ \
  /*stall*/ \
  /*stall*/ \
  VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
  VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
  VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )


#endif