Grid 0.7.0
StaggeredKernelsAsm.h
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/qcd/action/fermion/StaggeredKernelsAsm.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: paboyle <paboyle@ph.ed.ac.uk>
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License along
23 with this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25
26 See the full license in the file "LICENSE" in the top level distribution directory
27 *************************************************************************************/
28 /* END LEGAL */
29#pragma once
30
31#include <Grid/Grid.h>
32
33#ifdef AVX512
36#endif
37
38// Interleave operations from two directions.
39// This looks just like a two-spin multiply and reuses the same sequence as the Wilson
40// kernel, but the spin index becomes a mu index instead.
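// For orientation: each direction term is a 3x3 complex (SU(3)) link matrix acting on a
// colour 3-vector. A minimal scalar sketch of one such term, assuming a generic complex
// type 'cplx' and the flat row-major link layout implied by the offsets 0..8 used below
// (illustrative only, not the kernel's actual types):
//
//   for (int i = 0; i < 3; i++) {              // UChi[i] = sum_j U[3*i + j] * Chi[j]
//     cplx acc = 0;
//     for (int j = 0; j < 3; j++) acc += U[3*i + j] * Chi[j];
//     UChi[i] = acc;
//   }
//
// The macros below evaluate this for several directions at once, one direction per "spin"
// register block (Chi_0*..Chi_3* in, UChi_0*..UChi_3* out).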
41#define Chi_00 %zmm0
42#define Chi_01 %zmm1
43#define Chi_02 %zmm2
44#define Chi_10 %zmm3
45#define Chi_11 %zmm4
46#define Chi_12 %zmm5
47#define Chi_20 %zmm6
48#define Chi_21 %zmm7
49#define Chi_22 %zmm8
50#define Chi_30 %zmm9
51#define Chi_31 %zmm10
52#define Chi_32 %zmm11
53
54#define UChi_00 %zmm12
55#define UChi_01 %zmm13
56#define UChi_02 %zmm14
57#define UChi_10 %zmm15
58#define UChi_11 %zmm16
59#define UChi_12 %zmm17
60#define UChi_20 %zmm18
61#define UChi_21 %zmm19
62#define UChi_22 %zmm20
63#define UChi_30 %zmm21
64#define UChi_31 %zmm22
65#define UChi_32 %zmm23
66
67#define pChi_00 %%zmm0
68#define pChi_01 %%zmm1
69#define pChi_02 %%zmm2
70#define pChi_10 %%zmm3
71#define pChi_11 %%zmm4
72#define pChi_12 %%zmm5
73#define pChi_20 %%zmm6
74#define pChi_21 %%zmm7
75#define pChi_22 %%zmm8
76#define pChi_30 %%zmm9
77#define pChi_31 %%zmm10
78#define pChi_32 %%zmm11
79
80#define pUChi_00 %%zmm12
81#define pUChi_01 %%zmm13
82#define pUChi_02 %%zmm14
83#define pUChi_10 %%zmm15
84#define pUChi_11 %%zmm16
85#define pUChi_12 %%zmm17
86#define pUChi_20 %%zmm18
87#define pUChi_21 %%zmm19
88#define pUChi_22 %%zmm20
89#define pUChi_30 %%zmm21
90#define pUChi_31 %%zmm22
91#define pUChi_32 %%zmm23
92
93#define T0 %zmm24
94#define T1 %zmm25
95#define T2 %zmm26
96#define T3 %zmm27
97
98#define Z00 %zmm26
99#define Z10 %zmm27
100#define Z0 Z00
101#define Z1 %zmm28
102#define Z2 %zmm29
103
104#define Z3 %zmm30
105#define Z4 %zmm31
106#define Z5 Chi_31
107#define Z6 Chi_32
108
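// MULT_ADD_LS(g0,g1,g2,g3): accumulate UChi_d += U_d * Chi_d for all four direction blocks
// at once, with g0..g3 the base addresses of the four 3x3 link matrices (moved into
// %r8..%r11 first). Each complex multiply is split into the usual two fused steps; a scalar
// sketch for one link entry a = (ar,ai) and one source component b = (br,bi), illustrative
// only, since the exact lane handling lives in the AVX-512 VSHUF/VMADDSUB*DUP macros:
//
//   step 1: VSHUF gives (bi,br); the IDUP op broadcasts ai and forms t = (ai*bi, ai*br)
//   step 2: the RDUP op broadcasts ar and fused-multiply-add/subtracts against (br,bi),
//           giving (ar*br - ai*bi, ar*bi + ai*br) == a*b
//
// In the accumulating (MULT_ADD) forms the previous UChi contents enter through the first
// add/sub step, so the product is added to the running sum rather than overwriting it.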
109#define MULT_ADD_LS(g0,g1,g2,g3) \
110 asm ( "movq %0, %%r8 \n\t" \
111 "movq %1, %%r9 \n\t" \
112 "movq %2, %%r10 \n\t" \
113 "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
114 asm ( \
115 VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
116 VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
117 VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10) \
118 VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11) \
119 VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12) \
120 VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \
121 VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \
122 VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \
123 VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
124 VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
125 VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
126 VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
127 VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
128 VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
129 VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
130 VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
131 VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
132 VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
133 VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
134 VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
135 VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
136 VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
137 VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
138 VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
139 VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
140 VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
141 VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
142 VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
143 VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
144 VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
145 VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
146 VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
147 VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
148 VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
149 VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
150 VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
151 VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
152 VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
153 VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
154 VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
155 VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
156 VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
157
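// MULT_LS(g0,g1,g2,g3): same product as MULT_ADD_LS but overwrites UChi_d; the first pass
// uses a plain VMULIDUP instead of the accumulating VMADDSUBIDUP.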
158#define MULT_LS(g0,g1,g2,g3) \
159 asm ( "movq %0, %%r8 \n\t" \
160 "movq %1, %%r9 \n\t" \
161 "movq %2, %%r10 \n\t" \
162 "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
163 asm ( \
164 VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
165 VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
166 VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10) \
167 VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11) \
168 VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12) \
169 VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \
170 VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \
171 VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \
172 VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
173 VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
174 VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
175 VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
176 VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
177 VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
178 VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
179 VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
180 VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
181 VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
182 VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
183 VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
184 VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
185 VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
186 VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
187 VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
188 VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
189 VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
190 VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
191 VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
192 VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
193 VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
194 VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
195 VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
196 VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
197 VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
198 VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
199 VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
200 VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
201 VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
202 VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
203 VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
204 VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
205 VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
206
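// Two-direction (XYZT) variants follow: each call multiplies one pair of directions, so the
// kernels below issue two such calls per PREPARE block, covering the eight U (one-link) and
// eight UUU (three-link) terms. The *_XYZTa versions are alternative schedulings of the same
// arithmetic and are kept here but not referenced by the kernels in this file.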
207#define MULT_ADD_XYZTa(g0,g1) \
208 asm ( "movq %0, %%r8 \n\t" \
209 "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
210 __asm__ ( \
211 VSHUF(Chi_00,T0) \
212 VSHUF(Chi_10,T1) \
213 VMOVIDUP(0,%r8,Z0 ) \
214 VMOVIDUP(3,%r8,Z1 ) \
215 VMOVIDUP(6,%r8,Z2 ) \
216 VMADDSUB(Z0,T0,UChi_00) \
217 VMADDSUB(Z1,T0,UChi_01) \
218 VMADDSUB(Z2,T0,UChi_02) \
219 \
220 VMOVIDUP(0,%r9,Z0 ) \
221 VMOVIDUP(3,%r9,Z1 ) \
222 VMOVIDUP(6,%r9,Z2 ) \
223 VMADDSUB(Z0,T1,UChi_10) \
224 VMADDSUB(Z1,T1,UChi_11) \
225 VMADDSUB(Z2,T1,UChi_12) \
226 \
227 \
228 VMOVRDUP(0,%r8,Z3 ) \
229 VMOVRDUP(3,%r8,Z4 ) \
230 VMOVRDUP(6,%r8,Z5 ) \
231 VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/ \
232 VMADDSUB(Z4,Chi_00,UChi_01) \
233 VMADDSUB(Z5,Chi_00,UChi_02) \
234 \
235 VMOVRDUP(0,%r9,Z3 ) \
236 VMOVRDUP(3,%r9,Z4 ) \
237 VMOVRDUP(6,%r9,Z5 ) \
238 VMADDSUB(Z3,Chi_10,UChi_10) \
239 VMADDSUB(Z4,Chi_10,UChi_11)\
240 VMADDSUB(Z5,Chi_10,UChi_12) \
241 \
242 \
243 VMOVIDUP(1,%r8,Z0 ) \
244 VMOVIDUP(4,%r8,Z1 ) \
245 VMOVIDUP(7,%r8,Z2 ) \
246 VSHUF(Chi_01,T0) \
247 VMADDSUB(Z0,T0,UChi_00) \
248 VMADDSUB(Z1,T0,UChi_01) \
249 VMADDSUB(Z2,T0,UChi_02) \
250 \
251 VMOVIDUP(1,%r9,Z0 ) \
252 VMOVIDUP(4,%r9,Z1 ) \
253 VMOVIDUP(7,%r9,Z2 ) \
254 VSHUF(Chi_11,T1) \
255 VMADDSUB(Z0,T1,UChi_10) \
256 VMADDSUB(Z1,T1,UChi_11) \
257 VMADDSUB(Z2,T1,UChi_12) \
258 \
259 VMOVRDUP(1,%r8,Z3 ) \
260 VMOVRDUP(4,%r8,Z4 ) \
261 VMOVRDUP(7,%r8,Z5 ) \
262 VMADDSUB(Z3,Chi_01,UChi_00) \
263 VMADDSUB(Z4,Chi_01,UChi_01) \
264 VMADDSUB(Z5,Chi_01,UChi_02) \
265 \
266 VMOVRDUP(1,%r9,Z3 ) \
267 VMOVRDUP(4,%r9,Z4 ) \
268 VMOVRDUP(7,%r9,Z5 ) \
269 VMADDSUB(Z3,Chi_11,UChi_10) \
270 VMADDSUB(Z4,Chi_11,UChi_11) \
271 VMADDSUB(Z5,Chi_11,UChi_12) \
272 \
273 VSHUF(Chi_02,T0) \
274 VSHUF(Chi_12,T1) \
275 VMOVIDUP(2,%r8,Z0 ) \
276 VMOVIDUP(5,%r8,Z1 ) \
277 VMOVIDUP(8,%r8,Z2 ) \
278 VMADDSUB(Z0,T0,UChi_00) \
279 VMADDSUB(Z1,T0,UChi_01) \
280 VMADDSUB(Z2,T0,UChi_02) \
281 VMOVIDUP(2,%r9,Z0 ) \
282 VMOVIDUP(5,%r9,Z1 ) \
283 VMOVIDUP(8,%r9,Z2 ) \
284 VMADDSUB(Z0,T1,UChi_10) \
285 VMADDSUB(Z1,T1,UChi_11) \
286 VMADDSUB(Z2,T1,UChi_12) \
287 /*55*/ \
288 VMOVRDUP(2,%r8,Z3 ) \
289 VMOVRDUP(5,%r8,Z4 ) \
290 VMOVRDUP(8,%r8,Z5 ) \
291 VMADDSUB(Z3,Chi_02,UChi_00) \
292 VMADDSUB(Z4,Chi_02,UChi_01) \
293 VMADDSUB(Z5,Chi_02,UChi_02) \
294 VMOVRDUP(2,%r9,Z3 ) \
295 VMOVRDUP(5,%r9,Z4 ) \
296 VMOVRDUP(8,%r9,Z5 ) \
297 VMADDSUB(Z3,Chi_12,UChi_10) \
298 VMADDSUB(Z4,Chi_12,UChi_11) \
299 VMADDSUB(Z5,Chi_12,UChi_12) \
300 /*61 insns*/ );
301
302#define MULT_ADD_XYZT(g0,g1) \
303 asm ( "movq %0, %%r8 \n\t" \
304 "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
305 __asm__ ( \
306 VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
307 VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
308 VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
309 VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
310 VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
311 VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
312 VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
313 VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
314 VMADDMEM(0,%r8,T0,UChi_00) VMADDMEM(0,%r9,T1,UChi_10) \
315 VMADDMEM(3,%r8,T0,UChi_01) VMADDMEM(3,%r9,T1,UChi_11) \
316 VMADDMEM(6,%r8,T0,UChi_02) VMADDMEM(6,%r9,T1,UChi_12) \
317 VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
318 VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
319 VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
320 VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
321 VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
322 VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
323 VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
324 VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
325 VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
326 VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
327 VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
328 VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
329 VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
330 VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
331 VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
332 VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
333 VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
334 VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
335 VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
336 VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
337 VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
338 VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
339 VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
340 VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
341 VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
342
343#define MULT_XYZT(g0,g1) \
344 asm ( "movq %0, %%r8 \n\t" \
345 "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
346 __asm__ ( \
347 VSHUF(Chi_00,T0) \
348 VSHUF(Chi_10,T1) \
349 VMOVIDUP(0,%r8,Z0 ) \
350 VMOVIDUP(3,%r8,Z1 ) \
351 VMOVIDUP(6,%r8,Z2 ) \
352 /*6*/ \
353 VMUL(Z0,T0,UChi_00) \
354 VMUL(Z1,T0,UChi_01) \
355 VMUL(Z2,T0,UChi_02) \
356 VMOVIDUP(0,%r9,Z0 ) \
357 VMOVIDUP(3,%r9,Z1 ) \
358 VMOVIDUP(6,%r9,Z2 ) \
359 VMUL(Z0,T1,UChi_10) \
360 VMUL(Z1,T1,UChi_11) \
361 VMUL(Z2,T1,UChi_12) \
362 VMOVRDUP(0,%r8,Z3 ) \
363 VMOVRDUP(3,%r8,Z4 ) \
364 VMOVRDUP(6,%r8,Z5 ) \
365 /*18*/ \
366 VMADDSUB(Z3,Chi_00,UChi_00) \
367 VMADDSUB(Z4,Chi_00,UChi_01)\
368 VMADDSUB(Z5,Chi_00,UChi_02) \
369 VMOVRDUP(0,%r9,Z3 ) \
370 VMOVRDUP(3,%r9,Z4 ) \
371 VMOVRDUP(6,%r9,Z5 ) \
372 VMADDSUB(Z3,Chi_10,UChi_10) \
373 VMADDSUB(Z4,Chi_10,UChi_11)\
374 VMADDSUB(Z5,Chi_10,UChi_12) \
375 VMOVIDUP(1,%r8,Z0 ) \
376 VMOVIDUP(4,%r8,Z1 ) \
377 VMOVIDUP(7,%r8,Z2 ) \
378 /*28*/ \
379 VSHUF(Chi_01,T0) \
380 VMADDSUB(Z0,T0,UChi_00) \
381 VMADDSUB(Z1,T0,UChi_01) \
382 VMADDSUB(Z2,T0,UChi_02) \
383 VMOVIDUP(1,%r9,Z0 ) \
384 VMOVIDUP(4,%r9,Z1 ) \
385 VMOVIDUP(7,%r9,Z2 ) \
386 VSHUF(Chi_11,T1) \
387 VMADDSUB(Z0,T1,UChi_10) \
388 VMADDSUB(Z1,T1,UChi_11) \
389 VMADDSUB(Z2,T1,UChi_12) \
390 VMOVRDUP(1,%r8,Z3 ) \
391 VMOVRDUP(4,%r8,Z4 ) \
392 VMOVRDUP(7,%r8,Z5 ) \
393 /*38*/ \
394 VMADDSUB(Z3,Chi_01,UChi_00) \
395 VMADDSUB(Z4,Chi_01,UChi_01) \
396 VMADDSUB(Z5,Chi_01,UChi_02) \
397 VMOVRDUP(1,%r9,Z3 ) \
398 VMOVRDUP(4,%r9,Z4 ) \
399 VMOVRDUP(7,%r9,Z5 ) \
400 VMADDSUB(Z3,Chi_11,UChi_10) \
401 VMADDSUB(Z4,Chi_11,UChi_11) \
402 VMADDSUB(Z5,Chi_11,UChi_12) \
403 /*48*/ \
404 VSHUF(Chi_02,T0) \
405 VSHUF(Chi_12,T1) \
406 VMOVIDUP(2,%r8,Z0 ) \
407 VMOVIDUP(5,%r8,Z1 ) \
408 VMOVIDUP(8,%r8,Z2 ) \
409 VMADDSUB(Z0,T0,UChi_00) \
410 VMADDSUB(Z1,T0,UChi_01) \
411 VMADDSUB(Z2,T0,UChi_02) \
412 VMOVIDUP(2,%r9,Z0 ) \
413 VMOVIDUP(5,%r9,Z1 ) \
414 VMOVIDUP(8,%r9,Z2 ) \
415 VMADDSUB(Z0,T1,UChi_10) \
416 VMADDSUB(Z1,T1,UChi_11) \
417 VMADDSUB(Z2,T1,UChi_12) \
418 /*55*/ \
419 VMOVRDUP(2,%r8,Z3 ) \
420 VMOVRDUP(5,%r8,Z4 ) \
421 VMOVRDUP(8,%r8,Z5 ) \
422 VMADDSUB(Z3,Chi_02,UChi_00) \
423 VMADDSUB(Z4,Chi_02,UChi_01) \
424 VMADDSUB(Z5,Chi_02,UChi_02) \
425 VMOVRDUP(2,%r9,Z3 ) \
426 VMOVRDUP(5,%r9,Z4 ) \
427 VMOVRDUP(8,%r9,Z5 ) \
428 VMADDSUB(Z3,Chi_12,UChi_10) \
429 VMADDSUB(Z4,Chi_12,UChi_11) \
430 VMADDSUB(Z5,Chi_12,UChi_12) \
431 /*61 insns*/ );
432
433#define MULT_XYZTa(g0,g1) \
434 asm ( "movq %0, %%r8 \n\t" \
435 "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
436 __asm__ ( \
437 VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
438 VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
439 VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
440 VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
441 VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
442 VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
443 VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
444 VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
445 VMULMEM(0,%r8,T0,UChi_00) VMULMEM(0,%r9,T1,UChi_10) \
446 VMULMEM(3,%r8,T0,UChi_01) VMULMEM(3,%r9,T1,UChi_11) \
447 VMULMEM(6,%r8,T0,UChi_02) VMULMEM(6,%r9,T1,UChi_12) \
448 VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
449 VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
450 VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
451 VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
452 VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
453 VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
454 VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
455 VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
456 VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
457 VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
458 VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
459 VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
460 VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
461 VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
462 VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
463 VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
464 VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
465 VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
466 VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
467 VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
468 VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
469 VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
470 VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
471 VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
472 VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
473
474
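// LOAD_CHI loads the colour 3-vectors of four neighbour sites into the Chi_0*..Chi_3*
// blocks (used by the LS kernels); LOAD_CHIa loads two sites into Chi_0*/Chi_1* (used by
// the XYZT kernels).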
475#define LOAD_CHI(a0,a1,a2,a3) \
476 asm ( \
477 "movq %0, %%r8 \n\t" \
478 VLOAD(0,%%r8,pChi_00) \
479 VLOAD(1,%%r8,pChi_01) \
480 VLOAD(2,%%r8,pChi_02) \
481 : : "r" (a0) : "%r8" ); \
482 asm ( \
483 "movq %0, %%r8 \n\t" \
484 VLOAD(0,%%r8,pChi_10) \
485 VLOAD(1,%%r8,pChi_11) \
486 VLOAD(2,%%r8,pChi_12) \
487 : : "r" (a1) : "%r8" ); \
488 asm ( \
489 "movq %0, %%r8 \n\t" \
490 VLOAD(0,%%r8,pChi_20) \
491 VLOAD(1,%%r8,pChi_21) \
492 VLOAD(2,%%r8,pChi_22) \
493 : : "r" (a2) : "%r8" ); \
494 asm ( \
495 "movq %0, %%r8 \n\t" \
496 VLOAD(0,%%r8,pChi_30) \
497 VLOAD(1,%%r8,pChi_31) \
498 VLOAD(2,%%r8,pChi_32) \
499 : : "r" (a3) : "%r8" );
500
501#define LOAD_CHIa(a0,a1) \
502 asm ( \
503 "movq %0, %%r8 \n\t" \
504 VLOAD(0,%%r8,pChi_00) \
505 VLOAD(1,%%r8,pChi_01) \
506 VLOAD(2,%%r8,pChi_02) \
507 : : "r" (a0) : "%r8" ); \
508 asm ( \
509 "movq %0, %%r8 \n\t" \
510 VLOAD(0,%%r8,pChi_10) \
511 VLOAD(1,%%r8,pChi_11) \
512 VLOAD(2,%%r8,pChi_12) \
513 : : "r" (a1) : "%r8" );
514
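// Prefetch helpers: PF_CHI and PF_GAUGE_* expand to nothing by default; the trailing-'a'
// versions issue VPREFETCH1 streams and can be enabled by swapping the names used in
// PREPARE / PREPARE_XYZT / PREPARE_LS.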
515#define PF_CHI(a0)
516#define PF_CHIa(a0) \
517 asm ( \
518 "movq %0, %%r8 \n\t" \
519 VPREFETCH1(0,%%r8) \
520 VPREFETCH1(1,%%r8) \
521 VPREFETCH1(2,%%r8) \
522 : : "r" (a0) : "%r8" ); \
523
524#define PF_GAUGE_XYZT(a0)
525#define PF_GAUGE_XYZTa(a0) \
526 asm ( \
527 "movq %0, %%r8 \n\t" \
528 VPREFETCH1(0,%%r8) \
529 VPREFETCH1(1,%%r8) \
530 VPREFETCH1(2,%%r8) \
531 VPREFETCH1(3,%%r8) \
532 VPREFETCH1(4,%%r8) \
533 VPREFETCH1(5,%%r8) \
534 VPREFETCH1(6,%%r8) \
535 VPREFETCH1(7,%%r8) \
536 VPREFETCH1(8,%%r8) \
537 : : "r" (a0) : "%r8" ); \
538
539#define PF_GAUGE_LS(a0)
540#define PF_GAUGE_LSa(a0) \
541 asm ( \
542 "movq %0, %%r8 \n\t" \
543 VPREFETCH1(0,%%r8) \
544 VPREFETCH1(1,%%r8) \
545 : : "r" (a0) : "%r8" ); \
546
547
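// REDUCE / REDUCEa sum the per-direction partial results into UChi_00..UChi_02 and store
// the three colour vectors to 'out'; REDUCE folds four direction blocks (LS kernels),
// REDUCEa two (XYZT kernels). The nREDUCE* forms negate the sum before storing (see the
// FIXME below) and are selected on the dag path.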
548#define REDUCE(out) \
549 asm ( \
550 VADD(UChi_00,UChi_10,UChi_00) \
551 VADD(UChi_01,UChi_11,UChi_01) \
552 VADD(UChi_02,UChi_12,UChi_02) \
553 VADD(UChi_30,UChi_20,UChi_30) \
554 VADD(UChi_31,UChi_21,UChi_31) \
555 VADD(UChi_32,UChi_22,UChi_32) \
556 VADD(UChi_00,UChi_30,UChi_00) \
557 VADD(UChi_01,UChi_31,UChi_01) \
558 VADD(UChi_02,UChi_32,UChi_02) ); \
559 asm ( \
560 VSTORE(0,%0,pUChi_00) \
561 VSTORE(1,%0,pUChi_01) \
562 VSTORE(2,%0,pUChi_02) \
563 : : "r" (out) : "memory" );
564
565#define nREDUCE(out) \
566 asm ( \
567 VADD(UChi_00,UChi_10,UChi_00) \
568 VADD(UChi_01,UChi_11,UChi_01) \
569 VADD(UChi_02,UChi_12,UChi_02) \
570 VADD(UChi_30,UChi_20,UChi_30) \
571 VADD(UChi_31,UChi_21,UChi_31) \
572 VADD(UChi_32,UChi_22,UChi_32) \
573 VADD(UChi_00,UChi_30,UChi_00) \
574 VADD(UChi_01,UChi_31,UChi_01) \
575 VADD(UChi_02,UChi_32,UChi_02) ); \
576 asm (VZERO(Chi_00) \
577 VSUB(UChi_00,Chi_00,UChi_00) \
578 VSUB(UChi_01,Chi_00,UChi_01) \
579 VSUB(UChi_02,Chi_00,UChi_02) ); \
580 asm ( \
581 VSTORE(0,%0,pUChi_00) \
582 VSTORE(1,%0,pUChi_01) \
583 VSTORE(2,%0,pUChi_02) \
584 : : "r" (out) : "memory" );
585
586#define REDUCEa(out) \
587 asm ( \
588 VADD(UChi_00,UChi_10,UChi_00) \
589 VADD(UChi_01,UChi_11,UChi_01) \
590 VADD(UChi_02,UChi_12,UChi_02) ); \
591 asm ( \
592 VSTORE(0,%0,pUChi_00) \
593 VSTORE(1,%0,pUChi_01) \
594 VSTORE(2,%0,pUChi_02) \
595 : : "r" (out) : "memory" );
596
597// FIXME is sign right in the VSUB ?
598#define nREDUCEa(out) \
599 asm ( \
600 VADD(UChi_00,UChi_10,UChi_00) \
601 VADD(UChi_01,UChi_11,UChi_01) \
602 VADD(UChi_02,UChi_12,UChi_02) ); \
603 asm (VZERO(Chi_00) \
604 VSUB(UChi_00,Chi_00,UChi_00) \
605 VSUB(UChi_01,Chi_00,UChi_01) \
606 VSUB(UChi_02,Chi_00,UChi_02) ); \
607 asm ( \
608 VSTORE(0,%0,pUChi_00) \
609 VSTORE(1,%0,pUChi_01) \
610 VSTORE(2,%0,pUChi_02) \
611 : : "r" (out) : "memory" );
612
613#define PERMUTE_DIR(dir) \
614 permute##dir(Chi_0,Chi_0);\
615 permute##dir(Chi_1,Chi_1);\
616 permute##dir(Chi_2,Chi_2);
617
618NAMESPACE_BEGIN(Grid);
 619
620template <class Impl>
621void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
622 DoubledGaugeFieldView &U,
623 DoubledGaugeFieldView &UUU,
624 SiteSpinor *buf, int sF,
625 int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
626{
627 assert(0);
628};
629
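// The generic template is a stub (assert(0)); only the explicit specialisations further
// down provide AVX-512 bodies, and those in turn assert unless AVX512 is defined.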
630
631//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }
632
633#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
634
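// CONDITIONAL_MOVE: a stencil entry flagged _is_local points straight into the input field
// via in_p; otherwise the neighbour has already been gathered into the halo buffer 'buf'.
// Either way the resulting address is what LOAD_CHI* reads from.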
635#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \
636 PREPARE(X,Y,Z,T,skew,UU); \
637 PF_GAUGE_XYZT(gauge0); \
638 PF_GAUGE_XYZT(gauge1); \
639 PF_GAUGE_XYZT(gauge2); \
640 PF_GAUGE_XYZT(gauge3);
641
642#define PREPARE_LS(X,Y,Z,T,skew,UU) \
643 PREPARE(X,Y,Z,T,skew,UU); \
644 PF_GAUGE_LS(gauge0); \
645 PF_GAUGE_LS(gauge1); \
646 PF_GAUGE_LS(gauge2); \
647 PF_GAUGE_LS(gauge3);
648
649#define PREPARE(X,Y,Z,T,skew,UU) \
650 SE0=st.GetEntry(ptype,X+skew,sF); \
651 o0 = SE0->_offset; \
652 l0 = SE0->_is_local; \
653 p0 = SE0->_permute; \
654 CONDITIONAL_MOVE(l0,o0,addr0); \
655 PF_CHI(addr0); \
656 \
657 SE1=st.GetEntry(ptype,Y+skew,sF); \
658 o1 = SE1->_offset; \
659 l1 = SE1->_is_local; \
660 p1 = SE1->_permute; \
661 CONDITIONAL_MOVE(l1,o1,addr1); \
662 PF_CHI(addr1); \
663 \
664 SE2=st.GetEntry(ptype,Z+skew,sF); \
665 o2 = SE2->_offset; \
666 l2 = SE2->_is_local; \
667 p2 = SE2->_permute; \
668 CONDITIONAL_MOVE(l2,o2,addr2); \
669 PF_CHI(addr2); \
670 \
671 SE3=st.GetEntry(ptype,T+skew,sF); \
672 o3 = SE3->_offset; \
673 l3 = SE3->_is_local; \
674 p3 = SE3->_permute; \
675 CONDITIONAL_MOVE(l3,o3,addr3); \
676 PF_CHI(addr3); \
677 \
678 gauge0 =(uint64_t)&UU[sU]( X ); \
679 gauge1 =(uint64_t)&UU[sU]( Y ); \
680 gauge2 =(uint64_t)&UU[sU]( Z ); \
681 gauge3 =(uint64_t)&UU[sU]( T );
682
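// PREPARE fetches the four stencil entries for directions X,Y,Z,T (skew shifts into the
// second set of stencil legs used for the UUU links), records their offset / locality /
// permute flags, resolves the source addresses via CONDITIONAL_MOVE, optionally prefetches
// them, and takes the four link-matrix base addresses from UU[sU].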
683#undef STAG_VEC5D
684#ifdef STAG_VEC5D
685 // This is the single precision 5th direction vectorised kernel
687template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
688 DoubledGaugeFieldView &U,
689 DoubledGaugeFieldView &UUU,
690 SiteSpinor *buf, int sF,
691 int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
692{
693#ifdef AVX512
694 uint64_t gauge0,gauge1,gauge2,gauge3;
695 uint64_t addr0,addr1,addr2,addr3;
696 const SiteSpinor *in_p; in_p = &in[0];
697
698 int o0,o1,o2,o3; // offsets
699 int l0,l1,l2,l3; // local
700 int p0,p1,p2,p3; // perm
701 int ptype;
702 StencilEntry *SE0;
703 StencilEntry *SE1;
704 StencilEntry *SE2;
705 StencilEntry *SE3;
706
707 // for(int s=0;s<LLs;s++){
708
709 // int sF=s+LLs*sU;
710 {
711 // Xp, Yp, Zp, Tp
712 PREPARE(Xp,Yp,Zp,Tp,0,U);
713 LOAD_CHI(addr0,addr1,addr2,addr3);
714 MULT_LS(gauge0,gauge1,gauge2,gauge3);
715
716 PREPARE(Xm,Ym,Zm,Tm,0,U);
717 LOAD_CHI(addr0,addr1,addr2,addr3);
718 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
719
720 PREPARE(Xp,Yp,Zp,Tp,8,UUU);
721 LOAD_CHI(addr0,addr1,addr2,addr3);
722 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
723
724 PREPARE(Xm,Ym,Zm,Tm,8,UUU);
725 LOAD_CHI(addr0,addr1,addr2,addr3);
726 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
727
728 addr0 = (uint64_t) &out[sF];
729 if ( dag ) {
730 nREDUCE(addr0);
731 } else {
732 REDUCE(addr0);
733 }
734 }
735#else
736 assert(0);
737#endif
738
739}
740
742template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
743 DoubledGaugeFieldView &U,
744 DoubledGaugeFieldView &UUU,
745 SiteSpinor *buf, int sF,
746 int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
747{
748#ifdef AVX512
749 uint64_t gauge0,gauge1,gauge2,gauge3;
750 uint64_t addr0,addr1,addr2,addr3;
751 const SiteSpinor *in_p; in_p = &in[0];
752
753 int o0,o1,o2,o3; // offsets
754 int l0,l1,l2,l3; // local
755 int p0,p1,p2,p3; // perm
756 int ptype;
757 StencilEntry *SE0;
758 StencilEntry *SE1;
759 StencilEntry *SE2;
760 StencilEntry *SE3;
761
762 // for(int s=0;s<LLs;s++){
763 // int sF=s+LLs*sU;
764 {
765 // Xp, Yp, Zp, Tp
766 PREPARE(Xp,Yp,Zp,Tp,0,U);
767 LOAD_CHI(addr0,addr1,addr2,addr3);
768 MULT_LS(gauge0,gauge1,gauge2,gauge3);
769
770 PREPARE(Xm,Ym,Zm,Tm,0,U);
771 LOAD_CHI(addr0,addr1,addr2,addr3);
772 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
773
774 PREPARE(Xp,Yp,Zp,Tp,8,UUU);
775 LOAD_CHI(addr0,addr1,addr2,addr3);
776 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
777
778 PREPARE(Xm,Ym,Zm,Tm,8,UUU);
779 LOAD_CHI(addr0,addr1,addr2,addr3);
780 MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
781
782 addr0 = (uint64_t) &out[sF];
783 if ( dag ) {
784 nREDUCE(addr0);
785 } else {
786 REDUCE(addr0);
787 }
788 }
789#else
790 assert(0);
791#endif
792}
793
794#endif
795
796
797#define PERMUTE_DIR3 __asm__ ( \
798 VPERM3(Chi_00,Chi_00) \
799 VPERM3(Chi_01,Chi_01) \
800 VPERM3(Chi_02,Chi_02) );
801
802#define PERMUTE_DIR2 __asm__ ( \
803 VPERM2(Chi_10,Chi_10) \
804 VPERM2(Chi_11,Chi_11) \
805 VPERM2(Chi_12,Chi_12) );
806
807#define PERMUTE_DIR1 __asm__ ( \
808 VPERM1(Chi_00,Chi_00) \
809 VPERM1(Chi_01,Chi_01) \
810 VPERM1(Chi_02,Chi_02) );
811
812#define PERMUTE_DIR0 __asm__ ( \
813 VPERM0(Chi_10,Chi_10) \
814 VPERM0(Chi_11,Chi_11) \
815 VPERM0(Chi_12,Chi_12) );
816
817#define PERMUTE01 \
818 if ( p0 ) { PERMUTE_DIR3; }\
819 if ( p1 ) { PERMUTE_DIR2; }
820
821#define PERMUTE23 \
822 if ( p2 ) { PERMUTE_DIR1; }\
823 if ( p3 ) { PERMUTE_DIR0; }
824
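// PERMUTE01 / PERMUTE23 apply the SIMD-lane permutation needed when a neighbour wraps
// around a vectorised lattice direction; p0..p3 are the per-direction permute flags filled
// in by PREPARE, and PERMUTE_DIR0..3 act on the register blocks most recently loaded by
// LOAD_CHIa.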
 825 // This is the single precision kernel
826
828template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
829 DoubledGaugeFieldView &U,
830 DoubledGaugeFieldView &UUU,
831 SiteSpinor *buf, int sF,
832 int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
833{
834#ifdef AVX512
835 uint64_t gauge0,gauge1,gauge2,gauge3;
836 uint64_t addr0,addr1,addr2,addr3;
837 const SiteSpinor *in_p; in_p = &in[0];
838
839 int o0,o1,o2,o3; // offsets
840 int l0,l1,l2,l3; // local
841 int p0,p1,p2,p3; // perm
842 int ptype;
843 StencilEntry *SE0;
844 StencilEntry *SE1;
845 StencilEntry *SE2;
846 StencilEntry *SE3;
847
848 // for(int s=0;s<LLs;s++){
849 // int sF=s+LLs*sU;
850 {
851 // Xp, Yp, Zp, Tp
852 PREPARE(Xp,Yp,Zp,Tp,0,U);
853 LOAD_CHIa(addr0,addr1);
854 PERMUTE01;
855 MULT_XYZT(gauge0,gauge1);
856 LOAD_CHIa(addr2,addr3);
857 PERMUTE23;
858 MULT_ADD_XYZT(gauge2,gauge3);
859
860 PREPARE(Xm,Ym,Zm,Tm,0,U);
861 LOAD_CHIa(addr0,addr1);
862 PERMUTE01;
863 MULT_ADD_XYZT(gauge0,gauge1);
864 LOAD_CHIa(addr2,addr3);
865 PERMUTE23;
866 MULT_ADD_XYZT(gauge2,gauge3);
867
868 PREPARE(Xp,Yp,Zp,Tp,8,UUU);
869 LOAD_CHIa(addr0,addr1);
870 PERMUTE01;
871 MULT_ADD_XYZT(gauge0,gauge1);
872 LOAD_CHIa(addr2,addr3);
873 PERMUTE23;
874 MULT_ADD_XYZT(gauge2,gauge3);
875
876 PREPARE(Xm,Ym,Zm,Tm,8,UUU);
877 LOAD_CHIa(addr0,addr1);
878 PERMUTE01;
879 MULT_ADD_XYZT(gauge0,gauge1);
880 LOAD_CHIa(addr2,addr3);
881 PERMUTE23;
882 MULT_ADD_XYZT(gauge2,gauge3);
883
884 addr0 = (uint64_t) &out[sF];
885 if ( dag ) {
886 nREDUCEa(addr0);
887 } else {
888 REDUCEa(addr0);
889 }
890 }
891#else
892 assert(0);
893#endif
894}
895
897template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
898 DoubledGaugeFieldView &U,
899 DoubledGaugeFieldView &UUU,
900 SiteSpinor *buf, int sF,
901 int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
902{
903#ifdef AVX512
904 uint64_t gauge0,gauge1,gauge2,gauge3;
905 uint64_t addr0,addr1,addr2,addr3;
906 const SiteSpinor *in_p; in_p = &in[0];
907
908 int o0,o1,o2,o3; // offsets
909 int l0,l1,l2,l3; // local
910 int p0,p1,p2,p3; // perm
911 int ptype;
912 StencilEntry *SE0;
913 StencilEntry *SE1;
914 StencilEntry *SE2;
915 StencilEntry *SE3;
916
917 // for(int s=0;s<LLs;s++){
918 // int sF=s+LLs*sU;
919 {
920 // Xp, Yp, Zp, Tp
921 PREPARE(Xp,Yp,Zp,Tp,0,U);
922 LOAD_CHIa(addr0,addr1);
923 PERMUTE01;
924 MULT_XYZT(gauge0,gauge1);
925 LOAD_CHIa(addr2,addr3);
926 PERMUTE23;
927 MULT_ADD_XYZT(gauge2,gauge3);
928
929 PREPARE(Xm,Ym,Zm,Tm,0,U);
930 LOAD_CHIa(addr0,addr1);
931 PERMUTE01;
932 MULT_ADD_XYZT(gauge0,gauge1);
933 LOAD_CHIa(addr2,addr3);
934 PERMUTE23;
935 MULT_ADD_XYZT(gauge2,gauge3);
936
937 PREPARE(Xp,Yp,Zp,Tp,8,UUU);
938 LOAD_CHIa(addr0,addr1);
939 PERMUTE01;
940 MULT_ADD_XYZT(gauge0,gauge1);
941 LOAD_CHIa(addr2,addr3);
942 PERMUTE23;
943 MULT_ADD_XYZT(gauge2,gauge3);
944
945 PREPARE(Xm,Ym,Zm,Tm,8,UUU);
946 LOAD_CHIa(addr0,addr1);
947 PERMUTE01;
948 MULT_ADD_XYZT(gauge0,gauge1);
949 LOAD_CHIa(addr2,addr3);
950 PERMUTE23;
951 MULT_ADD_XYZT(gauge2,gauge3);
952
953 addr0 = (uint64_t) &out[sF];
954 if ( dag ) {
955 nREDUCEa(addr0);
956 } else {
957 REDUCEa(addr0);
958 }
959 }
960#else
961 assert(0);
962#endif
963}
964
965NAMESPACE_END(Grid);
 966