/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: Fujitsu_A64FX_intrin_double.h

    Copyright (C) 2020

Author: Nils Meyer <nils.meyer@ur.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A);
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
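// Editorial sketch (not part of the generated header): Grid's Wilson kernel
// code is expected to expand the mappings above once per direction leg,
// roughly in the order below. Names and arguments here are illustrative
// only; the authoritative sequence lives in the generated kernels.
//
//   PREFETCH_CHIMU(base);
//   LOAD_CHIMU(base);               // 12 vectors: the full 4-spinor
//   XP_PROJ;                        // project onto the 2-spinor half
//   MAYBEPERM(PERMUTE_DIR0, perm);  // cross-lane shuffle if the neighbour
//                                   //   occupies other SIMD lanes
//   MULT_2SPIN_1(Xp); MULT_2SPIN_2; // SU(3) multiply on both half-spinors
//   XP_RECON;                       // reconstruct the full 4-spinor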
// DECLARATIONS
#define DECLARATIONS_A64FXd \
    uint64_t baseU; \
    const uint64_t lut[4][8] = { \
        {4, 5, 6, 7, 0, 1, 2, 3}, \
        {2, 3, 0, 1, 6, 7, 4, 5}, \
        {1, 0, 3, 2, 5, 4, 7, 6}, \
        {0, 1, 2, 3, 4, 5, 6, 7} };\
    svfloat64_t result_00; \
    svfloat64_t result_01; \
    svfloat64_t result_02; \
    svfloat64_t result_10; \
    svfloat64_t result_11; \
    svfloat64_t result_12; \
    svfloat64_t result_20; \
    svfloat64_t result_21; \
    svfloat64_t result_22; \
    svfloat64_t result_30; \
    svfloat64_t result_31; \
    svfloat64_t result_32; \
    svfloat64_t Chi_00; \
    svfloat64_t Chi_01; \
    svfloat64_t Chi_02; \
    svfloat64_t Chi_10; \
    svfloat64_t Chi_11; \
    svfloat64_t Chi_12; \
    svfloat64_t UChi_00; \
    svfloat64_t UChi_01; \
    svfloat64_t UChi_02; \
    svfloat64_t UChi_10; \
    svfloat64_t UChi_11; \
    svfloat64_t UChi_12; \
    svfloat64_t U_00; \
    svfloat64_t U_10; \
    svfloat64_t U_20; \
    svfloat64_t U_01; \
    svfloat64_t U_11; \
    svfloat64_t U_21; \
    svbool_t pg1; \
    pg1 = svptrue_b64(); \
    svuint64_t table0; \
    svfloat64_t zero0; \
    zero0 = svdup_f64(0.);

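// Note on the lookup tables: each row feeds svtbl to shuffle one 512-bit
// vector of 8 doubles (4 complex numbers); lane j of the result takes its
// value from lane lut[i][j] of the source.
//   lut[0] swaps the two 256-bit halves (complex pairs {0,1} <-> {2,3}),
//   lut[1] swaps neighbouring complex numbers (0 <-> 1, 2 <-> 3),
//   lut[2] swaps re/im within each complex number,
//   lut[3] is the identity (corrected here from a garbled row that skipped
//          index 3 and included the out-of-range index 8; it is never
//          loaded in the double-precision kernel, since MAYBEPERM skips
//          Dir == 3 and LOAD_TABLE only dispatches Dir 0..2).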
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
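// The third and fourth spin components of Chimu alias the Chi and UChi
// registers rather than getting registers of their own: once a projector
// has folded Chimu_2x/Chimu_3x into the half-spinor, those registers are
// reused for the gauge multiply, presumably to keep the working set within
// the 32 SVE vector registers.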
// RESULT
#define RESULT_A64FXd(base) \
{ \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \
}
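// Addressing note: one SVE vector at 512-bit width is 64 bytes, and a site
// spinor is 4 spins x 3 colours = 12 vectors = 768 bytes. The pointer is
// biased to the middle of the spinor (2 * 3 * 64 = 384 bytes) so that all
// twelve accesses use vnum offsets in -6..+5, inside the signed immediate
// range (-8..7 vector lengths) of the underlying LD1/ST1 addressing mode.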
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
    Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \
    Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \
    Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \
    Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \
    Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \
    Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
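// The loads above are deliberately interleaved (00, 30, 10, 20, ...) rather
// than sequential: each upper spin component is fetched next to the lower
// component a projector will combine it with, presumably to shorten the
// load-to-use dependency chains on the critical path.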
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
    const SiteSpinor & ref(in[offset]); \
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
    const SiteSpinor & ref(in[offset]); \
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
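// Note: the 0213 and 0312 variants take no `base` argument; they expect
// `base` to already be in scope at the expansion site, and the `ref` they
// declare is unused. Only LOAD_CHIMU_INTERLEAVED is wired up through the
// LOAD_CHIMU mapping above.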
// LOAD_TABLE0
#define LOAD_TABLE0 \
    table0 = svld1(pg1, (uint64_t*)&lut[0]);

// LOAD_TABLE1
#define LOAD_TABLE1 \
    table0 = svld1(pg1, (uint64_t*)&lut[1]);

// LOAD_TABLE2
#define LOAD_TABLE2 \
    table0 = svld1(pg1, (uint64_t*)&lut[2]);

// LOAD_TABLE3
#define LOAD_TABLE3 \
    table0 = svld1(pg1, (uint64_t*)&lut[3]);

// PERMUTE
#define PERMUTE_A64FXd \
    Chi_00 = svtbl(Chi_00, table0); \
    Chi_01 = svtbl(Chi_01, table0); \
    Chi_02 = svtbl(Chi_02, table0); \
    Chi_10 = svtbl(Chi_10, table0); \
    Chi_11 = svtbl(Chi_11, table0); \
    Chi_12 = svtbl(Chi_12, table0);

// LOAD_GAUGE
#define LOAD_GAUGE(A) \
{ \
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXd(A) \
{ \
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
    UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
    UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
    UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
    UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
    UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
    UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \
}
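// Complex arithmetic via FCMLA: with re/im interleaved in even/odd lanes,
// svcmla with rotation 0 accumulates (a.re*b.re, a.re*b.im) and rotation 90
// accumulates (-a.im*b.im, a.im*b.re), so the pair of calls yields
// acc += a * b as a full complex multiply-accumulate. MULT_2SPIN_1 applies
// this for the first colour column of U on both half-spinors (the second
// column is loaded alongside), then reuses U_00/U_10/U_20 to preload the
// third column for MULT_2SPIN_2 to consume.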
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
}
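// svcadd rotations implement multiplication of the second operand by +/- i
// per complex lane: rotation 90 gives a + i*b, rotation 270 gives a - i*b.
// XP_PROJ therefore builds the upper half of the (1 + gamma_x) projection,
// e.g. Chi_0c = Chimu_0c + i * Chimu_3c, the usual Wilson trick of carrying
// only the two independent spin components through the gauge multiply.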
// XP_RECON
#define XP_RECON_A64FXd \
    result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
    result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
    result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
    result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
    result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
    result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
    result_00 = UChi_00; \
    result_01 = UChi_01; \
    result_02 = UChi_02; \
    result_10 = UChi_10; \
    result_11 = UChi_11; \
    result_12 = UChi_12;

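// Reconstruction restores the dependent spin components from the multiplied
// half-spinor: for the Xp leg, result_2c = -i * UChi_1c and
// result_3c = -i * UChi_0c (svcadd with rotation 270 into a zeroed
// accumulator), while the independent components pass through unchanged.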
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
    result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
    result_12 = svadd_x(pg1, result_12, UChi_12);

// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
}
// XM_RECON
#define XM_RECON_A64FXd \
    result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
    result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
    result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
    result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
    result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
    result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
    result_00 = UChi_00; \
    result_01 = UChi_01; \
    result_02 = UChi_02; \
    result_10 = UChi_10; \
    result_11 = UChi_11; \
    result_12 = UChi_12;

// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
    result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
    result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
    result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
    result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
    result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
    result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_12 = svadd_x(pg1, result_12, UChi_12);

// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_30 = svsub_x(pg1, result_30, UChi_00); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_31 = svsub_x(pg1, result_31, UChi_01); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_32 = svsub_x(pg1, result_32, UChi_02); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_20 = svadd_x(pg1, result_20, UChi_10); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_21 = svadd_x(pg1, result_21, UChi_11); \
    result_12 = svadd_x(pg1, result_12, UChi_12); \
    result_22 = svadd_x(pg1, result_22, UChi_12);

// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_30 = svadd_x(pg1, result_30, UChi_00); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_31 = svadd_x(pg1, result_31, UChi_01); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_32 = svadd_x(pg1, result_32, UChi_02); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_20 = svsub_x(pg1, result_20, UChi_10); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_21 = svsub_x(pg1, result_21, UChi_11); \
    result_12 = svadd_x(pg1, result_12, UChi_12); \
    result_22 = svsub_x(pg1, result_22, UChi_12);

// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
    result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
    result_12 = svadd_x(pg1, result_12, UChi_12);

// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
    result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
    result_12 = svadd_x(pg1, result_12, UChi_12);

// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_20 = svadd_x(pg1, result_20, UChi_00); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_21 = svadd_x(pg1, result_21, UChi_01); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_22 = svadd_x(pg1, result_22, UChi_02); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_30 = svadd_x(pg1, result_30, UChi_10); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_31 = svadd_x(pg1, result_31, UChi_11); \
    result_12 = svadd_x(pg1, result_12, UChi_12); \
    result_32 = svadd_x(pg1, result_32, UChi_12);

// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
    result_00 = svadd_x(pg1, result_00, UChi_00); \
    result_20 = svsub_x(pg1, result_20, UChi_00); \
    result_01 = svadd_x(pg1, result_01, UChi_01); \
    result_21 = svsub_x(pg1, result_21, UChi_01); \
    result_02 = svadd_x(pg1, result_02, UChi_02); \
    result_22 = svsub_x(pg1, result_22, UChi_02); \
    result_10 = svadd_x(pg1, result_10, UChi_10); \
    result_30 = svsub_x(pg1, result_30, UChi_10); \
    result_11 = svadd_x(pg1, result_11, UChi_11); \
    result_31 = svsub_x(pg1, result_31, UChi_11); \
    result_12 = svadd_x(pg1, result_12, UChi_12); \
    result_32 = svsub_x(pg1, result_32, UChi_12);

// ZERO_PSI
#define ZERO_PSI_A64FXd \
    result_00 = svdup_f64(0.); \
    result_01 = svdup_f64(0.); \
    result_02 = svdup_f64(0.); \
    result_10 = svdup_f64(0.); \
    result_11 = svdup_f64(0.); \
    result_12 = svdup_f64(0.); \
    result_20 = svdup_f64(0.); \
    result_21 = svdup_f64(0.); \
    result_22 = svdup_f64(0.); \
    result_30 = svdup_f64(0.); \
    result_31 = svdup_f64(0.); \
    result_32 = svdup_f64(0.);

// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
{ \
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
}
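// The 256-byte stride matches the A64FX cache-line size: three DC ZVA
// instructions zero the full 768-byte result spinor, allocating the lines
// in cache without first reading them from memory, which is why this acts
// as the "store prefetch" for the result.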
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
{ \
    svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \
    svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \
    svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
    result_00 = svadd_x(pg1, result_00, Chimu_00); \
    result_01 = svadd_x(pg1, result_01, Chimu_01); \
    result_02 = svadd_x(pg1, result_02, Chimu_02); \
    result_10 = svadd_x(pg1, result_10, Chimu_10); \
    result_11 = svadd_x(pg1, result_11, Chimu_11); \
    result_12 = svadd_x(pg1, result_12, Chimu_12); \
    result_20 = svadd_x(pg1, result_20, Chimu_20); \
    result_21 = svadd_x(pg1, result_21, Chimu_21); \
    result_22 = svadd_x(pg1, result_22, Chimu_22); \
    result_30 = svadd_x(pg1, result_30, Chimu_30); \
    result_31 = svadd_x(pg1, result_31, Chimu_31); \
    result_32 = svadd_x(pg1, result_32, Chimu_32);
