Grid 0.7.0
Grid_a64fx-2.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: Grid_a64fx-2.h
6
7 Copyright (C) 2020
8
9 Author: Nils Meyer <nils.meyer@ur.de>
10
11 with support from Arm
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27 See the full license in the file "LICENSE" in the top level distribution directory
28 *************************************************************************************/
29 /* END LEGAL */
30
32// Using SVE ACLE
34
35static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes");
36
38NAMESPACE_BEGIN(Optimization);
39
40 // type traits giving the number of elements for each vector type
41 template <typename T> struct W;
42 template <> struct W<double> {
43 constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
44 constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
45 };
46 template <> struct W<float> {
47 constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
48 constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
49 };
50 template <> struct W<Integer> {
51 constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
52 };
53 template <> struct W<uint16_t> {
54 constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
55 constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
56 };
57 template <> struct W<uint64_t> {
58 constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
59 constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
60 };
61
// Storage types for one SIMD vector. Two variants are selected by ARMCLANGCOMPAT:
//  - defined:   vec_imm is a plain struct holding the aligned array, and vec adds
//               user-provided copy operations that move the data with svld1/svst1
//               under an all-true byte predicate.
//  - undefined: vec_imm is a macro alias for vec.
62 #ifdef ARMCLANGCOMPAT
63 // SIMD vector immediate types
64 template <typename T>
65 struct vec_imm {
66 alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
67 };
68
69 // SIMD vector types
70 template <typename T>
71 struct vec {
72 alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
73 vec() = default;
// copy construction is delegated to the assignment operator below
74 vec(const vec &rhs) { this->operator=(rhs); }
// conversion from the immediate type: copies rhs.v into this object
75 vec(const vec_imm<T> &rhs) {
76 // v = rhs.v
77 svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
78 }
79
80 inline vec &operator=(const vec &rhs) {
81 // v = rhs.v
82 svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
83 return *this;
84 };
85 };
86
87 #else // no ARMCLANGCOMPAT
88 #define vec_imm vec
89 // SIMD vector types
90 template <typename T>
91 struct vec {
// NOTE(review): the member declaration line of this struct is absent from this listing.
93 };
94 #endif
95
98 typedef vec<uint16_t> vech; // half precision comms
100
101NAMESPACE_END(Optimization)
103
104// low-level API
106NAMESPACE_BEGIN(Optimization);
107
// Per-element-type bundle of ACLE vector types, predicates and helper tables.
// The primary template is empty; the members are supplied by explicit specializations.
108template <typename T>
109struct acle{};
110
111template <>
112struct acle<double>{
113 typedef svfloat64_t vt;
114 typedef svfloat64x2_t vt2;
115 typedef svfloat64x4_t vt4;
116 typedef float64_t pt;
117 typedef uint64_t uint;
118 typedef svuint64_t svuint;
119
120 static inline svbool_t pg1(){return svptrue_b64();}
121 static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
122 static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
123 static inline vec<uint64_t> tbl_swap(){
124 //const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
125 const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
126 return t;
127 }
128 static inline vec<uint64_t> tbl0(){
129 //const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
130 const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
131 return t;
132 }
133 static inline vec<uint64_t> tbl1(){
134 //const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
135 const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
136 return t;
137 }
138 static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
139 //const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
140 const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
141 return t;
142 }
143 static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
144 //const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
145 const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
146 return t;
147 }
148 static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
149 //const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
150 const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
151 return t;
152 }
153 static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
154 static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());}
155 static inline svfloat64_t zero(){return svdup_f64(0.);}
156};
157
158template <>
159struct acle<float>{
160 typedef svfloat32_t vt;
161 typedef svfloat32x2_t vt2;
162 typedef float32_t pt;
163 typedef uint32_t uint;
164 typedef svuint32_t svuint;
165
166 static inline svbool_t pg1(){return svptrue_b32();}
167 static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
168 // exchange neighboring elements
169 static inline vec<uint32_t> tbl_swap(){
170 //const vec<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
171 const vec_imm<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
172 return t;
173 }
174 static inline vec<uint32_t> tbl0(){
175 //const vec<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
176 const vec_imm<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
177 return t;
178 }
179 static inline vec<uint32_t> tbl1(){
180 //const vec<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
181 const vec_imm<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
182 return t;
183 }
184 static inline vec<uint32_t> tbl2(){
185 //const vec<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
186 const vec_imm<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
187 return t;
188 }
189 static inline vec<uint32_t> tbl_exch1a(){ // Exchange1
190 //const vec<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
191 const vec_imm<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
192 return t;
193 }
194 static inline vec<uint32_t> tbl_exch1b(){ // Exchange1
195 //const vec<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
196 const vec_imm<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
197 return t;
198 }
199 static inline vec<uint32_t> tbl_exch1c(){ // Exchange1
200 //const vec<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
201 const vec_imm<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
202 return t;
203 }
204 static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
205 static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
206 static inline svfloat32_t zero(){return svdup_f32(0.);}
207};
208
209template <>
210struct acle<uint16_t>{
211 typedef svfloat16_t vt;
212 typedef float16_t pt;
213 typedef uint16_t uint;
214 typedef svuint16_t svuint;
215
216 static inline svbool_t pg1(){return svptrue_b16();}
217 static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);}
218 static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());}
219 static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());}
220 static inline svfloat16_t zero(){return svdup_f16(0.);}
221};
222
223template <>
224struct acle<Integer>{
225 typedef svuint32_t vt;
226 typedef svuint32x2_t vt2;
227 typedef Integer pt;
228 typedef uint32_t uint;
229 typedef svuint32_t svuint;
230
231 //static inline svbool_t pg1(){return svptrue_b16();}
232 static inline svbool_t pg1(){return svptrue_b32();}
233 static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
234 static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
235 static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
236};
237
238// ---------------------------------------------------
239
// Broadcast functors: build a vector from scalars.
// The one-argument overloads put the value into every lane (svdup); the
// two-argument overloads interleave two broadcast vectors with svzip1, giving
// the lane pattern a, b, a, b, ...
240struct Vsplat{
241 // Complex float
242 inline vecf operator()(float a, float b){
243 vecf out;
244 svbool_t pg1 = acle<float>::pg1();
245 typename acle<float>::vt a_v = svdup_f32(a);
246 typename acle<float>::vt b_v = svdup_f32(b);
247 typename acle<float>::vt r_v = svzip1(a_v, b_v);
248 svst1(pg1, out.v, r_v);
249 return out;
250 }
251
252 // Real float
253 inline vecf operator()(float a){
254 vecf out;
255 svbool_t pg1 = acle<float>::pg1();
256 typename acle<float>::vt r_v = svdup_f32(a);
257 svst1(pg1, out.v, r_v);
258 return out;
259 }
260
261 // Complex double
262 inline vecd operator()(double a, double b){
263 vecd out;
264 svbool_t pg1 = acle<double>::pg1();
265 typename acle<double>::vt a_v = svdup_f64(a);
266 typename acle<double>::vt b_v = svdup_f64(b);
267 typename acle<double>::vt r_v = svzip1(a_v, b_v);
268 svst1(pg1, out.v, r_v);
269 return out;
270 }
271
272 // Real double
273 inline vecd operator()(double a){
274 vecd out;
275 svbool_t pg1 = acle<double>::pg1();
276 typename acle<double>::vt r_v = svdup_f64(a);
277 svst1(pg1, out.v, r_v);
278 return out;
279 }
280
281 // Integer
// NOTE(review): the signature line of this overload is absent from this listing.
283 vec<Integer> out;
284 svbool_t pg1 = acle<Integer>::pg1();
285 // Add check whether Integer is really a uint32_t???
// svdup_u32 is the 32-bit unsigned broadcast, hence the question above
286 typename acle<Integer>::vt r_v = svdup_u32(a);
287 svst1(pg1, out.v, r_v);
288 return out;
289 }
290};
291
292struct Vstore{
293 // Real
294 template <typename T>
295 inline void operator()(vec<T> a, T *D){
296 svbool_t pg1 = acle<T>::pg1();
297 typename acle<T>::vt a_v = svld1(pg1, (typename acle<T>::pt*)&a.v);
298 svst1(pg1, D, a_v);
299 }
300};
301
302struct Vstream{
303 // Real
304 template <typename T>
305 inline void operator()(T * a, vec<T> b){
306 svbool_t pg1 = acle<T>::pg1();
307 typename acle<T>::vt b_v = svld1(pg1, b.v);
308 svstnt1(pg1, a, b_v);
309 //svst1(pg1, a, b_v);
310 }
311};
312
313 struct Vset{
314 // Complex
315 template <typename T>
316 inline vec<T> operator()(std::complex<T> *a){
317 vec<T> out;
318 svbool_t pg1 = acle<T>::pg1();
319 typename acle<T>::vt a_v = svld1(pg1, (T*)a);
320 svst1(pg1, out.v, a_v);
321
322 return out;
323 }
324
325 // Real
326 template <typename T>
327 inline vec<T> operator()(T *a){
328 vec<T> out;
329 svbool_t pg1 = acle<T>::pg1();
330 typename acle<T>::vt a_v = svld1(pg1, a);
331 svst1(pg1, out.v, a_v);
332
333 return out;
334 }
335 };
336
338// Arithmetic operations
340
// Lane-wise addition of a and b (svadd_x under the all-lanes predicate).
341struct Sum{
342 template <typename T>
// NOTE(review): the operator() signature line is absent from this listing.
344 vec<T> out;
345 svbool_t pg1 = acle<T>::pg1();
346 typename acle<T>::vt a_v = svld1(pg1, a.v);
347 typename acle<T>::vt b_v = svld1(pg1, b.v);
348 typename acle<T>::vt r_v = svadd_x(pg1, a_v, b_v);
349 svst1(pg1, out.v, r_v);
350
351 return out;
352 }
353};
354
// Lane-wise subtraction a - b (svsub_x under the all-lanes predicate).
355struct Sub{
356 template <typename T>
// NOTE(review): the operator() signature line is absent from this listing.
358 vec<T> out;
359 svbool_t pg1 = acle<T>::pg1();
360 typename acle<T>::vt a_v = svld1(pg1, a.v);
361 typename acle<T>::vt b_v = svld1(pg1, b.v);
362 typename acle<T>::vt r_v = svsub_x(pg1, a_v, b_v);
363 svst1(pg1, out.v, r_v);
364
365 return out;
366 }
367};
368
// Lane-wise multiplication. The first overload uses svmla_x with c as the
// accumulator (c + a*b); the second is a plain product (svmul_x).
369struct Mult{
370 template <typename T>
// NOTE(review): the operator() signature line (a, b, c) is absent from this listing.
372 vec<T> out;
373 svbool_t pg1 = acle<T>::pg1();
374 typename acle<T>::vt a_v = svld1(pg1, a.v);
375 typename acle<T>::vt b_v = svld1(pg1, b.v);
376 typename acle<T>::vt c_v = svld1(pg1, c.v);
377 typename acle<T>::vt r_v = svmla_x(pg1, c_v, a_v, b_v);
378 svst1(pg1, out.v, r_v);
379
380 return out;
381 }
382 template <typename T>
// NOTE(review): the operator() signature line (a, b) is absent from this listing.
384 vec<T> out;
385 svbool_t pg1 = acle<T>::pg1();
386 typename acle<T>::vt a_v = svld1(pg1, a.v);
387 typename acle<T>::vt b_v = svld1(pg1, b.v);
388 typename acle<T>::vt r_v = svmul_x(pg1, a_v, b_v);
389 svst1(pg1, out.v, r_v);
390
391 return out;
392 }
393};
394
// NOTE(review): the struct header and the operator() signature line are absent
// from this listing (presumably MultRealPart, going by the alias list at the
// end of the file — confirm against the original header).
// Applies svcmla_x with rotation 0 to a and b, accumulating onto a zero vector.
396 template <typename T>
398 vec<T> out;
399 svbool_t pg1 = acle<T>::pg1();
400 typename acle<T>::vt a_v = svld1(pg1, a.v);
401 typename acle<T>::vt b_v = svld1(pg1, b.v);
402
403 // using FCMLA
404 typename acle<T>::vt z_v = acle<T>::zero();
405 typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
406
407 svst1(pg1, out.v, r_v);
408
409 return out;
410 }
411};
412
// NOTE(review): the struct header and the operator() signature line are absent
// from this listing (presumably MaddRealPart — confirm against the original header).
// Same rotation-0 svcmla_x as the preceding functor, but accumulating onto c
// instead of zero.
414 template <typename T>
416 vec<T> out;
417 svbool_t pg1 = acle<T>::pg1();
418 typename acle<T>::vt a_v = svld1(pg1, a.v);
419 typename acle<T>::vt b_v = svld1(pg1, b.v);
420 typename acle<T>::vt c_v = svld1(pg1, c.v);
421
422 // using FCMLA
423 typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
424
425 svst1(pg1, out.v, r_v);
426
427 return out;
428 }
429};
430
// NOTE(review): the struct header and the operator() signature line are absent
// from this listing (presumably MultComplex — confirm against the original header).
432 // Complex a*b
433 template <typename T>
435 vec<T> out;
436 svbool_t pg1 = acle<T>::pg1();
437 typename acle<T>::vt a_v = svld1(pg1, a.v);
438 typename acle<T>::vt b_v = svld1(pg1, b.v);
439 typename acle<T>::vt z_v = acle<T>::zero();
440
441 // using FCMLA
// two accumulation steps: rotation 0 onto zero, then rotation 90 onto that result
442 typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
443 r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
444
445 svst1(pg1, out.v, r_v);
446
447 return out;
448 }
449};
450
// NOTE(review): the struct header and the operator() signature line are absent
// from this listing (presumably MultAddComplex — confirm against the original header).
452 // Complex a*b+c
453 template <typename T>
455 vec<T> out;
456 svbool_t pg1 = acle<T>::pg1();
457 typename acle<T>::vt a_v = svld1(pg1, a.v);
458 typename acle<T>::vt b_v = svld1(pg1, b.v);
459 typename acle<T>::vt c_v = svld1(pg1, c.v);;
460
461 // using FCMLA
// two accumulation steps: rotation 0 onto c, then rotation 90 onto that result
462 typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
463 r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
464 svst1(pg1, out.v, r_v);
465
466 return out;
467 }
468};
469
// Lane-wise division a / b (svdiv_x under the all-lanes predicate).
470struct Div{
471 // Real
472 template <typename T>
// NOTE(review): the operator() signature line is absent from this listing.
474 vec<T> out;
475 svbool_t pg1 = acle<T>::pg1();
476 typename acle<T>::vt a_v = svld1(pg1, a.v);
477 typename acle<T>::vt b_v = svld1(pg1, b.v);
478 typename acle<T>::vt r_v = svdiv_x(pg1, a_v, b_v);
479 svst1(pg1, out.v, r_v);
480
481 return out;
482 }
483};
484
// Negates the odd-numbered lanes of a and leaves the even-numbered lanes unchanged.
485struct Conj{
486 // Complex
487 template <typename T>
// NOTE(review): the operator() signature line is absent from this listing.
489 vec<T> out;
490 svbool_t pg1 = acle<T>::pg1();
491 svbool_t pg_odd = acle<T>::pg_odd();
492 typename acle<T>::vt a_v = svld1(pg1, a.v);
493 //typename acle<T>::vt r_v = svneg_x(pg_odd, a_v);
// merging form: lanes not selected by pg_odd are taken from the first argument (a_v)
494 typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
495 svst1(pg1, out.v, r_v);
496
497 return out;
498 }
499};
500
// NOTE(review): the struct header, the operator() signature line and the
// declaration of tbl_swap are absent from this listing (presumably TimesMinusI —
// confirm against the original header).
// Permutes a with the tbl_swap index table, then negates the odd-numbered lanes.
502 // Complex
503 template <typename T>
505 vec<T> out;
507 svbool_t pg1 = acle<T>::pg1();
508 svbool_t pg_odd = acle<T>::pg_odd();
509
510 typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
511 typename acle<T>::vt a_v = svld1(pg1, a.v);
512 a_v = svtbl(a_v, tbl_swap_v);
// merging form: lanes not selected by pg_odd keep the permuted value
513 typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
514 svst1(pg1, out.v, r_v);
515
516 return out;
517 }
518};
519
// Permutes a with the tbl_swap index table, then negates the even-numbered lanes.
520struct TimesI{
521 // Complex
522 template <typename T>
// NOTE(review): the operator() signature line and the declaration of tbl_swap
// are absent from this listing.
524 vec<T> out;
526 svbool_t pg1 = acle<T>::pg1();
527 svbool_t pg_even = acle<T>::pg_even();
528
529 typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
530 typename acle<T>::vt a_v = svld1(pg1, a.v);
531 a_v = svtbl(a_v, tbl_swap_v);
532 //typename acle<T>::vt r_v = svneg_x(pg_even, a_v);
// merging form: lanes not selected by pg_even keep the permuted value
533 typename acle<T>::vt r_v = svneg_m(a_v, pg_even, a_v);
534 svst1(pg1, out.v, r_v);
535
536 return out;
537 }
538};
539
541 static inline vech StoH (const vecf &sa,const vecf &sb) {
542 vech ret;
543 svbool_t pg1s = acle<float>::pg1();
544 svbool_t pg1h = acle<uint16_t>::pg1();
545 typename acle<float>::vt sa_v = svld1(pg1s, sa.v);
546 typename acle<float>::vt sb_v = svld1(pg1s, sb.v);
547 typename acle<uint16_t>::vt ha_v = svcvt_f16_x(pg1s, sa_v);
548 typename acle<uint16_t>::vt hb_v = svcvt_f16_x(pg1s, sb_v);
549 typename acle<uint16_t>::vt r_v = svuzp1(ha_v, hb_v);
550 svst1(pg1h, (typename acle<uint16_t>::pt*)&ret.v, r_v);
551
552 return ret;
553 }
554 static inline void HtoS(vech h,vecf &sa,vecf &sb) {
555 svbool_t pg1h = acle<uint16_t>::pg1();
556 svbool_t pg1s = acle<float>::pg1();
557 typename acle<uint16_t>::vt h_v = svld1(pg1h, (typename acle<uint16_t>::pt*)&h.v);
558 typename acle<uint16_t>::vt ha_v = svzip1(h_v, h_v);
559 typename acle<uint16_t>::vt hb_v = svzip2(h_v, h_v);
560 typename acle<float>::vt sa_v = svcvt_f32_x(pg1s, ha_v);
561 typename acle<float>::vt sb_v = svcvt_f32_x(pg1s, hb_v);
562 svst1(pg1s, sa.v, sa_v);
563 svst1(pg1s, sb.v, sb_v);
564 }
565 static inline vecf DtoS (vecd a,vecd b) {
566 vecf ret;
567 svbool_t pg1d = acle<double>::pg1();
568 svbool_t pg1s = acle<float>::pg1();
569 typename acle<double>::vt a_v = svld1(pg1d, a.v);
570 typename acle<double>::vt b_v = svld1(pg1d, b.v);
571 typename acle<float>::vt sa_v = svcvt_f32_x(pg1d, a_v);
572 typename acle<float>::vt sb_v = svcvt_f32_x(pg1d, b_v);
573 typename acle<float>::vt r_v = svuzp1(sa_v, sb_v);
574 svst1(pg1s, ret.v, r_v);
575
576 return ret;
577 }
578 static inline void StoD (vecf s,vecd &a,vecd &b) {
579 svbool_t pg1s = acle<float>::pg1();
580 svbool_t pg1d = acle<double>::pg1();
581 typename acle<float>::vt s_v = svld1(pg1s, s.v);
582 typename acle<float>::vt sa_v = svzip1(s_v, s_v);
583 typename acle<float>::vt sb_v = svzip2(s_v, s_v);
584 typename acle<double>::vt a_v = svcvt_f64_x(pg1d, sa_v);
585 typename acle<double>::vt b_v = svcvt_f64_x(pg1d, sb_v);
586 svst1(pg1d, a.v, a_v);
587 svst1(pg1d, b.v, b_v);
588 }
589 static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
590 vech ret;
591 svbool_t pg1d = acle<double>::pg1();
592 svbool_t pg1h = acle<uint16_t>::pg1();
593 typename acle<double>::vt a_v = svld1(pg1d, a.v);
594 typename acle<double>::vt b_v = svld1(pg1d, b.v);
595 typename acle<double>::vt c_v = svld1(pg1d, c.v);
596 typename acle<double>::vt d_v = svld1(pg1d, d.v);
597 typename acle<uint16_t>::vt ha_v = svcvt_f16_x(pg1d, a_v);
598 typename acle<uint16_t>::vt hb_v = svcvt_f16_x(pg1d, b_v);
599 typename acle<uint16_t>::vt hc_v = svcvt_f16_x(pg1d, c_v);
600 typename acle<uint16_t>::vt hd_v = svcvt_f16_x(pg1d, d_v);
601 typename acle<uint16_t>::vt hab_v = svuzp1(ha_v, hb_v);
602 typename acle<uint16_t>::vt hcd_v = svuzp1(hc_v, hd_v);
603 typename acle<uint16_t>::vt r_v = svuzp1(hab_v, hcd_v);
604 svst1(pg1h, (typename acle<uint16_t>::pt*)&ret.v, r_v);
605
606 return ret;
607/*
608 vecf sa,sb;
609 sa = DtoS(a,b);
610 sb = DtoS(c,d);
611 return StoH(sa,sb);
612*/
613 }
614 static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
615 svbool_t pg1h = acle<uint16_t>::pg1();
616 svbool_t pg1d = acle<double>::pg1();
617 typename acle<uint16_t>::vt h_v = svld1(pg1h, (typename acle<uint16_t>::pt*)&h.v);
618 typename acle<uint16_t>::vt sa_v = svzip1(h_v, h_v);
619 typename acle<uint16_t>::vt sb_v = svzip2(h_v, h_v);
620 typename acle<uint16_t>::vt da_v = svzip1(sa_v, sa_v);
621 typename acle<uint16_t>::vt db_v = svzip2(sa_v, sa_v);
622 typename acle<uint16_t>::vt dc_v = svzip1(sb_v, sb_v);
623 typename acle<uint16_t>::vt dd_v = svzip2(sb_v, sb_v);
624 typename acle<double>::vt a_v = svcvt_f64_x(pg1d, da_v);
625 typename acle<double>::vt b_v = svcvt_f64_x(pg1d, db_v);
626 typename acle<double>::vt c_v = svcvt_f64_x(pg1d, dc_v);
627 typename acle<double>::vt d_v = svcvt_f64_x(pg1d, dd_v);
628 svst1(pg1d, a.v, a_v);
629 svst1(pg1d, b.v, b_v);
630 svst1(pg1d, c.v, c_v);
631 svst1(pg1d, d.v, d_v);
632/*
633 vecf sa,sb;
634 HtoS(h,sa,sb);
635 StoD(sa,a,b);
636 StoD(sb,c,d);
637*/
638 }
639};
640
// Exchange functors: each takes two input vectors and writes two output
// vectors built from rearranged parts of both inputs.
641struct Exchange{
642
643 // Exchange0 is valid for arbitrary SVE vector length
// Built from four svext calls with shift W<T>::c. Presumably out1 collects the
// lower halves of in1 and in2 and out2 the upper halves — TODO confirm.
644 template <typename T>
645 static inline void Exchange0(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
646 svbool_t pg1 = acle<T>::pg1();
647 typename acle<T>::vt a1_v = svld1(pg1, in1.v);
648 typename acle<T>::vt a2_v = svld1(pg1, in2.v);
649 typename acle<T>::vt r1_v = svext(a1_v, a1_v, (uint64_t)W<T>::c);
650 r1_v = svext(r1_v, a2_v, (uint64_t)W<T>::c);
651 typename acle<T>::vt r2_v = svext(a2_v, a2_v, (uint64_t)W<T>::c);
652 r2_v = svext(a1_v, r2_v, (uint64_t)W<T>::c);
653 svst1(pg1, out1.v, r1_v);
654 svst1(pg1, out2.v, r2_v);
655 }
656
657 template <typename T>
658 static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
659 // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
660 // alternative: use 4-el structure; expect translation into ldp + stp -> SFI
661 svbool_t pg1 = acle<T>::pg1();
// NOTE(review): the declarations of tbl_exch1a, tbl_exch1b and tbl_exch1c are
// absent from this listing.
665
666 typename acle<T>::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v);
667 typename acle<T>::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v);
668 typename acle<T>::svuint tbl_exch1c_v = svld1(pg1, tbl_exch1c.v);
669
670 typename acle<T>::vt in1_v = svld1(pg1, in1.v);
671 typename acle<T>::vt in2_v = svld1(pg1, in2.v);
672
// sequence: table permute of each input, svext by half a vector to mix the
// two, then a final table permute of each result
673 typename acle<T>::vt a1_v = svtbl(in1_v, tbl_exch1a_v);
674 typename acle<T>::vt a2_v = svtbl(in2_v, tbl_exch1b_v);
675 typename acle<T>::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W<T>::r / 2u));
676 typename acle<T>::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W<T>::r / 2u));
677 typename acle<T>::vt out1_v = svtbl(b1_v, tbl_exch1c_v);
678 typename acle<T>::vt out2_v = svtbl(b2_v, tbl_exch1a_v);
679
680 svst1(pg1, out1.v, out1_v);
681 svst1(pg1, out2.v, out2_v);
682 }
683
// Exchange2 always operates on 64-bit lanes: the data of any T is accessed
// through the acle<double> pointer type and combined with svtrn1 / svtrn2.
684 template <typename T>
685 static inline void Exchange2(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
686 svbool_t pg1 = acle<double>::pg1();
687 typename acle<double>::vt a1_v = svld1(pg1, (typename acle<double>::pt*)in1.v);
688 typename acle<double>::vt a2_v = svld1(pg1, (typename acle<double>::pt*)in2.v);
689 typename acle<double>::vt r1_v = svtrn1(a1_v, a2_v);
690 typename acle<double>::vt r2_v = svtrn2(a1_v, a2_v);
691 svst1(pg1, (typename acle<double>::pt*)out1.v, r1_v);
692 svst1(pg1, (typename acle<double>::pt*)out2.v, r2_v);
693 }
694
// Exchange3 for single precision: svtrn1 / svtrn2 on 32-bit lanes.
695 static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){
696 svbool_t pg1 = acle<float>::pg1();
697 typename acle<float>::vt a1_v = svld1(pg1, in1.v);
698 typename acle<float>::vt a2_v = svld1(pg1, in2.v);
699 typename acle<float>::vt r1_v = svtrn1(a1_v, a2_v);
700 typename acle<float>::vt r2_v = svtrn2(a1_v, a2_v);
701 svst1(pg1, out1.v, r1_v);
702 svst1(pg1, out2.v, r2_v);
703 }
704
// Exchange3 for double precision is not implemented: it asserts and leaves
// the outputs untouched.
705 static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){
706 assert(0);
707 return;
708 }
709};
710
// Permutation functors. Permute0 rotates the vector by half its length with
// svext; the other variants apply svtbl with an index table named tbl_swap.
// NOTE(review): in every table-based variant the line declaring tbl_swap is
// absent from this listing, so which index table each variant uses cannot be
// told from the text here.
711struct Permute{
712
713 // Permute0 is valid for any SVE vector width
714 template <typename T>
715 static inline vec<T> Permute0(vec<T> in) {
716 vec<T> out;
717 svbool_t pg1 = acle<T>::pg1();
718 typename acle<T>::vt a_v = svld1(pg1, in.v);
// svext of the vector with itself, shifted by half the lane count
719 typename acle<T>::vt r_v = svext(a_v, a_v, (uint64_t)(W<T>::r / 2u));
720 svst1(pg1, out.v, r_v);
721
722 return out;
723 }
724
725 static inline vecd Permute1(vecd in) {
726 vecd out;
728 svbool_t pg1 = acle<double>::pg1();
729 typename acle<double>::vt a_v = svld1(pg1, in.v);
730 typename acle<double>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
731 typename acle<double>::vt r_v = svtbl(a_v, tbl_swap_v);
732 svst1(pg1, out.v, r_v);
733
734 return out;
735 }
736
737 static inline vecf Permute1(vecf in) {
738 vecf out;
740 svbool_t pg1 = acle<float>::pg1();
741 typename acle<float>::vt a_v = svld1(pg1, in.v);
742 typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
743 typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
744 svst1(pg1, out.v, r_v);
745
746 return out;
747 }
748
749 static inline vecd Permute2(vecd in) {
750 vecd out;
752 svbool_t pg1 = acle<double>::pg1();
753 typename acle<double>::vt a_v = svld1(pg1, in.v);
754 typename acle<double>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
755 typename acle<double>::vt r_v = svtbl(a_v, tbl_swap_v);
756 svst1(pg1, out.v, r_v);
757
758 return out;
759 }
760
761 static inline vecf Permute2(vecf in) {
762 vecf out;
764 svbool_t pg1 = acle<float>::pg1();
765 typename acle<float>::vt a_v = svld1(pg1, in.v);
766 typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
767 typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
768 svst1(pg1, out.v, r_v);
769
770 return out;
771 }
772
773 static inline vecf Permute3(vecf in) {
774 vecf out;
776 svbool_t pg1 = acle<float>::pg1();
777 typename acle<float>::vt a_v = svld1(pg1, in.v);
778 typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
779 typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
780 svst1(pg1, out.v, r_v);
781
782 return out;
783 }
784
// Permute3 for double precision returns its input unchanged.
785 static inline vecd Permute3(vecd in) {
786 return in;
787 }
788
789};
790
791struct Rotate{
792
793 template <int n, typename T> static inline vec<T> tRotate(vec<T> in){
794 vec<T> out;
795 svbool_t pg1 = acle<T>::pg1();
796 typename acle<T>::vt a_v = svld1(pg1, in.v);
797 typename acle<T>::vt r_v = svext(a_v, a_v, (uint64_t)(n%W<T>::r));
798 svst1(pg1, out.v, r_v);
799
800 return out;
801 }
802
803 template <typename T>
804 static inline vec<T> rotate(vec<T> in, int n){
805
806 switch(n){
807 case 0: return tRotate<0, T>(in); break;
808 case 1: return tRotate<1, T>(in); break;
809 case 2: return tRotate<2, T>(in); break;
810 case 3: return tRotate<3, T>(in); break;
811 case 4: return tRotate<4, T>(in); break;
812 case 5: return tRotate<5, T>(in); break;
813 case 6: return tRotate<6, T>(in); break;
814 case 7: return tRotate<7, T>(in); break;
815
816 case 8: return tRotate<8, T>(in); break;
817 case 9: return tRotate<9, T>(in); break;
818 case 10: return tRotate<10, T>(in); break;
819 case 11: return tRotate<11, T>(in); break;
820 case 12: return tRotate<12, T>(in); break;
821 case 13: return tRotate<13, T>(in); break;
822 case 14: return tRotate<14, T>(in); break;
823 case 15: return tRotate<15, T>(in); break;
824 default: assert(0);
825 }
826 }
827};
828
// tree-based reduction
// The macro expands to a plain expression (no trailing semicolon), so it can
// be used as an initializer or inside a larger expression.
#define svred(pg, v)\
svaddv(pg, v)
832
833// left-to-right reduction
834// #define svred(pg, v)\
835// svadda(pg, 0, v)
836
// Generic reduction functor.
// Only the explicit specializations are meant to be used: calling this primary
// template prints an error message and terminates the program with status 1.
// A templated class is used so that the output type can be selected as well.
template <typename Out_type, typename In_type>
struct Reduce{
  inline Out_type operator()(In_type in){
    printf("Error, using wrong Reduce function\n");
    exit(1);
    return 0;  // not reached
  }
};
847
848//Complex float Reduce
// Sums the even-numbered lanes into a and the odd-numbered lanes into b, and
// returns them as Grid::ComplexF(a, b).
849template <>
// NOTE(review): the signature line of this specialization is absent from this listing.
851 svbool_t pg1 = acle<float>::pg1();
852 svbool_t pg_even = acle<float>::pg_even();
853 svbool_t pg_odd = acle<float>::pg_odd();
854 typename acle<float>::vt a_v = svld1(pg1, in.v);
855 float a = svred(pg_even, a_v);
856 float b = svred(pg_odd, a_v);
857
858 return Grid::ComplexF(a, b);
859
860}
861
862//Real float Reduce
// Sums all lanes of the single-precision vector into one float.
863template <>
// NOTE(review): the signature line of this specialization is absent from this listing.
865 svbool_t pg1 = acle<float>::pg1();
866 typename acle<float>::vt a_v = svld1(pg1, in.v);
867 float a = svred(pg1, a_v);
868
869 return a;
870}
871
872//Complex double Reduce
// Sums the even-numbered lanes into a and the odd-numbered lanes into b, and
// returns them as Grid::ComplexD(a, b).
873template <>
// NOTE(review): the signature line of this specialization is absent from this listing.
875 svbool_t pg1 = acle<double>::pg1();
876 svbool_t pg_even = acle<double>::pg_even();
877 svbool_t pg_odd = acle<double>::pg_odd();
878 typename acle<double>::vt a_v = svld1(pg1, in.v);
879 double a = svred(pg_even, a_v);
880 double b = svred(pg_odd, a_v);
881
882 return Grid::ComplexD(a, b);
883}
884
885//Real double Reduce
// Sums all lanes of the double-precision vector into one double.
886template <>
// NOTE(review): the signature line of this specialization is absent from this listing.
888 svbool_t pg1 = acle<double>::pg1();
889 typename acle<double>::vt a_v = svld1(pg1, in.v);
890 double a = svred(pg1, a_v);
891
892 return a;
893}
894
895//Integer Reduce
// Sums all lanes of the Integer vector; the result is stored in an Integer.
896template <>
// NOTE(review): the signature line of this specialization is absent from this listing.
898 svbool_t pg1 = acle<Integer>::pg1();
899 typename acle<Integer>::vt a_v = svld1(pg1, in.v);
900 Integer a = svred(pg1, a_v);
901
902 return a;
903}
904
905#undef svred
906#undef vec_imm
907
908NAMESPACE_END(Optimization)
909
910
911// Here assign types
912
913typedef Optimization::vech SIMD_Htype; // Reduced precision type
914typedef Optimization::vecf SIMD_Ftype; // Single precision type
915typedef Optimization::vecd SIMD_Dtype; // Double precision type
916typedef Optimization::veci SIMD_Itype; // Integer type
917
// prefetch utilities
// Both are empty stubs in this implementation: the arguments are ignored.
inline void v_prefetch0(int size, const char *ptr)
{
}

inline void prefetch_HINT_T0(const char *ptr)
{
}
921
922// Function name aliases
923typedef Optimization::Vsplat VsplatSIMD;
924typedef Optimization::Vstore VstoreSIMD;
925typedef Optimization::Vset VsetSIMD;
926typedef Optimization::Vstream VstreamSIMD;
927template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
928
929// Arithmetic operations
930typedef Optimization::Sum SumSIMD;
931typedef Optimization::Sub SubSIMD;
932typedef Optimization::Div DivSIMD;
933typedef Optimization::Mult MultSIMD;
934typedef Optimization::MultComplex MultComplexSIMD;
935typedef Optimization::MultAddComplex MultAddComplexSIMD;
936typedef Optimization::MultRealPart MultRealPartSIMD;
937typedef Optimization::MaddRealPart MaddRealPartSIMD;
938typedef Optimization::Conj ConjSIMD;
939typedef Optimization::TimesMinusI TimesMinusISIMD;
940typedef Optimization::TimesI TimesISIMD;
941
#define GEN_SIMD_WIDTH
Definition Config.h:56
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
#define vec_imm
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
void prefetch_HINT_T0(const char *ptr)
Optimization::Reduce< S, T > ReduceSIMD
vec< double > vecd
#define svred(pg, v)
vec< Integer > veci
void v_prefetch0(int size, const char *ptr)
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
vec< float > vecf
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
vec< uint16_t > vech
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
vec< T > operator()(vec< T > a)
vec< T > operator()(vec< T > a, vec< T > b)
static void Exchange1(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange0(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2)
static void Exchange2(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b)
static vec< T > Permute0(vec< T > in)
static vecd Permute3(vecd in)
static vecd Permute1(vecd in)
static vecf Permute3(vecf in)
static vecd Permute2(vecd in)
static vecf Permute1(vecf in)
static vecf Permute2(vecf in)
static vech StoH(const vecf &sa, const vecf &sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static void HtoD(vech h, vecd &a, vecd &b, vecd &c, vecd &d)
static vech DtoH(vecd a, vecd b, vecd c, vecd d)
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static vec< T > rotate(vec< T > in, int n)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a)
vec< T > operator()(vec< T > a)
vec< T > operator()(T *a)
vec< T > operator()(std::complex< T > *a)
vecf operator()(float a, float b)
vecd operator()(double a)
vecf operator()(float a)
vecd operator()(double a, double b)
vec< Integer > operator()(Integer a)
void operator()(vec< T > a, T *D)
void operator()(T *a, vec< T > b)
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int r
static constexpr unsigned int c
svuint32x2_t vt2
svuint32_t svuint
static svbool_t pg1()
static svbool_t pg_even()
static svbool_t pg2()
static svbool_t pg_odd()
static vec< uint64_t > tbl_exch1a()
static vec< uint64_t > tbl1()
svfloat64x4_t vt4
static vec< uint64_t > tbl_exch1b()
svfloat64x2_t vt2
static svfloat64_t zero()
static svbool_t pg4()
svfloat64_t vt
svuint64_t svuint
static svbool_t pg2()
static svbool_t pg_even()
static vec< uint64_t > tbl_swap()
static vec< uint64_t > tbl0()
static svbool_t pg_odd()
static vec< uint64_t > tbl_exch1c()
static svbool_t pg1()
svuint32_t svuint
static vec< uint32_t > tbl2()
static vec< uint32_t > tbl_exch1b()
static vec< uint32_t > tbl_exch1a()
static vec< uint32_t > tbl1()
svfloat32_t vt
svfloat32x2_t vt2
static svbool_t pg2()
static vec< uint32_t > tbl_exch1c()
static svbool_t pg_even()
static vec< uint32_t > tbl_swap()
static svfloat32_t zero()
static svbool_t pg_odd()
static vec< uint32_t > tbl0()
static svbool_t pg1()
static svbool_t pg_even()
svuint16_t svuint
static svfloat16_t zero()
static svbool_t pg1()
static svbool_t pg_odd()
static svbool_t pg2()
T v[W< T >::r]