Grid 0.7.0
Grid_a64fx-fixedsize.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: Grid_a64fx-fixedsize.h
6
7 Copyright (C) 2020
8
9 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
10
11 with support from Arm
12 Richard Sandiford <richard.sandiford@arm.com>
13
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 2 of the License, or
17 (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License along
25 with this program; if not, write to the Free Software Foundation, Inc.,
26 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27
28 See the full license in the file "LICENSE" in the top level distribution directory
29 *************************************************************************************/
30 /* END LEGAL */
31
33// Using SVE ACLE with fixed-size data types
35
36
37// gcc 10 features
38#if __ARM_FEATURE_SVE_BITS==512
39/* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6
40 workaround: use gcc's internal data types, bugfix expected for gcc 10.2
41typedef svbool_t pred __attribute__((arm_sve_vector_bits(512)));
42typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512)));
43typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512)));
44typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512)));
45typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512)));
46typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float
47typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double
48*/
49typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512)));
50typedef __SVFloat16_t vech __attribute__((arm_sve_vector_bits(512)));
51typedef __SVFloat32_t vecf __attribute__((arm_sve_vector_bits(512)));
52typedef __SVFloat64_t vecd __attribute__((arm_sve_vector_bits(512)));
53typedef __SVUint32_t veci __attribute__((arm_sve_vector_bits(512)));
54typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float
55typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double
56#else
57#pragma error("Oops. Illegal SVE vector size!?")
58#endif /* __ARM_FEATURE_SVE_BITS */
59
60// low-level API
62NAMESPACE_BEGIN(Optimization);
63
64// convenience union types for tables eliminating loads
65union ulutf {
66 lutf v;
67 uint32_t s[16];
68};
69union ulutd {
70 lutd v;
71 uint64_t s[8];
72};
73
// Per-scalar-type helper bundle (lookup tables, predicates, zero vector).
// The primary template is intentionally empty; only the explicit
// specialisations below provide members.
template <typename T>
struct acle {
};
76
77template <>
78struct acle<double>{
79 static inline lutd tbl_swap(){
80 const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} };
81 return t.v;
82 }
83 static inline lutd tbl0(){
84 const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} };
85 return t.v;
86 }
87 static inline lutd tbl1(){
88 const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} };
89 return t.v;
90 }
91 static inline lutd tbl_exch1a(){ // Exchange1
92 const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} };
93 return t.v;
94 }
95 static inline lutd tbl_exch1b(){ // Exchange1
96 const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} };
97 return t.v;
98 }
99 static inline lutd tbl_exch1c(){ // Exchange1
100 const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} };
101 return t.v;
102 }
103 static inline pred pg1(){return svptrue_b64();}
104 static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
105 static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());}
106 static inline vecd zero(){return svdup_f64(0.);}
107};
108
109template <>
110struct acle<float>{
111 // exchange neighboring elements
112 static inline lutf tbl_swap(){
113 const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} };
114 return t.v;
115 }
116 static inline lutf tbl0(){
117 const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} };
118 return t.v;
119 }
120 static inline lutf tbl1(){
121 const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} };
122 return t.v;
123 }
124 static inline lutf tbl2(){
125 const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} };
126 return t.v;
127 }
128 static inline lutf tbl_exch1a(){ // Exchange1
129 const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } };
130 return t.v;
131 }
132 static inline lutf tbl_exch1b(){ // Exchange1
133 const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } };
134 return t.v;
135 }
136 static inline lutf tbl_exch1c(){ // Exchange1
137 const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} };
138 return t.v;
139 }
140 static inline pred pg1(){return svptrue_b32();}
141 static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
142 static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
143 static inline vecf zero(){return svdup_f32(0.);}
144};
145
146template <>
147struct acle<uint16_t>{
148 static inline pred pg1(){return svptrue_b16();}
149 static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());}
150 static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());}
151 static inline vech zero(){return svdup_f16(0.);}
152};
153
154template <>
155struct acle<Integer>{
156 //static inline svbool_t pg1(){return svptrue_b16();}
157 static inline pred pg1(){return svptrue_b32();}
158 static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
159 static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
160};
161
162// ---------------------------------------------------
163
164struct Vsplat{
165 // Complex float
166 inline vecf operator()(float a, float b){
167 vecf a_v = svdup_f32(a);
168 vecf b_v = svdup_f32(b);
169 return svzip1(a_v, b_v);
170 }
171 // Real float
172 inline vecf operator()(float a){
173 return svdup_f32(a);
174 }
175 // Complex double
176 inline vecd operator()(double a, double b){
177 vecd a_v = svdup_f64(a);
178 vecd b_v = svdup_f64(b);
179 return svzip1(a_v, b_v);
180 }
181 // Real double
182 inline vecd operator()(double a){
183 return svdup_f64(a);
184 }
185 // Integer
187 return svdup_u32(a);
188 }
189};
190
191struct Vstore{
192 // Real float
193 inline void operator()(vecf a, float *D){
194 pred pg1 = acle<float>::pg1();
195 svst1(pg1, D, a);
196 }
197 // Real double
198 inline void operator()(vecd a, double *D){
199 pred pg1 = acle<double>::pg1();
200 svst1(pg1, D, a);
201 }
202 // Real float
203 inline void operator()(veci a, Integer *D){
204 pred pg1 = acle<Integer>::pg1();
205 svst1(pg1, D, a);
206 }
207};
208
209struct Vstream{
210 // Real float
211 inline void operator()(float * a, vecf b){
212 pred pg1 = acle<float>::pg1();
213 svstnt1(pg1, a, b);
214 //svst1(pg1, a, b);
215 }
216 // Real double
217 inline void operator()(double * a, vecd b){
218 pred pg1 = acle<double>::pg1();
219 svstnt1(pg1, a, b);
220 //svst1(pg1, a, b);
221 }
222};
223
224struct Vset{
225 // Complex float
226 inline vecf operator()(Grid::ComplexF *a){
227 pred pg1 = acle<float>::pg1();
228 return svld1(pg1, (float*)a);
229 }
230 // Complex double
231 inline vecd operator()(Grid::ComplexD *a){
232 pred pg1 = acle<double>::pg1();
233 return svld1(pg1, (double*)a);
234 }
235 // Real float
236 inline vecf operator()(float *a){
237 pred pg1 = acle<float>::pg1();
238 return svld1(pg1, a);
239 }
240 // Real double
241 inline vecd operator()(double *a){
242 pred pg1 = acle<double>::pg1();
243 return svld1(pg1, a);
244 }
245 // Integer
247 pred pg1 = acle<Integer>::pg1();
248 return svld1(pg1, a);
249 }
250};
251
253// Arithmetic operations
255
256struct Sum{
257 // Complex/real float
258 inline vecf operator()(vecf a, vecf b){
259 pred pg1 = acle<float>::pg1();
260 return svadd_x(pg1, a, b);
261 }
262 // Complex/real double
263 inline vecd operator()(vecd a, vecd b){
264 pred pg1 = acle<double>::pg1();
265 return svadd_x(pg1, a, b);
266 }
267 // Integer
268 inline veci operator()(veci a, veci b){
269 pred pg1 = acle<Integer>::pg1();
270 return svadd_x(pg1, a, b);
271 }
272};
273
274struct Sub{
275 // Complex/real float
276 inline vecf operator()(vecf a, vecf b){
277 pred pg1 = acle<float>::pg1();
278 return svsub_x(pg1, a, b);
279 }
280 // Complex/real double
281 inline vecd operator()(vecd a, vecd b){
282 pred pg1 = acle<double>::pg1();
283 return svsub_x(pg1, a, b);
284 }
285 // Integer
286 inline veci operator()(veci a, veci b){
287 pred pg1 = acle<Integer>::pg1();
288 return svsub_x(pg1, a, b);
289 }
290
291};
292
293struct Mult{
294 // Real float fma
295 inline vecf operator()(vecf a, vecf b, vecf c){
296 pred pg1 = acle<float>::pg1();
297 return svmad_x(pg1, b, c, a);
298 }
299 // Real double fma
300 inline vecd operator()(vecd a, vecd b, vecd c){
301 pred pg1 = acle<double>::pg1();
302 return svmad_x(pg1, b, c, a);
303 }
304 // Real float
305 inline vecf operator()(vecf a, vecf b){
306 pred pg1 = acle<float>::pg1();
307 return svmul_x(pg1, a, b);
308 }
309 // Real double
310 inline vecd operator()(vecd a, vecd b){
311 pred pg1 = acle<double>::pg1();
312 return svmul_x(pg1, a, b);
313 }
314 // Integer
315 inline veci operator()(veci a, veci b){
316 pred pg1 = acle<Integer>::pg1();
317 return svmul_x(pg1, a, b);
318 }
319};
320
321struct MultRealPart{
322 // Complex float
323 inline vecf operator()(vecf a, vecf b){
324 pred pg1 = acle<float>::pg1();
325 // using FCMLA
326 vecf z_v = acle<float>::zero();
327 return svcmla_x(pg1, z_v, a, b, 0);
328 }
329 // Complex double
330 inline vecd operator()(vecd a, vecd b){
331 pred pg1 = acle<double>::pg1();
332 // using FCMLA
333 vecd z_v = acle<double>::zero();
334 return svcmla_x(pg1, z_v, a, b, 0);
335 }
336};
337
338struct MaddRealPart{
339 // Complex float
340 inline vecf operator()(vecf a, vecf b, vecf c){
341 pred pg1 = acle<float>::pg1();
342 // using FCMLA
343 return svcmla_x(pg1, c, a, b, 0);
344 }
345 // Complex double
346 inline vecd operator()(vecd a, vecd b, vecd c){
347 pred pg1 = acle<double>::pg1();
348 // using FCMLA
349 return svcmla_x(pg1, c, a, b, 0);
350 }
351};
352
353struct MultComplex{
354 // Complex a*b
355 // Complex float
356 inline vecf operator()(vecf a, vecf b){
357 pred pg1 = acle<float>::pg1();
359 // using FCMLA
360 vecf r_v = svcmla_x(pg1, z, a, b, 0);
361 return svcmla_x(pg1, r_v, a, b, 90);
362 }
363 // Complex double
364 inline vecd operator()(vecd a, vecd b){
365 pred pg1 = acle<double>::pg1();
367 // using FCMLA
368 vecd r_v = svcmla_x(pg1, z, a, b, 0);
369 return svcmla_x(pg1, r_v, a, b, 90);
370 }
371};
372
373struct MultAddComplex{
374 // Complex a*b+c
375 // Complex float
376 inline vecf operator()(vecf a, vecf b, vecf c){
377 pred pg1 = acle<float>::pg1();
378 // using FCMLA
379 vecf r_v = svcmla_x(pg1, c, a, b, 0);
380 return svcmla_x(pg1, r_v, a, b, 90);
381 }
382 // Complex double
383 inline vecd operator()(vecd a, vecd b, vecd c){
384 pred pg1 = acle<double>::pg1();
385 // using FCMLA
386 vecd r_v = svcmla_x(pg1, c, a, b, 0);
387 return svcmla_x(pg1, r_v, a, b, 90);
388 }
389};
390
391struct Div{
392 // Real float
393 inline vecf operator()(vecf a, vecf b){
394 pred pg1 = acle<float>::pg1();
395 return svdiv_x(pg1, a, b);
396 }
397 // Real double
398 inline vecd operator()(vecd a, vecd b){
399 pred pg1 = acle<double>::pg1();
400 return svdiv_x(pg1, a, b);
401 }
402};
403
404struct Conj{
405 // Complex float
407 pred pg_odd = acle<float>::pg_odd();
408 //return svneg_x(pg_odd, a); this is unsafe
409 return svneg_m(a, pg_odd, a);
410 }
411 // Complex double
413 pred pg_odd = acle<double>::pg_odd();
414 //return svneg_x(pg_odd, a); this is unsafe
415 return svneg_m(a, pg_odd, a);
416 }
417};
418
419struct TimesMinusI{
420 // Complex float
422 lutf tbl_swap = acle<float>::tbl_swap();
423 pred pg1 = acle<float>::pg1();
424 pred pg_odd = acle<float>::pg_odd();
425
426 vecf a_v = svtbl(a, tbl_swap);
427 //return svneg_x(pg_odd, a_v); this is unsafe
428 return svneg_m(a_v, pg_odd, a_v);
429 }
430 // Complex double
432 lutd tbl_swap = acle<double>::tbl_swap();
433 pred pg1 = acle<double>::pg1();
434 pred pg_odd = acle<double>::pg_odd();
435
436 vecd a_v = svtbl(a, tbl_swap);
437 //return svneg_x(pg_odd, a_v); this is unsafe
438 return svneg_m(a_v, pg_odd, a_v);
439 }
440};
441
442struct TimesI{
443 // Complex float
445 lutf tbl_swap = acle<float>::tbl_swap();
446 pred pg1 = acle<float>::pg1();
447 pred pg_even = acle<float>::pg_even();
448
449 vecf a_v = svtbl(a, tbl_swap);
450 //return svneg_x(pg_even, a_v); this is unsafe
451 return svneg_m(a_v, pg_even, a_v);
452 }
453 // Complex double
455 lutd tbl_swap = acle<double>::tbl_swap();
456 pred pg1 = acle<double>::pg1();
457 pred pg_even = acle<double>::pg_even();
458
459 vecd a_v = svtbl(a, tbl_swap);
460 //return svneg_x(pg_even, a_v); this is unsafe
461 return svneg_m(a_v, pg_even, a_v);
462 }
463};
464
465struct PrecisionChange {
466 static inline vech StoH (vecf sa, vecf sb) {
467 pred pg1s = acle<float>::pg1();
468 vech ha_v = svcvt_f16_x(pg1s, sa);
469 vech hb_v = svcvt_f16_x(pg1s, sb);
470 return svuzp1(ha_v, hb_v);
471 }
472 static inline void HtoS(vech h,vecf &sa,vecf &sb) {
473 pred pg1s = acle<float>::pg1();
474 vech ha_v = svzip1(h, h);
475 vech hb_v = svzip2(h, h);
476 sa = svcvt_f32_x(pg1s, ha_v);
477 sb = svcvt_f32_x(pg1s, hb_v);
478 }
479 static inline vecf DtoS (vecd a,vecd b) {
480 pred pg1d = acle<double>::pg1();
481 vecf sa_v = svcvt_f32_x(pg1d, a);
482 vecf sb_v = svcvt_f32_x(pg1d, b);
483 return svuzp1(sa_v, sb_v);
484 }
485 static inline void StoD (vecf s,vecd &a,vecd &b) {
486 pred pg1d = acle<double>::pg1();
487 vecf sa_v = svzip1(s, s);
488 vecf sb_v = svzip2(s, s);
489 a = svcvt_f64_x(pg1d, sa_v);
490 b = svcvt_f64_x(pg1d, sb_v);
491 }
492 static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
493 pred pg1d = acle<double>::pg1();
494 pred pg1h = acle<uint16_t>::pg1();
495 vech ha_v = svcvt_f16_x(pg1d, a);
496 vech hb_v = svcvt_f16_x(pg1d, b);
497 vech hc_v = svcvt_f16_x(pg1d, c);
498 vech hd_v = svcvt_f16_x(pg1d, d);
499 vech hab_v = svuzp1(ha_v, hb_v);
500 vech hcd_v = svuzp1(hc_v, hd_v);
501 return svuzp1(hab_v, hcd_v);
502
503/*
504 vecf sa,sb;
505 sa = DtoS(a,b);
506 sb = DtoS(c,d);
507 return StoH(sa,sb);
508*/
509 }
510 static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
511 pred pg1h = acle<uint16_t>::pg1();
512 pred pg1d = acle<double>::pg1();
513 vech sa_v = svzip1(h, h);
514 vech sb_v = svzip2(h, h);
515 vech da_v = svzip1(sa_v, sa_v);
516 vech db_v = svzip2(sa_v, sa_v);
517 vech dc_v = svzip1(sb_v, sb_v);
518 vech dd_v = svzip2(sb_v, sb_v);
519 a = svcvt_f64_x(pg1d, da_v);
520 b = svcvt_f64_x(pg1d, db_v);
521 c = svcvt_f64_x(pg1d, dc_v);
522 d = svcvt_f64_x(pg1d, dd_v);
523
524/*
525 vecf sa,sb;
526 HtoS(h,sa,sb);
527 StoD(sa,a,b);
528 StoD(sb,c,d);
529*/
530 }
531};
532
533struct Exchange{
534 // float
535 static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){
536 vecf r1_v = svext(in1, in1, (uint64_t)8u);
537 vecf r2_v = svext(in2, in2, (uint64_t)8u);
538 out1 = svext(r1_v, in2, (uint64_t)8u);
539 out2 = svext(in1, r2_v, (uint64_t)8u);
540 }
541 static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){
542 // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
543 // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
544 lutf tbl_exch1a = acle<float>::tbl_exch1a();
545 lutf tbl_exch1b = acle<float>::tbl_exch1b();
546 lutf tbl_exch1c = acle<float>::tbl_exch1c();
547
548 vecf a1_v = svtbl(in1, tbl_exch1a);
549 vecf a2_v = svtbl(in2, tbl_exch1b);
550 vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u);
551 vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u);
552 out1 = svtbl(b1_v, tbl_exch1c);
553 out2 = svtbl(b2_v, tbl_exch1a);
554 }
555 static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){
556 out1 = (vecf)svtrn1((vecd)in1, (vecd)in2);
557 out2 = (vecf)svtrn2((vecd)in1, (vecd)in2);
558 }
559 static inline void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2){
560 out1 = svtrn1(in1, in2);
561 out2 = svtrn2(in1, in2);
562 }
563
564 // double
565 static inline void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2){
566 vecd r1_v = svext(in1, in1, (uint64_t)4u);
567 vecd r2_v = svext(in2, in2, (uint64_t)4u);
568 out1 = svext(r1_v, in2, (uint64_t)4u);
569 out2 = svext(in1, r2_v, (uint64_t)4u);
570 }
571 static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){
572 // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
573 // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
574 lutd tbl_exch1a = acle<double>::tbl_exch1a();
575 lutd tbl_exch1b = acle<double>::tbl_exch1b();
576 lutd tbl_exch1c = acle<double>::tbl_exch1c();
577
578 vecd a1_v = svtbl(in1, tbl_exch1a);
579 vecd a2_v = svtbl(in2, tbl_exch1b);
580 vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u);
581 vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u);
582 out1 = svtbl(b1_v, tbl_exch1c);
583 out2 = svtbl(b2_v, tbl_exch1a);
584 }
585 static inline void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2){
586 out1 = svtrn1(in1, in2);
587 out2 = svtrn2(in1, in2);
588 }
589 static inline void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2){
590 assert(0);
591 return;
592 }
593};
594
595#undef VECTOR_FOR
596
597struct Permute{
598 // float
599 static inline vecf Permute0(vecf in) {
600 return svext(in, in, (uint64_t)8u);
601 }
602 static inline vecf Permute1(vecf in) {
603 lutf tbl_swap = acle<float>::tbl1();
604 return svtbl(in, tbl_swap);
605 }
606 static inline vecf Permute2(vecf in) {
607 lutf tbl_swap = acle<float>::tbl2();
608 return svtbl(in, tbl_swap);
609 }
610 static inline vecf Permute3(vecf in) {
611 lutf tbl_swap = acle<float>::tbl_swap();
612 return svtbl(in, tbl_swap);
613 }
614
615 // double
616 static inline vecd Permute0(vecd in) {
617 return svext(in, in, (uint64_t)(8u / 2u));
618 }
619 static inline vecd Permute1(vecd in) {
620 lutd tbl_swap = acle<double>::tbl1();
621 return svtbl(in, tbl_swap);
622 }
623 static inline vecd Permute2(vecd in) {
624 lutd tbl_swap = acle<double>::tbl_swap();
625 return svtbl(in, tbl_swap);
626 }
627 static inline vecd Permute3(vecd in) {
628 return in;
629 }
630};
631
632struct Rotate{
633
634 static inline vecf rotate(vecf in, int n){
635 switch(n){
636 case 0: return tRotate<0>(in); break;
637 case 1: return tRotate<1>(in); break;
638 case 2: return tRotate<2>(in); break;
639 case 3: return tRotate<3>(in); break;
640 case 4: return tRotate<4>(in); break;
641 case 5: return tRotate<5>(in); break;
642 case 6: return tRotate<6>(in); break;
643 case 7: return tRotate<7>(in); break;
644
645 case 8: return tRotate<8>(in); break;
646 case 9: return tRotate<9>(in); break;
647 case 10: return tRotate<10>(in); break;
648 case 11: return tRotate<11>(in); break;
649 case 12: return tRotate<12>(in); break;
650 case 13: return tRotate<13>(in); break;
651 case 14: return tRotate<14>(in); break;
652 case 15: return tRotate<15>(in); break;
653 default: assert(0);
654 }
655 }
656 static inline vecd rotate(vecd in, int n){
657 switch(n){
658 case 0: return tRotate<0>(in); break;
659 case 1: return tRotate<1>(in); break;
660 case 2: return tRotate<2>(in); break;
661 case 3: return tRotate<3>(in); break;
662 case 4: return tRotate<4>(in); break;
663 case 5: return tRotate<5>(in); break;
664 case 6: return tRotate<6>(in); break;
665 case 7: return tRotate<7>(in); break;
666 default: assert(0);
667 }
668 }
669
670 template <int n> static inline vecf tRotate(vecf in){
671 return svext(in, in, (uint64_t)n);
672 }
673 template <int n> static inline vecd tRotate(vecd in){
674 return svext(in, in, (uint64_t)n);
675 }
676};
677
// Horizontal sum of the active lanes of v, used by the Reduce
// specialisations below.
//
// tree-based reduction: svaddv(pg, v)
// left-to-right reduction (alternative): svadda(pg, 0, v)
//
// The expansion carries no trailing semicolon, so the macro can be used as
// an ordinary expression; the previous definition ended in ';' and produced
// a stray empty statement at every use.
#define svred(pg, v) svaddv((pg), (v))
685
// Horizontal reduction of a vector to a scalar of type Out_type.
// A class template is used so that the output type can be selected by
// specialisation. This general form is only a fallback: it prints an error
// message and returns 0; the supported type pairs are specialised separately.
template <typename Out_type, typename In_type>
struct Reduce{
  inline Out_type operator()(In_type in){
    printf("Error, using wrong Reduce function\n");
    return 0;
  }
};
696//Complex float Reduce
697template <>
699 pred pg_even = acle<float>::pg_even();
700 pred pg_odd = acle<float>::pg_odd();
701 float a = svred(pg_even, in);
702 float b = svred(pg_odd, in);
703 return Grid::ComplexF(a, b);
704}
705//Real float Reduce
706template <>
708 pred pg1 = acle<float>::pg1();
709 return svred(pg1, in);
710}
711//Complex double Reduce
712template <>
714 pred pg_even = acle<double>::pg_even();
715 pred pg_odd = acle<double>::pg_odd();
716 double a = svred(pg_even, in);
717 double b = svred(pg_odd, in);
718 return Grid::ComplexD(a, b);
719}
720//Real double Reduce
721template <>
723 pred pg1 = acle<double>::pg1();
724 return svred(pg1, in);
725}
726//Integer Reduce
727template <>
729 pred pg1 = acle<Integer>::pg1();
730 return svred(pg1, in);
731}
732
733#undef svred
734
735NAMESPACE_END(Optimization);
736
738// Here assign types
739
740typedef vech SIMD_Htype; // Reduced precision type
741typedef vecf SIMD_Ftype; // Single precision type
742typedef vecd SIMD_Dtype; // Double precision type
743typedef veci SIMD_Itype; // Integer type
744
745// prefetch utilities
// Both prefetch hooks are deliberate no-ops in this implementation.
inline void v_prefetch0(int size, const char *ptr)
{
}
inline void prefetch_HINT_T0(const char *ptr)
{
}
748
749// Function name aliases
750typedef Optimization::Vsplat VsplatSIMD;
751typedef Optimization::Vstore VstoreSIMD;
752typedef Optimization::Vset VsetSIMD;
753typedef Optimization::Vstream VstreamSIMD;
754template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
755
756// Arithmetic operations
757typedef Optimization::Sum SumSIMD;
758typedef Optimization::Sub SubSIMD;
759typedef Optimization::Div DivSIMD;
760typedef Optimization::Mult MultSIMD;
761typedef Optimization::MultComplex MultComplexSIMD;
762typedef Optimization::MultAddComplex MultAddComplexSIMD;
763typedef Optimization::MultRealPart MultRealPartSIMD;
764typedef Optimization::MaddRealPart MaddRealPartSIMD;
765typedef Optimization::Conj ConjSIMD;
766typedef Optimization::TimesMinusI TimesMinusISIMD;
767typedef Optimization::TimesI TimesISIMD;
768
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
vec< double > vecd
vec< Integer > veci
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
vec< float > vecf
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
vec< uint16_t > vech
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
void prefetch_HINT_T0(const char *ptr)
#define svred(pg, v)
void v_prefetch0(int size, const char *ptr)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
vecf operator()(vecf a)
vecd operator()(vecd a)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
static void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2)
vecf operator()(vecf a, vecf b, vecf c)
vecd operator()(vecd a, vecd b, vecd c)
vecf operator()(vecf a, vecf b, vecf c)
vecd operator()(vecd a, vecd b, vecd c)
vecd operator()(vecd a, vecd b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b, vecd c)
veci operator()(veci a, veci b)
vecf operator()(vecf a, vecf b, vecf c)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
static vecd Permute3(vecd in)
static vecd Permute1(vecd in)
static vecf Permute3(vecf in)
static vecd Permute2(vecd in)
static vecf Permute0(vecf in)
static vecf Permute1(vecf in)
static vecd Permute0(vecd in)
static vecf Permute2(vecf in)
static vech StoH(vecf sa, vecf sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static void HtoD(vech h, vecd &a, vecd &b, vecd &c, vecd &d)
static vech DtoH(vecd a, vecd b, vecd c, vecd d)
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static vecf tRotate(vecf in)
static vecd tRotate(vecd in)
static vecd rotate(vecd in, int n)
static vecf rotate(vecf in, int n)
vecf operator()(vecf a, vecf b)
veci operator()(veci a, veci b)
vecd operator()(vecd a, vecd b)
veci operator()(veci a, veci b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
vecd operator()(vecd a)
vecf operator()(vecf a)
vecd operator()(vecd a)
vecf operator()(vecf a)
vecd operator()(double *a)
veci operator()(Integer *a)
vecf operator()(Grid::ComplexF *a)
vecd operator()(Grid::ComplexD *a)
vecf operator()(float *a)
vecf operator()(float a, float b)
vecd operator()(double a)
vecf operator()(float a)
vecd operator()(double a, double b)
veci operator()(Integer a)
void operator()(veci a, Integer *D)
void operator()(vecd a, double *D)
void operator()(vecf a, float *D)
void operator()(double *a, vecd b)
void operator()(float *a, vecf b)
static lutd tbl_exch1b()
static lutd tbl_exch1a()
static lutd tbl_swap()
static lutd tbl_exch1c()
static pred pg_even()
static lutf tbl_exch1c()
static lutf tbl_swap()
static lutf tbl_exch1b()
static lutf tbl_exch1a()
uint64_t s[8]
uint32_t s[16]