Grid 0.7.0
Grid_neon.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_neon.h
6
7 Copyright (C) 2015
8
9 Author: Nils Meyer <nils.meyer@ur.de>
10 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
11 Author: neo <cossu@post.kek.jp>
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27 See the full license in the file "LICENSE" in the top level distribution directory
28*************************************************************************************/
29/* END LEGAL */
30
31/*
32
33 ARMv8 NEON intrinsics layer by
34
35 Nils Meyer <nils.meyer@ur.de>,
36 University of Regensburg, Germany
37 SFB/TRR55
38
39*/
40
41#ifndef GEN_SIMD_WIDTH
42#define GEN_SIMD_WIDTH 16u
43#endif
44
45#include "Grid_generic_types.h"
46#include <arm_neon.h>
47
49NAMESPACE_BEGIN(Optimization);
50
51template<class vtype>
52union uconv {
53 float32x4_t f;
54 vtype v;
55};
56union u128f {
57 float32x4_t v;
58 float f[4];
59};
60union u128d {
61 float64x2_t v;
62 double f[2];
63};
64// half precision
65union u128h {
66 float16x8_t v;
67 uint16_t f[8];
68};
69
70struct Vsplat{
71 //Complex float
72 inline float32x4_t operator()(float a, float b){
73 float tmp[4]={a,b,a,b};
74 return vld1q_f32(tmp);
75 }
76 // Real float
77 inline float32x4_t operator()(float a){
78 return vdupq_n_f32(a);
79 }
80 //Complex double
81 inline float64x2_t operator()(double a, double b){
82 double tmp[2]={a,b};
83 return vld1q_f64(tmp);
84 }
85 //Real double
86 inline float64x2_t operator()(double a){
87 return vdupq_n_f64(a);
88 }
89 //Integer
90 inline uint32x4_t operator()(Integer a){
91 return vdupq_n_u32(a);
92 }
93};
94
95struct Vstore{
96 //Float
97 inline void operator()(float32x4_t a, float* F){
98 vst1q_f32(F, a);
99 }
100 //Double
101 inline void operator()(float64x2_t a, double* D){
102 vst1q_f64(D, a);
103 }
104 //Integer
105 inline void operator()(uint32x4_t a, Integer* I){
106 vst1q_u32(I, a);
107 }
108
109};
110
111struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
112 //Float // N:generic
113 inline void operator()(float * a, float32x4_t b){
114 memcpy(a,&b,4*sizeof(float));
115 }
116 //Double // N:generic
117 inline void operator()(double * a, float64x2_t b){
118 memcpy(a,&b,2*sizeof(double));
119 }
120};
121
122// Nils: Vset untested; not used currently in Grid at all;
123// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
124struct Vset{
125 // Complex float
126 inline float32x4_t operator()(Grid::ComplexF *a){
127 float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
128 return vld1q_f32(tmp);
129 }
130 // Complex double
131 inline float64x2_t operator()(Grid::ComplexD *a){
132 double tmp[2]={a[0].imag(),a[0].real()};
133 return vld1q_f64(tmp);
134 }
135 // Real float
136 inline float32x4_t operator()(float *a){
137 float tmp[4]={a[3],a[2],a[1],a[0]};
138 return vld1q_f32(tmp);
139 }
140 // Real double
141 inline float64x2_t operator()(double *a){
142 double tmp[2]={a[1],a[0]};
143 return vld1q_f64(tmp);
144 }
145 // Integer
146 inline uint32x4_t operator()(Integer *a){
147 return vld1q_dup_u32(a);
148 }
149};
150
// Horizontal reduction of a SIMD register to a scalar.
//
// The output type cannot be deduced from the argument, so the class is
// templated on both types and every supported combination is provided as an
// explicit specialization. Reaching this primary template at run time means
// an unsupported combination was requested: it reports the error and
// terminates the process.
template <typename Out_type, typename In_type>
struct Reduce {
  inline Out_type operator()(In_type in){
    printf("Error, using wrong Reduce function\n");
    exit(1);
    return 0;  // never reached; satisfies the non-void return type
  }
};
161
163// Arithmetic operations
165struct Sum{
166 //Complex/Real float
167 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
168 return vaddq_f32(a,b);
169 }
170 //Complex/Real double
171 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
172 return vaddq_f64(a,b);
173 }
174 //Integer
175 inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
176 return vaddq_u32(a,b);
177 }
178};
179
180struct Sub{
181 //Complex/Real float
182 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
183 return vsubq_f32(a,b);
184 }
185 //Complex/Real double
186 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
187 return vsubq_f64(a,b);
188 }
189 //Integer
190 inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
191 return vsubq_u32(a,b);
192 }
193};
194
195struct MultRealPart{
196 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
197 float32x4_t re = vtrn1q_f32(a, a);
198 return vmulq_f32(re, b);
199 }
200 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
201 float64x2_t re = vzip1q_f64(a, a);
202 return vmulq_f64(re, b);
203 }
204};
205
206struct MaddRealPart{
207 inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
208 float32x4_t re = vtrn1q_f32(a, a);
209 return vfmaq_f32(c, re, b);
210 }
211 inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
212 float64x2_t re = vzip1q_f64(a, a);
213 return vfmaq_f64(c, re, b);
214 }
215};
216
217struct Div{
218 // Real float
219 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
220 return vdivq_f32(a, b);
221 }
222 // Real double
223 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
224 return vdivq_f64(a, b);
225 }
226};
227
228struct MultComplex{
229 // Complex float
230 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
231
232 float32x4_t r0, r1, r2, r3, r4;
233
234 // a = ar ai Ar Ai
235 // b = br bi Br Bi
236 // collect real/imag part, negate bi and Bi
237 r0 = vtrn1q_f32(b, b); // br br Br Br
238 r1 = vnegq_f32(b); // -br -bi -Br -Bi
239 r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
240
241 // the fun part
242 r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
243 r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
244
245 // fma(a,b,c) = a+b*c
246 return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
247
248 // no fma, use mul and add
249 // float32x4_t r5;
250 // r5 = vmulq_f32(r0, a);
251 // return vaddq_f32(r4, r5);
252 }
253 // Complex double
254 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
255
256 float64x2_t r0, r1, r2, r3, r4;
257
258 // b = br bi
259 // collect real/imag part, negate bi
260 r0 = vtrn1q_f64(b, b); // br br
261 r1 = vnegq_f64(b); // -br -bi
262 r2 = vtrn2q_f64(b, r1); // bi -bi
263
264 // the fun part
265 r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
266 r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
267
268 // fma(a,b,c) = a+b*c
269 return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
270
271 // no fma, use mul and add
272 // float64x2_t r5;
273 // r5 = vmulq_f64(r0, a);
274 // return vaddq_f64(r4, r5);
275 }
276};
277
278struct Mult{
279 // Real float
280 inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
281 //return vaddq_f32(vmulq_f32(b,c),a);
282 return vfmaq_f32(a, b, c);
283 }
284 inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
285 //return vaddq_f64(vmulq_f64(b,c),a);
286 return vfmaq_f64(a, b, c);
287 }
288 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
289 return vmulq_f32(a,b);
290 }
291 // Real double
292 inline float64x2_t operator()(float64x2_t a, float64x2_t b){
293 return vmulq_f64(a,b);
294 }
295 // Integer
296 inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
297 return vmulq_u32(a,b);
298 }
299};
300
301struct Conj{
302 // Complex single
303 inline float32x4_t operator()(float32x4_t in){
304 // ar ai br bi -> ar -ai br -bi
305 float32x4_t r0, r1;
306 r0 = vnegq_f32(in); // -ar -ai -br -bi
307 r1 = vrev64q_f32(r0); // -ai -ar -bi -br
308 return vtrn1q_f32(in, r1); // ar -ai br -bi
309 }
310 // Complex double
311 inline float64x2_t operator()(float64x2_t in){
312
313 float64x2_t r0, r1;
314 r0 = vextq_f64(in, in, 1); // ai ar
315 r1 = vnegq_f64(r0); // -ai -ar
316 return vextq_f64(r0, r1, 1); // ar -ai
317 }
318 // do not define for integer input
319};
320
321struct TimesMinusI{
322 //Complex single
323 inline float32x4_t operator()(float32x4_t in){
324 // ar ai br bi -> ai -ar ai -br
325 float32x4_t r0, r1;
326 r0 = vnegq_f32(in); // -ar -ai -br -bi
327 r1 = vrev64q_f32(in); // ai ar bi br
328 return vtrn1q_f32(r1, r0); // ar -ai br -bi
329 }
330 //Complex double
331 inline float64x2_t operator()(float64x2_t in){
332 // a ib -> b -ia
333 float64x2_t tmp;
334 tmp = vnegq_f64(in);
335 return vextq_f64(in, tmp, 1);
336 }
337};
338
339struct TimesI{
340 //Complex single
341 inline float32x4_t operator()(float32x4_t in){
342 // ar ai br bi -> -ai ar -bi br
343 float32x4_t r0, r1;
344 r0 = vnegq_f32(in); // -ar -ai -br -bi
345 r1 = vrev64q_f32(r0); // -ai -ar -bi -br
346 return vtrn1q_f32(r1, in); // -ai ar -bi br
347 }
348 //Complex double
349 inline float64x2_t operator()(float64x2_t in){
350 // a ib -> -b ia
351 float64x2_t tmp;
352 tmp = vnegq_f64(in);
353 return vextq_f64(tmp, in, 1);
354 }
355};
356
357struct Permute{
358
359 static inline float32x4_t Permute0(float32x4_t in){ // N:ok
360 // AB CD -> CD AB
361 return vextq_f32(in, in, 2);
362 };
363 static inline float32x4_t Permute1(float32x4_t in){ // N:ok
364 // AB CD -> BA DC
365 return vrev64q_f32(in);
366 };
367 static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
368 return in;
369 };
370 static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
371 return in;
372 };
373
374 static inline float64x2_t Permute0(float64x2_t in){ // N:ok
375 // AB -> BA
376 return vextq_f64(in, in, 1);
377 };
378 static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
379 return in;
380 };
381 static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
382 return in;
383 };
384 static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
385 return in;
386 };
387
388};
389
390struct Rotate{
391
392 static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
393 switch(n){
394 case 0: // AB CD -> AB CD
395 return tRotate<0>(in);
396 break;
397 case 1: // AB CD -> BC DA
398 return tRotate<1>(in);
399 break;
400 case 2: // AB CD -> CD AB
401 return tRotate<2>(in);
402 break;
403 case 3: // AB CD -> DA BC
404 return tRotate<3>(in);
405 break;
406 default: assert(0);
407 }
408 }
409 static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
410 switch(n){
411 case 0: // AB -> AB
412 return tRotate<0>(in);
413 break;
414 case 1: // AB -> BA
415 return tRotate<1>(in);
416 break;
417 default: assert(0);
418 }
419 }
420
421 template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
422 template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
423
424};
425
426struct PrecisionChange {
427
428 static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
429 float16x4_t h = vcvt_f16_f32(a);
430 return vcvt_high_f16_f32(h, b);
431 }
432 static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
433 sb = vcvt_high_f32_f16(h);
434 // there is no direct conversion from lower float32x4_t to float64x2_t
435 // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
436 // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
437 // workaround for clang
438 uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
439 float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
440 sa = vcvt_high_f32_f16(h1);
441 }
442 static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
443 float32x2_t s = vcvt_f32_f64(a);
444 return vcvt_high_f32_f64(s, b);
445
446 }
447 static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
448 b = vcvt_high_f64_f32(s);
449 // there is no direct conversion from lower float32x4_t to float64x2_t
450 float32x4_t s1 = vextq_f32(s, s, 2);
451 a = vcvt_high_f64_f32(s1);
452
453 }
454 static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
455 float32x4_t s1 = DtoS(a, b);
456 float32x4_t s2 = DtoS(c, d);
457 return StoH(s1, s2);
458 }
459 static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
460 float32x4_t s1, s2;
461 HtoS(h, s1, s2);
462 StoD(s1, a, b);
463 StoD(s2, c, d);
464 }
465};
466
468// Exchange support
469
470struct Exchange{
471 static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
472 // in1: ABCD -> out1: ABEF
473 // in2: EFGH -> out2: CDGH
474
475 // z: CDAB
476 float32x4_t z = vextq_f32(in1, in1, 2);
477 // out1: ABEF
478 out1 = vextq_f32(z, in2, 2);
479
480 // z: GHEF
481 z = vextq_f32(in2, in2, 2);
482 // out2: CDGH
483 out2 = vextq_f32(in1, z, 2);
484 };
485
486 static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
487 // in1: ABCD -> out1: AECG
488 // in2: EFGH -> out2: BFDH
489 out1 = vtrn1q_f32(in1, in2);
490 out2 = vtrn2q_f32(in1, in2);
491 };
492 static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
493 assert(0);
494 return;
495 };
496 static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
497 assert(0);
498 return;
499 };
500 // double precision
501 static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
502 // in1: AB -> out1: AC
503 // in2: CD -> out2: BD
504 out1 = vzip1q_f64(in1, in2);
505 out2 = vzip2q_f64(in1, in2);
506 };
507 static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
508 assert(0);
509 return;
510 };
511 static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
512 assert(0);
513 return;
514 };
515 static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
516 assert(0);
517 return;
518 };
519};
520
522// Some Template specialization
523
524
525//Complex float Reduce
526template<>
527inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
528 float32x4_t v1; // two complex
529 v1 = Optimization::Permute::Permute0(in);
530 v1 = vaddq_f32(v1,in);
531 u128f conv; conv.v=v1;
532 return Grid::ComplexF(conv.f[0],conv.f[1]);
533}
534//Real float Reduce
535template<>
536inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
537 return vaddvq_f32(in);
538}
539
540
541//Complex double Reduce
542template<>
543inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
544 u128d conv; conv.v = in;
545 return Grid::ComplexD(conv.f[0],conv.f[1]);
546}
547
548//Real double Reduce
549template<>
550inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
551 return vaddvq_f64(in);
552}
553
554//Integer Reduce
555template<>
557 return vaddvq_u32(in);
558}
559
560NAMESPACE_END(Optimization);
561
563// Here assign types
564
565// typedef Optimization::vech SIMD_Htype; // Reduced precision type
566typedef float16x8_t SIMD_Htype; // Half precision type
567typedef float32x4_t SIMD_Ftype; // Single precision type
568typedef float64x2_t SIMD_Dtype; // Double precision type
569typedef uint32x4_t SIMD_Itype; // Integer type
570
// Prefetch utilities. Both are intentionally empty in this NEON layer: the
// arguments are accepted and ignored.
inline void v_prefetch0(int size, const char *ptr){
  (void)size;
  (void)ptr;
}
inline void prefetch_HINT_T0(const char *ptr){
  (void)ptr;
}
573
574
575// Function name aliases
576typedef Optimization::Vsplat VsplatSIMD;
577typedef Optimization::Vstore VstoreSIMD;
578typedef Optimization::Vset VsetSIMD;
579typedef Optimization::Vstream VstreamSIMD;
580template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
581
582// Arithmetic operations
583typedef Optimization::Sum SumSIMD;
584typedef Optimization::Sub SubSIMD;
585typedef Optimization::Div DivSIMD;
586typedef Optimization::Mult MultSIMD;
587typedef Optimization::MultComplex MultComplexSIMD;
588typedef Optimization::MultRealPart MultRealPartSIMD;
589typedef Optimization::MaddRealPart MaddRealPartSIMD;
590typedef Optimization::Conj ConjSIMD;
591typedef Optimization::TimesMinusI TimesMinusISIMD;
592typedef Optimization::TimesI TimesISIMD;
593
595
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
void prefetch_HINT_T0(const char *ptr)
Definition Grid_neon.h:572
void v_prefetch0(int size, const char *ptr)
Definition Grid_neon.h:571
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
float64x2_t operator()(float64x2_t in)
Definition Grid_neon.h:311
float32x4_t operator()(float32x4_t in)
Definition Grid_neon.h:303
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:219
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:223
static void Exchange3(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
Definition Grid_neon.h:515
static void Exchange2(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
Definition Grid_neon.h:511
static void Exchange2(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
Definition Grid_neon.h:492
static void Exchange0(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
Definition Grid_neon.h:501
static void Exchange1(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
Definition Grid_neon.h:507
static void Exchange3(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
Definition Grid_neon.h:496
static void Exchange0(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
Definition Grid_neon.h:471
static void Exchange1(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
Definition Grid_neon.h:486
float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c)
Definition Grid_neon.h:211
float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c)
Definition Grid_neon.h:207
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:230
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:254
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:200
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:196
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:288
float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c)
Definition Grid_neon.h:284
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:292
float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c)
Definition Grid_neon.h:280
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
Definition Grid_neon.h:296
static float64x2_t Permute0(float64x2_t in)
Definition Grid_neon.h:374
static float64x2_t Permute1(float64x2_t in)
Definition Grid_neon.h:378
static float32x4_t Permute1(float32x4_t in)
Definition Grid_neon.h:363
static float64x2_t Permute3(float64x2_t in)
Definition Grid_neon.h:384
static float32x4_t Permute0(float32x4_t in)
Definition Grid_neon.h:359
static float64x2_t Permute2(float64x2_t in)
Definition Grid_neon.h:381
static float32x4_t Permute2(float32x4_t in)
Definition Grid_neon.h:367
static float32x4_t Permute3(float32x4_t in)
Definition Grid_neon.h:370
static vech StoH(const vecf &sa, const vecf &sb)
static void HtoD(float16x8_t h, float64x2_t &a, float64x2_t &b, float64x2_t &c, float64x2_t &d)
Definition Grid_neon.h:459
static float16x8_t StoH(const float32x4_t &a, const float32x4_t &b)
Definition Grid_neon.h:428
static float32x4_t DtoS(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:442
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static float16x8_t DtoH(float64x2_t a, float64x2_t b, float64x2_t c, float64x2_t d)
Definition Grid_neon.h:454
static void StoD(float32x4_t s, float64x2_t &a, float64x2_t &b)
Definition Grid_neon.h:447
static void HtoS(vech h, vecf &sa, vecf &sb)
static void HtoS(float16x8_t h, float32x4_t &sa, float32x4_t &sb)
Definition Grid_neon.h:432
Out_type operator()(In_type in)
Definition Grid_neon.h:155
static vec< T > tRotate(vec< T > in)
static float64x2_t tRotate(float64x2_t in)
Definition Grid_neon.h:422
static float32x4_t rotate(float32x4_t in, int n)
Definition Grid_neon.h:392
static float32x4_t tRotate(float32x4_t in)
Definition Grid_neon.h:421
static float64x2_t rotate(float64x2_t in, int n)
Definition Grid_neon.h:409
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
Definition Grid_neon.h:190
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:186
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:182
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
Definition Grid_neon.h:175
float64x2_t operator()(float64x2_t a, float64x2_t b)
Definition Grid_neon.h:171
float32x4_t operator()(float32x4_t a, float32x4_t b)
Definition Grid_neon.h:167
float32x4_t operator()(float32x4_t in)
Definition Grid_neon.h:341
float64x2_t operator()(float64x2_t in)
Definition Grid_neon.h:349
float64x2_t operator()(float64x2_t in)
Definition Grid_neon.h:331
float32x4_t operator()(float32x4_t in)
Definition Grid_neon.h:323
uint32x4_t operator()(Integer *a)
Definition Grid_neon.h:146
float64x2_t operator()(double *a)
Definition Grid_neon.h:141
float32x4_t operator()(Grid::ComplexF *a)
Definition Grid_neon.h:126
float32x4_t operator()(float *a)
Definition Grid_neon.h:136
float64x2_t operator()(Grid::ComplexD *a)
Definition Grid_neon.h:131
float64x2_t operator()(double a, double b)
Definition Grid_neon.h:81
float32x4_t operator()(float a)
Definition Grid_neon.h:77
float32x4_t operator()(float a, float b)
Definition Grid_neon.h:72
uint32x4_t operator()(Integer a)
Definition Grid_neon.h:90
float64x2_t operator()(double a)
Definition Grid_neon.h:86
void operator()(uint32x4_t a, Integer *I)
Definition Grid_neon.h:105
void operator()(float32x4_t a, float *F)
Definition Grid_neon.h:97
void operator()(float64x2_t a, double *D)
Definition Grid_neon.h:101
void operator()(double *a, float64x2_t b)
Definition Grid_neon.h:117
void operator()(float *a, float32x4_t b)
Definition Grid_neon.h:113
double f[2]
Definition Grid_neon.h:62
float64x2_t v
Definition Grid_neon.h:61
float32x4_t v
Definition Grid_neon.h:57
float f[4]
Definition Grid_neon.h:58
uint16_t f[8]
Definition Grid_neon.h:67
float16x8_t v
Definition Grid_neon.h:66
__m256 f
Definition Grid_avx.h:46
vtype v
Definition Grid_avx.h:47