// SIMD width constant for this backend.
// NOTE(review): presumably a size in bytes -- 16 would match the 128-bit
// float32x4_t / float64x2_t / uint32x4_t vectors used in this file; confirm
// against the code that consumes GEN_SIMD_WIDTH.
42#define GEN_SIMD_WIDTH 16u
73 float tmp[4]={a,b,a,b};
74 return vld1q_f32(tmp);
78 return vdupq_n_f32(a);
83 return vld1q_f64(tmp);
87 return vdupq_n_f64(a);
91 return vdupq_n_u32(a);
114 memcpy(a,&b,4*
sizeof(
float));
118 memcpy(a,&b,2*
sizeof(
double));
127 float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
128 return vld1q_f32(tmp);
132 double tmp[2]={a[0].imag(),a[0].real()};
133 return vld1q_f64(tmp);
137 float tmp[4]={a[3],a[2],a[1],a[0]};
138 return vld1q_f32(tmp);
142 double tmp[2]={a[1],a[0]};
143 return vld1q_f64(tmp);
147 return vld1q_dup_u32(a);
151template <
typename Out_type,
typename In_type>
156 printf(
"Error, using wrong Reduce function\n");
168 return vaddq_f32(a,b);
172 return vaddq_f64(a,b);
176 return vaddq_u32(a,b);
183 return vsubq_f32(a,b);
187 return vsubq_f64(a,b);
191 return vsubq_u32(a,b);
197 float32x4_t re = vtrn1q_f32(a, a);
198 return vmulq_f32(re, b);
201 float64x2_t re = vzip1q_f64(a, a);
202 return vmulq_f64(re, b);
207 inline float32x4_t
operator()(float32x4_t a, float32x4_t b, float32x4_t c){
208 float32x4_t re = vtrn1q_f32(a, a);
209 return vfmaq_f32(c, re, b);
211 inline float64x2_t
operator()(float64x2_t a, float64x2_t b, float64x2_t c){
212 float64x2_t re = vzip1q_f64(a, a);
213 return vfmaq_f64(c, re, b);
220 return vdivq_f32(a, b);
224 return vdivq_f64(a, b);
232 float32x4_t r0, r1, r2, r3, r4;
237 r0 = vtrn1q_f32(b, b);
239 r2 = vtrn2q_f32(b, r1);
242 r3 = vmulq_f32(r2, a);
243 r4 = vrev64q_f32(r3);
246 return vfmaq_f32(r4, r0, a);
256 float64x2_t r0, r1, r2, r3, r4;
260 r0 = vtrn1q_f64(b, b);
262 r2 = vtrn2q_f64(b, r1);
265 r3 = vmulq_f64(r2, a);
266 r4 = vextq_f64(r3,r3,1);
269 return vfmaq_f64(r4, r0, a);
280 inline float32x4_t
mac(float32x4_t a, float32x4_t b, float32x4_t c){
282 return vfmaq_f32(a, b, c);
284 inline float64x2_t
mac(float64x2_t a, float64x2_t b, float64x2_t c){
286 return vfmaq_f64(a, b, c);
289 return vmulq_f32(a,b);
293 return vmulq_f64(a,b);
297 return vmulq_u32(a,b);
307 r1 = vrev64q_f32(r0);
308 return vtrn1q_f32(in, r1);
314 r0 = vextq_f64(in, in, 1);
316 return vextq_f64(r0, r1, 1);
327 r1 = vrev64q_f32(in);
328 return vtrn1q_f32(r1, r0);
335 return vextq_f64(in, tmp, 1);
345 r1 = vrev64q_f32(r0);
346 return vtrn1q_f32(r1, in);
353 return vextq_f64(tmp, in, 1);
359 static inline float32x4_t
Permute0(float32x4_t in){
361 return vextq_f32(in, in, 2);
363 static inline float32x4_t
Permute1(float32x4_t in){
365 return vrev64q_f32(in);
367 static inline float32x4_t
Permute2(float32x4_t in){
370 static inline float32x4_t
Permute3(float32x4_t in){
374 static inline float64x2_t
Permute0(float64x2_t in){
376 return vextq_f64(in, in, 1);
378 static inline float64x2_t
Permute1(float64x2_t in){
381 static inline float64x2_t
Permute2(float64x2_t in){
384 static inline float64x2_t
Permute3(float64x2_t in){
392 static inline float32x4_t
rotate(float32x4_t in,
int n){
409 static inline float64x2_t
rotate(float64x2_t in,
int n){
421 template<
int n>
static inline float32x4_t
tRotate(float32x4_t in){
return vextq_f32(in,in,n%4); };
422 template<
int n>
static inline float64x2_t
tRotate(float64x2_t in){
return vextq_f64(in,in,n%2); };
428 static inline float16x8_t
StoH (
const float32x4_t &a,
const float32x4_t &b) {
429 float16x4_t h = vcvt_f16_f32(a);
430 return vcvt_high_f16_f32(h, b);
432 static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
433 sb = vcvt_high_f32_f16(h);
438 uint32x4_t h1u =
reinterpret_cast<uint32x4_t
>(h);
439 float16x8_t h1 =
reinterpret_cast<float16x8_t
>(vextq_u32(h1u, h1u, 2));
440 sa = vcvt_high_f32_f16(h1);
442 static inline float32x4_t
DtoS (float64x2_t a,float64x2_t b) {
443 float32x2_t s = vcvt_f32_f64(a);
444 return vcvt_high_f32_f64(s, b);
447 static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
448 b = vcvt_high_f64_f32(s);
450 float32x4_t s1 = vextq_f32(s, s, 2);
451 a = vcvt_high_f64_f32(s1);
454 static inline float16x8_t
DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
455 float32x4_t s1 =
DtoS(a, b);
456 float32x4_t s2 =
DtoS(c, d);
459 static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
471 static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
476 float32x4_t z = vextq_f32(in1, in1, 2);
478 out1 = vextq_f32(z, in2, 2);
481 z = vextq_f32(in2, in2, 2);
483 out2 = vextq_f32(in1, z, 2);
486 static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
489 out1 = vtrn1q_f32(in1, in2);
490 out2 = vtrn2q_f32(in1, in2);
492 static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
496 static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
501 static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
504 out1 = vzip1q_f64(in1, in2);
505 out2 = vzip2q_f64(in1, in2);
507 static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
511 static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
515 static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
529 v1 = Optimization::Permute::Permute0(in);
530 v1 = vaddq_f32(v1,in);
532 return Grid::ComplexF(conv.
f[0],conv.
f[1]);
537 return vaddvq_f32(in);
545 return Grid::ComplexD(conv.
f[0],conv.
f[1]);
551 return vaddvq_f64(in);
557 return vaddvq_u32(in);
580template <
typename S,
typename T>
using ReduceSIMD = Optimization::Reduce<S,T>;
583typedef Optimization::Sum
SumSIMD;
584typedef Optimization::Sub
SubSIMD;
585typedef Optimization::Div
DivSIMD;
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
void prefetch_HINT_T0(const char *ptr)
void v_prefetch0(int size, const char *ptr)
#define NAMESPACE_BEGIN(A)
static INTERNAL_PRECISION F
float64x2_t operator()(float64x2_t in)
float32x4_t operator()(float32x4_t in)
float32x4_t operator()(float32x4_t a, float32x4_t b)
float64x2_t operator()(float64x2_t a, float64x2_t b)
static void Exchange3(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
static void Exchange2(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
static void Exchange2(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
static void Exchange0(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
static void Exchange1(float64x2_t &out1, float64x2_t &out2, float64x2_t in1, float64x2_t in2)
static void Exchange3(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
static void Exchange0(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
static void Exchange1(float32x4_t &out1, float32x4_t &out2, float32x4_t in1, float32x4_t in2)
float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c)
float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c)
float32x4_t operator()(float32x4_t a, float32x4_t b)
float64x2_t operator()(float64x2_t a, float64x2_t b)
float64x2_t operator()(float64x2_t a, float64x2_t b)
float32x4_t operator()(float32x4_t a, float32x4_t b)
float32x4_t operator()(float32x4_t a, float32x4_t b)
float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c)
float64x2_t operator()(float64x2_t a, float64x2_t b)
float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c)
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
static float64x2_t Permute0(float64x2_t in)
static float64x2_t Permute1(float64x2_t in)
static float32x4_t Permute1(float32x4_t in)
static float64x2_t Permute3(float64x2_t in)
static float32x4_t Permute0(float32x4_t in)
static float64x2_t Permute2(float64x2_t in)
static float32x4_t Permute2(float32x4_t in)
static float32x4_t Permute3(float32x4_t in)
static vech StoH(const vecf &sa, const vecf &sb)
static void HtoD(float16x8_t h, float64x2_t &a, float64x2_t &b, float64x2_t &c, float64x2_t &d)
static float16x8_t StoH(const float32x4_t &a, const float32x4_t &b)
static float32x4_t DtoS(float64x2_t a, float64x2_t b)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static float16x8_t DtoH(float64x2_t a, float64x2_t b, float64x2_t c, float64x2_t d)
static void StoD(float32x4_t s, float64x2_t &a, float64x2_t &b)
static void HtoS(vech h, vecf &sa, vecf &sb)
static void HtoS(float16x8_t h, float32x4_t &sa, float32x4_t &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static float64x2_t tRotate(float64x2_t in)
static float32x4_t rotate(float32x4_t in, int n)
static float32x4_t tRotate(float32x4_t in)
static float64x2_t rotate(float64x2_t in, int n)
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
float64x2_t operator()(float64x2_t a, float64x2_t b)
float32x4_t operator()(float32x4_t a, float32x4_t b)
uint32x4_t operator()(uint32x4_t a, uint32x4_t b)
float64x2_t operator()(float64x2_t a, float64x2_t b)
float32x4_t operator()(float32x4_t a, float32x4_t b)
float32x4_t operator()(float32x4_t in)
float64x2_t operator()(float64x2_t in)
float64x2_t operator()(float64x2_t in)
float32x4_t operator()(float32x4_t in)
uint32x4_t operator()(Integer *a)
float64x2_t operator()(double *a)
float32x4_t operator()(Grid::ComplexF *a)
float32x4_t operator()(float *a)
float64x2_t operator()(Grid::ComplexD *a)
float64x2_t operator()(double a, double b)
float32x4_t operator()(float a)
float32x4_t operator()(float a, float b)
uint32x4_t operator()(Integer a)
float64x2_t operator()(double a)
void operator()(uint32x4_t a, Integer *I)
void operator()(float32x4_t a, float *F)
void operator()(float64x2_t a, double *D)
void operator()(double *a, float64x2_t b)
void operator()(float *a, float32x4_t b)