31#define GEN_SIMD_WIDTH 32u
38#include <spi/include/kernel/location.h>
39#include <spi/include/l1p/types.h>
40#include <hwi/include/bqc/l1p_mmio.h>
41#include <hwi/include/bqc/A2_inlines.h>
51inline std::ostream &
operator<<(std::ostream& stream,
const vector4double a)
53 stream <<
"{"<<vec_extract(a,0)<<
","<<vec_extract(a,1)<<
","<<vec_extract(a,2)<<
","<<vec_extract(a,3)<<
"}";
59 stream <<
"{"<< a.
v0 <<
","<< a.
v1 <<
","<< a.
v2 <<
","<< a.
v3 <<
"}";
74 return (vector4double){a, b, a, b};
78 return (vector4double){a, a, a, a};
100 vec_st(a, 0, (
float *)(&f));
128 vec_st(a, 0, (
float *)(&f));
147 return (
vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
151 return vec_ld(0, (
double *)a);
160 return vec_ld(0, (
float *)(&a));
177template <
typename Out_type,
typename In_type>
182 printf(
"Error, using wrong Reduce function\n");
192#define FLOAT_WRAP_3(fn, pref) \
193 pref vector4float fn(vector4float a, vector4float b, vector4float c) \
195 vector4double ad, bd, rd, cd; \
201 rd = fn(ad, bd, cd); \
207#define FLOAT_WRAP_2(fn, pref) \
208 pref vector4float fn(vector4float a, vector4float b) \
210 vector4double ad, bd, rd; \
221#define FLOAT_WRAP_1(fn, pref) \
222 pref vector4float fn(vector4float a) \
224 vector4double ad, rd; \
236 inline vector4double
operator()(vector4double a, vector4double b){
237 return vec_add(a, b);
249 out.
v[i] = a.v[i] + b.v[i];
258 inline vector4double
operator()(vector4double a, vector4double b){
259 return vec_sub(a, b);
271 out.
v[i] = a.v[i] - b.v[i];
280 inline vector4double
operator()(vector4double a, vector4double b){
282 return vec_xmul(a, b);
288 inline vector4double
operator()(vector4double a, vector4double b,vector4double c){
289 return vec_xmadd(a, b, c);
295 inline vector4double
operator()(vector4double a, vector4double b){
296 return vec_xxnpmadd(a, b, vec_xmul(b, a));
305 inline vector4double
operator()(vector4double a, vector4double b){
306 return vec_mul(a, b);
318 out.
v[i] = a.v[i]*b.v[i];
327 inline vector4double
operator()(vector4double a, vector4double b){
328 return vec_swdiv(a, b);
340 out.
v[i] = a.v[i]/b.v[i];
350 return vec_mul(v, (vector4double){1., -1., 1., -1.});
360 return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
361 (vector4double){0., 0., 0., 0.});
371 return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
372 (vector4double){0., 0., 0., 0.});
382 std::cout <<
GridLogError <<
"QPX single to half precision conversion not yet supported." << std::endl;
387 std::cout <<
GridLogError <<
"QPX half to single precision conversion not yet supported." << std::endl;
392 std::cout <<
GridLogError <<
"QPX double to single precision conversion not yet supported." << std::endl;
397 std::cout <<
GridLogError <<
"QPX single to double precision conversion not yet supported." << std::endl;
400 static inline vech DtoH (vector4double a, vector4double b,
401 vector4double c, vector4double d) {
403 std::cout <<
GridLogError <<
"QPX double to half precision conversion not yet supported." << std::endl;
407 static inline void HtoD (
vech h, vector4double &a, vector4double &b,
408 vector4double &c, vector4double &d) {
409 std::cout <<
GridLogError <<
"QPX half to double precision conversion not yet supported." << std::endl;
416#define FLOAT_WRAP_EXCHANGE(fn) \
417 static inline void fn(vector4float &out1, vector4float &out2, \
418 vector4float in1, vector4float in2) \
420 vector4double out1d, out2d, in1d, in2d; \
421 in1d = Vset()(in1); \
422 in2d = Vset()(in2); \
423 fn(out1d, out2d, in1d, in2d); \
424 Vstore()(out1d, out1); \
425 Vstore()(out2d, out2); \
431 static inline void Exchange0(vector4double &out1, vector4double &out2,
432 vector4double in1, vector4double in2) {
433 out1 = vec_perm(in1, in2, vec_gpci(0145));
434 out2 = vec_perm(in1, in2, vec_gpci(02367));
436 static inline void Exchange1(vector4double &out1, vector4double &out2,
437 vector4double in1, vector4double in2) {
438 out1 = vec_perm(in1, in2, vec_gpci(0426));
439 out2 = vec_perm(in1, in2, vec_gpci(01537));
441 static inline void Exchange2(vector4double &out1, vector4double &out2,
442 vector4double in1, vector4double in2) {
445 static inline void Exchange3(vector4double &out1, vector4double &out2,
446 vector4double in1, vector4double in2) {
459 static inline vector4double
Permute0(vector4double v){
460 return vec_perm(v, v, vec_gpci(02301));
462 static inline vector4double
Permute1(vector4double v){
463 return vec_perm(v, v, vec_gpci(01032));
465 static inline vector4double
Permute2(vector4double v){
468 static inline vector4double
Permute3(vector4double v){
481 template<
int n>
static inline vector4double
tRotate(vector4double v){
482 if ( n==1 )
return vec_perm(v, v, vec_gpci(01230));
483 if ( n==2 )
return vec_perm(v, v, vec_gpci(02301));
484 if ( n==3 )
return vec_perm(v, v, vec_gpci(03012));
489 vector4double ad, rd;
497 static inline vector4double
rotate(vector4double v,
int n){
516 vector4double vd, rd;
531 v1 = Optimization::Permute::Permute0(v);
532 v1 = Optimization::Sum()(v1, v);
534 return Grid::ComplexF(v1.
v0, v1.
v1);
542 v1 = Optimization::Permute::Permute0(v);
543 v1 = Optimization::Sum()(v1, v);
544 v2 = Optimization::Permute::Permute1(v1);
545 v1 = Optimization::Sum()(v1, v2);
557 v1 = Optimization::Permute::Permute0(v);
560 return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
569 v1 = Optimization::Permute::Permute0(v);
571 v2 = Optimization::Permute::Permute1(v1);
572 v1 = vec_add(v1, v2);
574 return vec_extract(v1, 0);
581 for (
unsigned int i = 0; i < W<Integer>::r; ++i)
606template <
typename S,
typename T>
using ReduceSIMD = Optimization::Reduce<S,T>;
609typedef Optimization::Sum
SumSIMD;
610typedef Optimization::Sub
SubSIMD;
612typedef Optimization::Div
DivSIMD;
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define VECTOR_FOR(i, w, inc)
#define FLOAT_WRAP_1(fn, pref)
#define FLOAT_WRAP_3(fn, pref)
#define FLOAT_WRAP_2(fn, pref)
std::ostream & operator<<(std::ostream &stream, const vector4double a)
void prefetch_HINT_T0(const char *ptr)
void v_prefetch0(int size, const char *ptr)
accelerator_inline Grid_simd< S, V > rotate(Grid_simd< S, V > b, int nrot)
GridLogger GridLogError(1, "Error", GridLogColours, "RED")
#define NAMESPACE_BEGIN(A)
vector4double operator()(vector4double v)
vector4double operator()(vector4double a, vector4double b)
static void Exchange1(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange0(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
FLOAT_WRAP_EXCHANGE(Exchange3)
FLOAT_WRAP_EXCHANGE(Exchange2)
FLOAT_WRAP_EXCHANGE(Exchange0)
static void Exchange3(vector4double &out1, vector4double &out2, vector4double in1, vector4double in2)
static void Exchange2(vector4double &out1, vector4double &out2, vector4double in1, vector4double in2)
FLOAT_WRAP_EXCHANGE(Exchange1)
static void Exchange2(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2)
static void Exchange1(vector4double &out1, vector4double &out2, vector4double in1, vector4double in2)
static void Exchange0(vector4double &out1, vector4double &out2, vector4double in1, vector4double in2)
vector4double operator()(vector4double a, vector4double b, vector4double c)
vector4double operator()(vector4double a, vector4double b)
vector4double operator()(vector4double a, vector4double b)
vector4double operator()(vector4double a, vector4double b)
static vec< T > Permute0(vec< T > in)
static vector4double Permute0(vector4double v)
static vector4double Permute1(vector4double v)
static vector4double Permute2(vector4double v)
static vecd Permute1(vecd in)
static vector4double Permute3(vector4double v)
static vecf Permute3(vecf in)
static vecd Permute2(vecd in)
static void HtoS(vech h, vector4float &sa, vector4float &sb)
static vech StoH(const vector4float &a, const vector4float &b)
static vector4float DtoS(vector4double a, vector4double b)
static vech DtoH(vector4double a, vector4double b, vector4double c, vector4double d)
static void HtoD(vech h, vector4double &a, vector4double &b, vector4double &c, vector4double &d)
static void StoD(vector4float s, vector4double &a, vector4double &b)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static vector4float rotate(vector4float v, int n)
static vector4float tRotate(vector4float a)
static vector4double rotate(vector4double v, int n)
static vector4double tRotate(vector4double v)
vector4double operator()(vector4double a, vector4double b)
vector4double operator()(vector4double a, vector4double b)
vector4double operator()(vector4double v)
vector4double operator()(vector4double v)
veci operator()(Integer *a)
vector4double operator()(Grid::ComplexD *a)
vector4double operator()(double *a)
vector4float operator()(float *a)
vector4double operator()(vector4float a)
vector4float operator()(Grid::ComplexF *a)
vector4double operator()(double a, double b)
vector4float operator()(float a, float b)
veci operator()(Integer a)
vector4double operator()(double a)
vector4float operator()(float a)
void operator()(veci a, Integer *i)
void operator()(vector4double a, vector4float &f)
void operator()(vector4double a, double *d)
void operator()(vector4float a, float *f)
void operator()(vector4double a, float *f)
void operator()(vector4float f, vector4double a)
void operator()(float *f, vector4double a)
void operator()(double *d, vector4double a)
void operator()(float *f, vector4float a)