39#include <hip/hip_fp16.h>
41#if !defined(GRID_CUDA) && !defined(GRID_HIP)
43 typedef struct { uint16_t x;}
half;
57#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
85 r.
z.x = lhs.
z.x + rhs.
z.x;
86 r.
z.y = lhs.
z.y + rhs.
z.y;
91 r.
z.x = lhs.
z.x - rhs.
z.x;
92 r.
z.y = lhs.
z.y - rhs.
z.y;
97 r.
z.x= lhs.
z.x*rhs.
z.x - lhs.
z.y*rhs.
z.y;
98 r.
z.y= lhs.
z.x*rhs.
z.y + lhs.
z.y*rhs.
z.x;
104 ret.
z.x = l.
z.x*r.
z.x;
105 ret.
z.y = l.
z.x*r.
z.y;
109 stream <<
"("<< o.
z.x <<
","<< o.
z.y <<
")";
114template<
int _N,
class _datum>
117 static const int N = _N;
122template<
int N,
class datum>
125 for(
int i=0;i<N;i++) {
126 ret.v[i] = l.v[i]*r.v[i];
130template<
int N,
class datum>
133 for(
int i=0;i<N;i++) {
134 ret.v[i] = l.v[i]-r.v[i];
138template<
int N,
class datum>
141 for(
int i=0;i<N;i++) {
142 ret.v[i] = l.v[i]+r.v[i];
146template<
int N,
class datum>
149 for(
int i=0;i<N;i++) {
150 ret.v[i] = l.v[i]/r.v[i];
183#if defined(GRID_CUDA) || defined(GRID_HIP)
195#if defined(GRID_CUDA) || defined(GRID_HIP)
250 template<
int N,
class datum,
class P>
258 template<
int N,
class datum,
class P>
270 for(
int i=0;i<vec::N;i++){
271 ret.
v[i] = vec::datum(a[i].
real(),a[i].
imag());
279 for(
int i=0;i<vec::N;i++){
280 ret.
v[i] = vec::datum(a[i].
real(),a[i].
imag());
288 for(
int i=0;i<vec::N;i++){
289 ret.
v[i] = vec::datum(a[i]);
297 for(
int i=0;i<vec::N;i++){
298 ret.
v[i] = vec::datum(a[i]);
306 for(
int i=0;i<vec::N;i++){
307 ret.
v[i] = vec::datum(a[i]);
313 template <
typename Out_type,
typename In_type>
318 printf(
"Error, using wrong Reduce function\n");
368 for(
int i=0;i<vec::N;i++){
376 for(
int i=0;i<vec::N;i++){
387 for(
int i=0;i<vec::N;i++){
395 for(
int i=0;i<vec::N;i++){
449 ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
450 ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
457 ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
458 ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
470 for(
int i=0;i<vec::N;i++){
471 ret.
v[i].z.x = in.v[i].z.x;
472 ret.
v[i].z.y =-in.v[i].z.y;
479 for(
int i=0;i<vec::N;i++){
480 ret.
v[i].z.x = in.v[i].z.x;
481 ret.
v[i].z.y =-in.v[i].z.y;
492 for(
int i=0;i<vec::N;i++){
493 ret.
v[i].z.x = in.v[i].z.y;
494 ret.
v[i].z.y =-in.v[i].z.x;
501 for(
int i=0;i<vec::N;i++){
502 ret.
v[i].z.x = in.v[i].z.y;
503 ret.
v[i].z.y =-in.v[i].z.x;
514 for(
int i=0;i<vec::N;i++){
515 ret.
v[i].z.x =-in.v[i].z.y;
516 ret.
v[i].z.y = in.v[i].z.x;
523 for(
int i=0;i<vec::N;i++){
524 ret.
v[i].z.x =-in.v[i].z.y;
525 ret.
v[i].z.y = in.v[i].z.x;
533 template <
int n,
typename vec>
536 unsigned int _mask = vec::N >> (n + 1);
537 for(
int i=0;i<vec::N;i++) {
538 out.
v[i] = in.
v[i^_mask];
558 for(
int i=0;i<N;i++) {
568 for(
int i=0;i<N;i++) {
578 for(
int i=0;i<N;i++) {
586 for(
int i=0;i<N;i++) {
598 for(
int i=0;i<N;i++) {
599 h.v[i ].z.x = a.v[i].z.x;
600 h.v[i ].z.y = a.v[i].z.y;
601 h.v[i+N].z.x = b.v[i].z.x;
602 h.v[i+N].z.y = b.v[i].z.y;
609 for(
int i=0;i<N;i++) {
610 sa.v[i].z.x = h.v[i ].z.x;
611 sa.v[i].z.y = h.v[i ].z.y;
612 sb.v[i].z.x = h.v[i+N].z.x;
613 sb.v[i].z.y = h.v[i+N].z.y;
620 for(
int i=0;i<N;i++) {
629 for(
int i=0;i<N;i++) {
666 template <
typename vec,
int n>
668 unsigned int mask = vec::N >> (n + 1);
669 for(
int i=0;i<vec::N;i++) {
671 if ( (i&mask) == 0 ) { out1.
v[i]=in1.
v[j1];}
672 else { out1.
v[i]=in2.
v[j1];}
674 if ( (i&mask) == 0 ) { out2.
v[i]=in1.
v[j2];}
675 else { out2.
v[i]=in2.
v[j2];}
678 template <
typename vec>
682 template <
typename vec>
686 template <
typename vec>
690 template <
typename vec>
703 template <
typename vec>
706 for(
int i=0;i<vec::N;i++){
707 out.
v[i] = in.
v[(i + n)%vec::N];
741 greduce = greduce+in.v[i];
743 Grid::ComplexF ret(greduce.z.x,greduce.z.y);
753 greduce = greduce+in.v[i];
755 Grid::ComplexD ret(greduce.z.x,greduce.z.y);
#define accelerator_inline
accelerator_inline Grid_simd2< S, V > real_mult(Grid_simd2< S, V > a, Grid_simd2< S, V > b)
#define COALESCE_GRANULARITY
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
static INTERNAL_PRECISION F
accelerator_inline GpuComplex()=default
accelerator_inline GpuComplex & operator-=(const GpuComplex &r)
accelerator_inline GpuComplex & operator+=(const GpuComplex &r)
accelerator_inline Real real(void) const
accelerator_inline Real imag(void) const
friend accelerator_inline GpuComplex operator*(const GpuComplex &lhs, const GpuComplex &rhs)
accelerator_inline GpuComplex(const GpuComplex &zz)
friend std::ostream & operator<<(std::ostream &stream, const GpuComplex o)
accelerator_inline GpuComplex & operator=(const Zero &zz)
friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs, const GpuComplex &rhs)
friend accelerator_inline GpuComplex operator-(const GpuComplex &lhs, const GpuComplex &rhs)
accelerator_inline GpuComplex & operator*=(const GpuComplex &r)
friend accelerator_inline GpuComplex real_mult(const GpuComplex &l, const GpuComplex &r)
accelerator_inline GpuComplex(Real re, Real im)
accelerator_inline half float2half(float f)
constexpr int NSIMD_ComplexF
Optimization::Reduce< S, T > ReduceSIMD
Optimization::MaddRealPart MaddRealPartSIMD
struct Grid::Half2_t Half2
Optimization::Div DivSIMD
accelerator GpuVector< N, datum > operator/(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuComplexVector< NSIMD_ComplexF, float > GpuVectorCF
GpuComplex< double2 > GpuComplexD
GpuComplex< Half2 > GpuComplexH
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r)
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
constexpr int NSIMD_Integer
constexpr int NSIMD_ComplexH
GpuComplex< float2 > GpuComplexF
GpuVector< NSIMD_RealF, float > GpuVectorRF
Optimization::MultRealPart MultRealPartSIMD
Optimization::TimesI TimesISIMD
constexpr int NSIMD_ComplexD
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
accelerator_inline GpuComplexF timesI(const GpuComplexF &r)
GpuComplexVector< NSIMD_ComplexH, half > GpuVectorCH
accelerator_inline float half2float(half h)
constexpr int NSIMD_RealH
Optimization::Vset VsetSIMD
GpuVector< NSIMD_RealD, double > GpuVectorRD
Optimization::Vstore VstoreSIMD
GpuVector< NSIMD_RealH, half > GpuVectorRH
constexpr int NSIMD_RealF
Optimization::Sub SubSIMD
Optimization::TimesMinusI TimesMinusISIMD
GpuVector< NSIMD_Integer, Integer > GpuVectorI
accelerator GpuVector< N, datum > operator*(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
accelerator GpuVector< N, datum > operator+(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
Optimization::Vstream VstreamSIMD
accelerator GpuVector< N, datum > operator-(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
constexpr int NSIMD_RealD
GpuComplexVector< NSIMD_ComplexD, double > GpuVectorCD
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
static accelerator_inline void ExchangeN(GpuVector< _N, _datum > &out1, GpuVector< _N, _datum > &out2, GpuVector< _N, _datum > &in1, GpuVector< _N, _datum > &in2)
static accelerator_inline void Exchange1(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange3(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void ExchangeN(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange0(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange2(vec &out1, vec &out2, vec &in1, vec &in2)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b, GpuVectorCF c)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c)
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline vec Permute0(vec in)
static accelerator_inline vec Permute1(vec in)
static accelerator_inline vec PermuteN(vec in)
static accelerator_inline vec Permute2(vec in)
static accelerator_inline GpuVector< _N, _datum > PermuteN(GpuVector< _N, _datum > &in)
static accelerator_inline vec Permute3(vec in)
static accelerator_inline GpuVectorRH DtoH(GpuVectorRD a, GpuVectorRD b, GpuVectorRD c, GpuVectorRD d)
static accelerator_inline GpuVectorCH DtoH(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c, GpuVectorCD d)
static accelerator_inline void HtoS(GpuVectorCH h, GpuVectorCF &sa, GpuVectorCF &sb)
static accelerator_inline GpuVectorRF DtoS(GpuVectorRD a, GpuVectorRD b)
static accelerator_inline void HtoD(GpuVectorRH h, GpuVectorRD &a, GpuVectorRD &b, GpuVectorRD &c, GpuVectorRD &d)
static accelerator_inline void HtoS(GpuVectorRH h, GpuVectorRF &sa, GpuVectorRF &sb)
static accelerator_inline GpuVectorCF DtoS(GpuVectorCD a, GpuVectorCD b)
static accelerator_inline void HtoD(GpuVectorCH h, GpuVectorCD &a, GpuVectorCD &b, GpuVectorCD &c, GpuVectorCD &d)
static accelerator_inline void StoD(GpuVectorRF h, GpuVectorRD &sa, GpuVectorRD &sb)
static accelerator_inline GpuVectorRH StoH(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuVectorCH StoH(GpuVectorCF a, GpuVectorCF b)
static accelerator_inline void StoD(GpuVectorCF h, GpuVectorCD &sa, GpuVectorCD &sb)
accelerator_inline Out_type operator()(In_type in)
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n)
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n)
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n)
static accelerator_inline GpuComplexVector< _N, _datum > rotate_template(GpuComplexVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n)
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n)
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n)
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n)
static accelerator_inline vec rotate_template(vec in, int n)
static accelerator_inline vec tRotate(vec in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a)
accelerator_inline GpuVectorRD operator()(double *a)
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a)
accelerator_inline GpuVectorRF operator()(float *a)
accelerator_inline GpuVectorI operator()(Integer *a)
accelerator_inline GpuVectorCF operator()(float a, float b)
accelerator_inline GpuVectorI operator()(Integer a)
accelerator_inline GpuVectorRD operator()(double a)
accelerator_inline GpuVectorCD operator()(double a, double b)
accelerator_inline GpuVectorRF operator()(float a)
accelerator_inline void operator()(GpuVector< N, datum > a, P *Fp)
accelerator_inline void operator()(P *F, GpuVector< N, datum > a)