39#include <hip/hip_fp16.h>
41#if !defined(GRID_HIP) && !defined(GRID_CUDA)
43 typedef struct { uint16_t
x;}
half;
50#if defined(GRID_CUDA) || defined(GRID_HIP)
62#if defined(GRID_CUDA) || defined(GRID_HIP)
73#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
80template<
int _N,
class _datum>
83 static const int N = _N;
86template<
int N,
class datum>
89 for(
int i=0;i<N;i++) {
94template<
int N,
class datum>
97 for(
int i=0;i<N;i++) {
102template<
int N,
class datum>
105 for(
int i=0;i<N;i++) {
110template<
int N,
class datum>
113 for(
int i=0;i<N;i++) {
122template<
int _N,
class _datum>
126 static const int N = _N;
129template<
int N,
class datum>
132 for(
int i=0;i<N;i++) {
138template<
int N,
class datum>
141 for(
int i=0;i<N;i++) {
147template<
int N,
class datum>
150 for(
int i=0;i<N;i++) {
156template<
int N,
class datum>
159 for(
int i=0;i<N;i++) {
234 template<
int N,
class datum,
class P>
239 template<
int N,
class datum,
class P>
247 template<
int N,
class datum,
class P>
252 template<
int N,
class datum,
class P>
264 for(
int i=0;i<vec::N;i++){
265 ret.rrrr[i] = vec::datum(a[i].
real());
266 ret.iiii[i] = vec::datum(a[i].
imag());
274 for(
int i=0;i<vec::N;i++){
275 ret.rrrr[i] = vec::datum(a[i].
real());
276 ret.iiii[i] = vec::datum(a[i].
imag());
284 for(
int i=0;i<vec::N;i++){
285 ret.rrrr[i] = vec::datum(a[i]);
293 for(
int i=0;i<vec::N;i++){
294 ret.rrrr[i] = vec::datum(a[i]);
302 for(
int i=0;i<vec::N;i++){
303 ret.rrrr[i] = vec::datum(a[i]);
309 template <
typename Out_type,
typename In_type>
314 printf(
"Error, using wrong Reduce function\n");
364 for(
int i=0;i<vec::N;i++){
373 for(
int i=0;i<vec::N;i++){
385 for(
int i=0;i<vec::N;i++){
394 for(
int i=0;i<vec::N;i++){
460 for(
int i=0;i<vec::N;i++){
461 ret.rrrr[i] = in.
rrrr[i];
462 ret.iiii[i] =-in.
iiii[i];
469 for(
int i=0;i<vec::N;i++){
470 ret.rrrr[i] = in.
rrrr[i];
471 ret.iiii[i] =-in.
iiii[i];
482 for(
int i=0;i<vec::N;i++){
483 ret.rrrr[i] = in.
iiii[i];
484 ret.iiii[i] =-in.
rrrr[i];
491 for(
int i=0;i<vec::N;i++){
492 ret.rrrr[i] = in.
iiii[i];
493 ret.iiii[i] =-in.
rrrr[i];
504 for(
int i=0;i<vec::N;i++){
505 ret.rrrr[i] =-in.
iiii[i];
506 ret.iiii[i] = in.
rrrr[i];
513 for(
int i=0;i<vec::N;i++){
514 ret.rrrr[i] =-in.
iiii[i];
515 ret.iiii[i] = in.
rrrr[i];
523 template <
int n,
int _N,
class _datum >
527 unsigned int _mask = vec::N >> (n + 1);
528 for(
int i=0;i<vec::N;i++) {
529 out.rrrr[i] = in.
rrrr[i^_mask];
533 template <
int n,
int _N,
class _datum >
537 unsigned int _mask = vec::N >> (n + 1);
538 for(
int i=0;i<vec::N;i++) {
539 out.rrrr[i] = in.
rrrr[i^_mask];
540 out.iiii[i] = in.
iiii[i^_mask];
560 for(
int i=0;i<N;i++) {
570 for(
int i=0;i<N;i++) {
580 for(
int i=0;i<N;i++) {
588 for(
int i=0;i<N;i++) {
600 for(
int i=0;i<N;i++) {
611 for(
int i=0;i<N;i++) {
622 for(
int i=0;i<N;i++) {
631 for(
int i=0;i<N;i++) {
668 template <
int n,
int _N,
class _datum >
675 unsigned int mask = vec::N >> (n + 1);
676 for(
int i=0;i<vec::N;i++) {
678 if ( (i&mask) == 0 ) { out1.
rrrr[i]=in1.
rrrr[j1];}
681 if ( (i&mask) == 0 ) { out2.
rrrr[i]=in1.
rrrr[j2];}
685 template <
int n,
int _N,
class _datum >
692 unsigned int mask = vec::N >> (n + 1);
693 for(
int i=0;i<vec::N;i++) {
695 if ( (i&mask) == 0 ) {
704 if ( (i&mask) == 0 ) {
714 template <
typename vec>
718 template <
typename vec>
722 template <
typename vec>
726 template <
typename vec>
739 template <
int _N,
class _datum >
744 for(
int i=0;i<vec::N;i++){
745 out.rrrr[i] = in.
rrrr[(i + n)%vec::N];
746 out.iiii[i] = in.
iiii[(i + n)%vec::N];
751 template <
int _N,
class _datum >
756 for(
int i=0;i<vec::N;i++){
757 out.rrrr[i] = in.
rrrr[(i + n)%vec::N];
789 Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]);
791 greduce = greduce+Grid::ComplexF(in.rrrr[i],in.iiii[i]);
800 Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]);
802 greduce = greduce+ Grid::ComplexD(in.rrrr[i],in.iiii[i]);
812 RealF ret = in.rrrr[0];
814 ret = ret+in.rrrr[i];
823 RealD ret = in.rrrr[0];
825 ret = ret+in.rrrr[i];
836 ret = ret+in.rrrr[i];
#define accelerator_inline
#define COALESCE_GRANULARITY
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
static INTERNAL_PRECISION F
accelerator_inline half float2half(float f)
constexpr int NSIMD_ComplexF
Optimization::Reduce< S, T > ReduceSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::Div DivSIMD
accelerator GpuVector< N, datum > operator/(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuComplexVector< NSIMD_ComplexF, float > GpuVectorCF
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
constexpr int NSIMD_Integer
constexpr int NSIMD_ComplexH
GpuVector< NSIMD_RealF, float > GpuVectorRF
Optimization::MultRealPart MultRealPartSIMD
Optimization::TimesI TimesISIMD
constexpr int NSIMD_ComplexD
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
GpuComplexVector< NSIMD_ComplexH, half > GpuVectorCH
accelerator_inline float half2float(half h)
constexpr int NSIMD_RealH
Optimization::Vset VsetSIMD
GpuVector< NSIMD_RealD, double > GpuVectorRD
Optimization::Vstore VstoreSIMD
GpuVector< NSIMD_RealH, half > GpuVectorRH
constexpr int NSIMD_RealF
Optimization::Sub SubSIMD
Optimization::TimesMinusI TimesMinusISIMD
GpuVector< NSIMD_Integer, Integer > GpuVectorI
accelerator GpuVector< N, datum > operator*(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
accelerator GpuVector< N, datum > operator+(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
Optimization::Vstream VstreamSIMD
accelerator GpuVector< N, datum > operator-(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
constexpr int NSIMD_RealD
GpuComplexVector< NSIMD_ComplexD, double > GpuVectorCD
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
static accelerator_inline void ExchangeN(GpuVector< _N, _datum > &out1, GpuVector< _N, _datum > &out2, GpuVector< _N, _datum > &in1, GpuVector< _N, _datum > &in2)
static accelerator_inline void Exchange1(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange3(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange0(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange2(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void ExchangeN(GpuComplexVector< _N, _datum > &out1, GpuComplexVector< _N, _datum > &out2, GpuComplexVector< _N, _datum > &in1, GpuComplexVector< _N, _datum > &in2)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b, GpuVectorCF c)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c)
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuComplexVector< _N, _datum > PermuteN(GpuComplexVector< _N, _datum > &in)
static accelerator_inline vec Permute0(vec in)
static accelerator_inline vec Permute1(vec in)
static accelerator_inline vec Permute2(vec in)
static accelerator_inline GpuVector< _N, _datum > PermuteN(GpuVector< _N, _datum > &in)
static accelerator_inline vec Permute3(vec in)
static accelerator_inline GpuVectorRH DtoH(GpuVectorRD a, GpuVectorRD b, GpuVectorRD c, GpuVectorRD d)
static accelerator_inline GpuVectorCH DtoH(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c, GpuVectorCD d)
static accelerator_inline void HtoS(GpuVectorCH h, GpuVectorCF &sa, GpuVectorCF &sb)
static accelerator_inline GpuVectorRF DtoS(GpuVectorRD a, GpuVectorRD b)
static accelerator_inline void HtoD(GpuVectorRH h, GpuVectorRD &a, GpuVectorRD &b, GpuVectorRD &c, GpuVectorRD &d)
static accelerator_inline void HtoS(GpuVectorRH h, GpuVectorRF &sa, GpuVectorRF &sb)
static accelerator_inline GpuVectorCF DtoS(GpuVectorCD a, GpuVectorCD b)
static accelerator_inline void HtoD(GpuVectorCH h, GpuVectorCD &a, GpuVectorCD &b, GpuVectorCD &c, GpuVectorCD &d)
static accelerator_inline void StoD(GpuVectorRF h, GpuVectorRD &sa, GpuVectorRD &sb)
static accelerator_inline GpuVectorRH StoH(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuVectorCH StoH(GpuVectorCF a, GpuVectorCF b)
static accelerator_inline void StoD(GpuVectorCF h, GpuVectorCD &sa, GpuVectorCD &sb)
accelerator_inline Out_type operator()(In_type in)
static accelerator_inline GpuVector< _N, _datum > rotate_template(GpuVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n)
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n)
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n)
static accelerator_inline GpuComplexVector< _N, _datum > rotate_template(GpuComplexVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n)
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n)
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n)
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n)
static accelerator_inline vec tRotate(vec in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a)
accelerator_inline GpuVectorRD operator()(double *a)
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a)
accelerator_inline GpuVectorRF operator()(float *a)
accelerator_inline GpuVectorI operator()(Integer *a)
accelerator_inline GpuVectorCF operator()(float a, float b)
accelerator_inline GpuVectorI operator()(Integer a)
accelerator_inline GpuVectorRD operator()(double a)
accelerator_inline GpuVectorCD operator()(double a, double b)
accelerator_inline GpuVectorRF operator()(float a)
accelerator_inline void operator()(GpuComplexVector< N, datum > a, P *Fp)
accelerator_inline void operator()(GpuVector< N, datum > a, P *Fp)
accelerator_inline void operator()(P *F, GpuVector< N, datum > a)
accelerator_inline void operator()(P *F, GpuComplexVector< N, datum > a)