39#ifndef GRID_VECTOR_TYPES
40#define GRID_VECTOR_TYPES
53 const FP32 magic = { 113 << 23 };
54 const unsigned int shifted_exp = 0x7c00 << 13;
56 o.
u = (h.
x & 0x7fff) << 13;
57 unsigned int exp = shifted_exp & o.
u;
58 o.
u += (127 - 15) << 23;
60 if (
exp == shifted_exp) {
61 o.
u += (128 - 16) << 23;
62 }
else if (
exp == 0) {
66 o.
u |= (h.
x & 0x8000) << 16;
71 const FP32 f32infty = { 255 << 23 };
72 const FP32 f16max = { (127 + 16) << 23 };
73 const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
74 unsigned int sign_mask = 0x80000000u;
77 o.
x =
static_cast<unsigned short>(0x0u);
78 unsigned int sign = f.
u & sign_mask;
84 if (f.
u >= f16max.
u) {
85 o.
x = (f.
u > f32infty.
u) ? 0x7e00 : 0x7c00;
87 if (f.
u < (113 << 23)) {
91 f.
f += denorm_magic.
f;
93 o.
x =
static_cast<unsigned short>(f.
u - denorm_magic.
u);
95 unsigned int mant_odd = (f.
u >> 13) & 1;
98 f.
u += ((
unsigned int)(15 - 127) << 23) + 0xfff;
102 o.
x =
static_cast<unsigned short>(f.
u >> 13);
105 o.
x |=
static_cast<unsigned short>(sign >> 16);
119 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
122 #pragma message("building A64FX / SVE ACLE VLA")
123 #if defined(ARMCLANGCOMPAT)
124 #pragma message("applying data types patch")
128 #if defined(A64FXFIXEDSIZE)
129 #pragma message("building for A64FX / SVE ACLE fixed size")
139 #ifdef __ARM_FEATURE_SVE_BITS
143 #pragma message("building A64FX SVE VLA")
144 #if defined(ARMCLANGCOMPAT)
145 #pragma message("applying data types patch")
154#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
185#include <type_traits>
/// Shorthand for the nested `type` member of a metafunction `Meta`,
/// i.e. `Invoke<Meta>` names `typename Meta::type`.
template <class Meta>
using Invoke = typename Meta::type;
/// Primary template of the `is_complex` trait: evaluates to false for every
/// type that is not matched by a specialization declared elsewhere.
template <class Candidate>
struct is_complex : std::false_type {};
/// Primary template of the `is_ComplexD` trait: evaluates to false for every
/// type that is not matched by a specialization declared elsewhere.
template <class Candidate>
struct is_ComplexD : std::false_type {};
/// Primary template of the `is_ComplexF` trait: evaluates to false for every
/// type that is not matched by a specialization declared elsewhere.
template <class Candidate>
struct is_ComplexF : std::false_type {};
/// `is_real<T>` is true exactly when `std::is_floating_point<T>` holds.
///
/// The second template parameter exists only to drive SFINAE and defaults to
/// `void`; the primary template (false) is chosen whenever the partial
/// specialization below does not match.
template <class T, class = void>
struct is_real : std::false_type {};

/// Selected for floating-point `T`: `std::enable_if<...>::type` then names
/// `void`, which matches the defaulted second parameter.
template <class T>
struct is_real<T, typename std::enable_if<std::is_floating_point<T>::value>::type>
    : std::true_type {};
/// `is_integer<T>` is true exactly when `std::is_integral<T>` holds.
///
/// The second template parameter exists only to drive SFINAE and defaults to
/// `void`; the primary template (false) is chosen whenever the partial
/// specialization below does not match.
template <class T, class = void>
struct is_integer : std::false_type {};

/// Selected for integral `T`: `std::enable_if<...>::type` then names `void`,
/// which matches the defaulted second parameter.
template <class T>
struct is_integer<T, typename std::enable_if<std::is_integral<T>::value>::type>
    : std::true_type {};
230template <
class Out,
class Input1,
class Input2,
class Input3,
class Operation>
232 return op(src_1, src_2, src_3);
234template <
class Out,
class Input1,
class Input2,
class Operation>
236 return op(src_1, src_2);
238template <
class Out,
class Input,
class Operation>
247template <
class Scalar_type,
class Vector_type>
265 static_assert( (
sizeof(Vector_type) /
sizeof(Scalar_type) >= 1),
" size mismatch " );
266 return sizeof(Vector_type) /
sizeof(Scalar_type);
269 #ifdef ARMCLANGCOMPAT
270 template <
class S = Scalar_type>
273 svst1(svptrue_b8(), (Scalar_type*)
this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
277 template <
class S = Scalar_type>
280 svst1(svptrue_b8(), (Scalar_type*)
this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
301 template <
class S = Scalar_type>
304 svst1(svptrue_b32(), (
float*)
this, svld1(svptrue_b32(), (
float*)&(rhs.v)));
308 template <
class S = Scalar_type>
311 svst1(svptrue_b32(), (
float*)
this, svld1(svptrue_b32(), (
float*)&(rhs.v)));
316 template <
class S = Scalar_type>
319 svst1(svptrue_b64(), (
double*)
this, svld1(svptrue_b64(), (
double*)&(rhs.v)));
323 template <
class S = Scalar_type>
326 svst1(svptrue_b64(), (
double*)
this, svld1(svptrue_b64(), (
double*)&(rhs.v)));
345 #ifdef ARMCLANGCOMPAT
346 template <
class S = Scalar_type>
348 template <
class S = Scalar_type>
350 template <
class S = Scalar_type>
352 template <
class S = Scalar_type>
381 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
385 *y = fxmac((*a), (*x), (*y));
391 *y = (*a) * (*x) + (*y);
412 const Scalar_type *__restrict__ a,
414 *y = (*a) * (*x) + (*y);
417 const Scalar_type *__restrict__ l,
422 const Scalar_type *__restrict__ l,
427 const Scalar_type *__restrict__ l,
434 const Scalar_type *__restrict__ x) {
435 *y = (*a) * (*x) + (*y);
439 const Scalar_type *__restrict__ r) {
444 const Scalar_type *__restrict__ r) {
449 const Scalar_type *__restrict__ r) {
538 template <
class functor>
543 for (
int i = 0; i <
Nsimd(); i++) {
550 template <
class functor>
557 for (
int i = 0; i <
Nsimd(); i++) {
572 Optimization::Exchange::Exchange3(out1.
v,out2.
v,in1.
v,in2.
v);
574 Optimization::Exchange::Exchange2(out1.
v,out2.
v,in1.
v,in2.
v);
576 Optimization::Exchange::Exchange1(out1.
v,out2.
v,in1.
v,in2.
v);
578 Optimization::Exchange::Exchange0(out1.
v,out2.
v,in1.
v,in2.
v);
582 Optimization::Exchange::Exchange0(out1.
v,out2.
v,in1.
v,in2.
v);
585 Optimization::Exchange::Exchange1(out1.
v,out2.
v,in1.
v,in2.
v);
588 Optimization::Exchange::Exchange2(out1.
v,out2.
v,in1.
v,in2.
v);
591 Optimization::Exchange::Exchange3(out1.
v,out2.
v,in1.
v,in2.
v);
599 y.
v = Optimization::Permute::Permute0(b.
v);
602 y.
v = Optimization::Permute::Permute1(b.
v);
605 y.
v = Optimization::Permute::Permute2(b.
v);
608 y.
v = Optimization::Permute::Permute3(b.
v);
612 int dist =
perm & 0xF;
626 template <
class S = Scalar_type,IfComplex<S> = 0>
628 return Scalar_type(
v.rrrr[lane],
v.iiii[lane]);
630 template <
class S = Scalar_type,IfComplex<S> = 0>
632 v.rrrr[lane] =
real(_S);
633 v.iiii[lane] =
imag(_S);
635 template <
class S = Scalar_type,IfNotComplex<S> = 0>
637 return ((S*)&
v)[lane];
639 template <
class S = Scalar_type,IfNotComplex<S> = 0>
641 ((Scalar_type*)&
v)[lane] = _S;
645 return ((Scalar_type*)&
v)[lane];
648 ((Scalar_type*)&
v)[lane] = S;
664#if defined(GPU_VEC) || defined(GPU_RRII)
696template <
class S,
class V, IfNotComplex<S> = 0>
700 ret.
v = Optimization::Rotate::rotate(b.
v, nrot);
703template <
class S,
class V, IfComplex<S> = 0>
707 ret.
v = Optimization::Rotate::rotate(b.
v, 2 * nrot);
710template <
class S,
class V, IfNotComplex<S> =0>
714 ret.
v = Optimization::Rotate::rotate(b.
v,nrot);
716template <
class S,
class V, IfComplex<S> =0>
720 ret.
v = Optimization::Rotate::rotate(b.
v,2*nrot);
723template <
class S,
class V>
725 S* typepun =(S*) &src;
726 vsplat(ret,typepun[lane]);
728template <
class S,
class V, IfComplex<S> =0>
730 S* typepun =(S*) &src;
741template <
class S,
class V, IfComplex<S> = 0,
class ABtype>
747template <
class S,
class V>
751template <
class S,
class V>
758template <
class S,
class V>
769template <
class S,
class V, IfComplex<S> = 0>
773template <
class S,
class V, IfComplex<S> = 0>
777template <
class S,
class V, IfComplex<S> = 0>
782template <
class S,
class V, IfComplex<S> = 0>
784 vsplat(ret, S(1.0, -1.0));
786template <
class S,
class V, IfComplex<S> = 0>
788 vsplat(ret, S(-1.0, 1.0));
792template <
class S,
class V, IfReal<S> = 0>
796template <
class S,
class V, IfReal<S> = 0>
802template <
class S,
class V, IfInteger<S> = 0>
806template <
class S,
class V, IfInteger<S> = 0>
810template <
class S,
class V, IfInteger<S> = 0>
814template <
class S,
class V, IfInteger<S> = 0>
818template <
class S,
class V>
826template <
class S,
class V, IfReal<S> = 0>
830template <
class S,
class V, IfComplex<S> = 0>
832 typedef typename S::value_type T;
835template <
class S,
class V, IfInteger<S> = 0>
843template <
class S,
class V>
850template <
class S,
class V>
858template <
class S,
class V, IfComplex<S> = 0>
864template <
class S,
class V, IfComplex<S> = 0>
873template <
class S,
class V, IfComplex<S> = 0>
881template <
class S,
class V, IfNotComplex<S> = 0>
890#if defined(A64FX) || defined(A64FXFIXEDSIZE)
891template <
class S,
class V, IfComplex<S> = 0>
899template <
class S,
class V, IfNotComplex<S> = 0>
912template <
class S,
class V, IfComplex<S> = 0>
918template <
class S,
class V, IfNotComplex<S> = 0>
923template <
class S,
class V, IfNotInteger<S> = 0>
931template <
class S,
class V, IfComplex<S> = 0>
935template <
class S,
class V, IfComplex<S> = 0>
941template <
class S,
class V, IfNotComplex<S> = 0>
948template <
class S,
class V, IfComplex<S> = 0>
952template <
class S,
class V, IfComplex<S> = 0>
958template <
class S,
class V, IfNotComplex<S> = 0>
965template <
class S,
class V, IfComplex<S> = 0>
976 auto real_den =
toReal(den);
978 memcpy((
void *)&zden.v,(
void *)&real_den.v,
sizeof(zden));
984template <
class S,
class V, IfNotComplex<S> = 0>
995template <
class S,
class V>
999template <
class S,
class V>
1004template <
class S,
class V>
1019template <
class Csimd>
1024 for (
int i = 0; i < Rsimd::Nsimd(); i += 2) {
1025 auto s =
real(in.getlane(j++));
1037template <
class Rsimd>
1044 for (
int i = 0; i < Rsimd::Nsimd(); i += 2) {
1045 auto rr = in.getlane(i);
1046 auto ri = in.getlane(i+1);
1060 assert((nvec&0x1)==0);
1061 for(
int m=0;m*2<nvec;m++){
1063 out[m].
v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
1068 assert((nvec&0x3)==0);
1069 for(
int m=0;m*4<nvec;m++){
1071 out[m].
v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
1076 assert((nvec&0x1)==0);
1077 for(
int m=0;m*2<nvec;m++){
1079 out[m].
v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
1084 assert((nvec&0x1)==0);
1085 for(
int m=0;m*2<nvec;m++){
1087 Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
1100 assert((nvec&0x3)==0);
1101 for(
int m=0;m*4<nvec;m++){
1103 Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
1108 assert((nvec&0x1)==0);
1109 for(
int m=0;m*2<nvec;m++){
1111 Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
1127#ifndef GENERIC_SCALAR
1136 if (
perm & 0x1 ) {
permute(inout,tmp,0); tmp=inout;}
1137 if (
perm & 0x2 ) {
permute(inout,tmp,1); tmp=inout;}
1138 if (
perm & 0x4 ) {
permute(inout,tmp,2); tmp=inout;}
1139 if (
perm & 0x8 ) {
permute(inout,tmp,3); tmp=inout;}
1145template<>
struct sycl::is_device_copyable<
Grid::
vComplexF> :
public std::true_type {};
1146template<>
struct sycl::is_device_copyable<
Grid::
vComplexD> :
public std::true_type {};
1147template<>
struct sycl::is_device_copyable<
Grid::
vRealF > :
public std::true_type {};
1148template<>
struct sycl::is_device_copyable<
Grid::
vRealD > :
public std::true_type {};
1149template<>
struct sycl::is_device_copyable<
Grid::
vInteger > :
public std::true_type {};
#define accelerator_inline
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
void prefetch_HINT_T0(const char *ptr)
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
Optimization libraries for the SSE4 instruction set.
accelerator_inline Grid_simd< S, V > operator/(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline void timesMinusI(Grid_simd< S, V > &ret, const Grid_simd< S, V > &in)
accelerator_inline Grid_simd< S, V > real_mult(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline void vone(Grid_simd< S, V > &ret)
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
accelerator_inline Grid_simd< S, V > operator+(Grid_simd< S, V > a, Grid_simd< S, V > b)
Invoke< std::enable_if<!std::is_same< T1, T2 >::value, int > > IfNotSame
accelerator_inline void timesI(Grid_simd< S, V > &ret, const Grid_simd< S, V > &in)
Invoke< std::enable_if< is_complex< T >::value, int > > IfComplex
Grid_simd< complex< float >, SIMD_Ftype > vComplexF
accelerator_inline void vcomplex_i(Grid_simd< S, V > &ret)
Out accelerator_inline unary(Input src, Operation op)
Grid_simd< uint16_t, SIMD_Htype > vRealH
accelerator_inline void vfalse(Grid_simd< S, V > &ret)
Invoke< std::enable_if< std::is_same< T1, T2 >::value, int > > IfSame
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_simd< S, V > operator-(Grid_simd< S, V > a, Grid_simd< S, V > b)
Grid_simd< complex< uint16_t >, SIMD_Htype > vComplexH
accelerator_inline Grid_simd< S, V > outerProduct(const Grid_simd< S, V > &l, const Grid_simd< S, V > &r)
Grid_simd< float, SIMD_Ftype > vRealF
accelerator_inline void zeroit(Grid_simd< S, V > &z)
accelerator_inline void vstream(Grid_simd< S, V > &out, const Grid_simd< S, V > &in)
accelerator_inline toRealMapper< Csimd >::Realified toReal(const Csimd &in)
Invoke< std::enable_if<!is_integer< T >::value, int > > IfNotInteger
Out accelerator_inline trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op)
Invoke< std::enable_if<!is_real< T >::value, int > > IfNotReal
accelerator_inline Grid_half sfw_float_to_half(float ff)
accelerator_inline void rbroadcast(Grid_simd< S, V > &ret, const Grid_simd< S, V > &src, int lane)
accelerator_inline Grid_simd< S, V > adj(const Grid_simd< S, V > &in)
accelerator_inline Grid_simd< S, V > rotate(Grid_simd< S, V > b, int nrot)
Grid_simd< complex< double >, SIMD_Dtype > vComplexD
Invoke< std::enable_if< is_integer< T >::value, int > > IfInteger
accelerator_inline Grid_simd< S, V > operator*(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline Grid_simd< S, V > real_madd(Grid_simd< S, V > a, Grid_simd< S, V > b, Grid_simd< S, V > c)
accelerator_inline Grid_simd< S, V > trace(const Grid_simd< S, V > &arg)
accelerator_inline void vsplat(Grid_simd< S, V > &ret, ABtype a, ABtype b)
accelerator_inline void rsplat(Grid_simd< S, V > &ret, EnableIf< is_complex< S >, S > c)
accelerator_inline void visign(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > conjugate(const Grid_simd< S, V > &in)
Invoke< std::enable_if<!Condition::value, ReturnType > > NotEnableIf
accelerator_inline void precisionChange(vRealF *out, const vRealD *in, int nvec)
accelerator_inline void vzero(Grid_simd< S, V > &ret)
accelerator_inline void vrsign(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > innerProduct(const Grid_simd< S, V > &l, const Grid_simd< S, V > &r)
Invoke< std::enable_if< Condition::value, ReturnType > > EnableIf
void gpermute(vobj &inout, int perm)
accelerator_inline toComplexMapper< Rsimd >::Complexified toComplex(const Rsimd &in)
accelerator_inline void vbroadcast(Grid_simd< S, V > &ret, const Grid_simd< S, V > &src, int lane)
Invoke< std::enable_if< is_real< T >::value, int > > IfReal
Grid_simd< Integer, SIMD_Itype > vInteger
Grid_simd< double, SIMD_Dtype > vRealD
Out accelerator_inline binary(Input1 src_1, Input2 src_2, Operation op)
Invoke< std::enable_if<!is_complex< T >::value, int > > IfNotComplex
accelerator_inline void vtrue(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > exp(const Grid_simd< S, V > &r)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define NAMESPACE_BEGIN(A)
std::complex< T > complex
std::complex< RealF > ComplexF
std::complex< RealD > ComplexD
friend accelerator_inline void exchange3(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd & operator=(const Grid_simd &rhs)
friend accelerator_inline void vprefetch(const Grid_simd &v)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
accelerator_inline Scalar_type getlane(int lane) const
static accelerator_inline constexpr int Nsimd(void)
friend accelerator_inline void vset(Grid_simd &ret, Scalar_type *a)
accelerator Grid_simd()=default
friend accelerator_inline Grid_simd operator*(const Scalar_type &a, Grid_simd b)
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
accelerator_inline Grid_simd(const Grid_simd &&rhs)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x)
friend accelerator_inline Grid_simd operator/(const Scalar_type &a, Grid_simd b)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd *__restrict__ x)
accelerator_inline Grid_simd(const typename std::enable_if< is_complex< S >::value, S >::type a)
friend accelerator_inline void permute2(Grid_simd &y, Grid_simd b)
friend accelerator_inline Grid_simd operator/(Grid_simd b, const Scalar_type &a)
friend accelerator_inline void permute3(Grid_simd &y, Grid_simd b)
accelerator_inline Grid_simd & operator=(const Grid_simd &&rhs)
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
friend accelerator_inline void permute0(Grid_simd &y, Grid_simd b)
accelerator_inline void putlane(const Scalar_type &S, int lane)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
accelerator_inline Grid_simd & operator*=(const Grid_simd &r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void exchange1(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
friend accelerator_inline Grid_simd operator-(const Grid_simd &r)
friend accelerator_inline Scalar_type Reduce(const Grid_simd &in)
friend accelerator_inline Grid_simd operator*(Grid_simd b, const Scalar_type &a)
accelerator_inline Grid_simd(const Real a)
RealPart< Scalar_type >::type Real
friend accelerator_inline void exchange0(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd(const Grid_simd &rhs)
accelerator_inline Grid_simd & operator=(const Zero &z)
accelerator_inline Grid_simd & operator+=(const Grid_simd &r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void vstore(const Grid_simd &ret, Scalar_type *a)
friend accelerator_inline void exchange(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2, int n)
friend accelerator_inline void permute(Grid_simd &y, Grid_simd b, int perm)
friend accelerator_inline void exchange2(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd & operator-=(const Grid_simd &r)
friend accelerator_inline Grid_simd SimdApplyBinop(const functor &func, const Grid_simd &x, const Grid_simd &y)
friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Scalar_type *__restrict__ x)
friend accelerator_inline void permute1(Grid_simd &y, Grid_simd b)
accelerator Grid_half(uint16_t raw)