35static_assert(
GEN_SIMD_WIDTH % 64u == 0,
"A64FX SIMD vector size is 64 bytes");
41 template <
typename T>
struct W;
42 template <>
struct W<double> {
46 template <>
struct W<float> {
53 template <>
struct W<uint16_t> {
57 template <>
struct W<uint64_t> {
74 vec(
const vec &rhs) { this->operator=(rhs); }
77 svst1(svptrue_b8(), (T*)
this, svld1(svptrue_b8(), (T*)rhs.v));
80 inline vec &operator=(
const vec &rhs) {
82 svst1(svptrue_b8(), (T*)
this, svld1(svptrue_b8(), (T*)rhs.
v));
113 typedef svfloat64_t
vt;
114 typedef svfloat64x2_t
vt2;
115 typedef svfloat64x4_t
vt4;
116 typedef float64_t
pt;
120 static inline svbool_t
pg1(){
return svptrue_b64();}
121 static inline svbool_t
pg2(){
return svptrue_pat_b64(SV_VL4);}
122 static inline svbool_t
pg4(){
return svptrue_pat_b64(SV_VL2);}
153 static inline svbool_t
pg_even(){
return svzip1_b64(svptrue_b64(), svpfalse_b());}
154 static inline svbool_t
pg_odd() {
return svzip1_b64(svpfalse_b(), svptrue_b64());}
155 static inline svfloat64_t
zero(){
return svdup_f64(0.);}
160 typedef svfloat32_t
vt;
161 typedef svfloat32x2_t
vt2;
162 typedef float32_t
pt;
166 static inline svbool_t
pg1(){
return svptrue_b32();}
167 static inline svbool_t
pg2(){
return svptrue_pat_b32(SV_VL8);}
171 const vec_imm<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
176 const vec_imm<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
181 const vec_imm<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
186 const vec_imm<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
191 const vec_imm<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
196 const vec_imm<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
201 const vec_imm<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
204 static inline svbool_t
pg_even(){
return svzip1_b32(svptrue_b32(), svpfalse_b());}
205 static inline svbool_t
pg_odd() {
return svzip1_b32(svpfalse_b(), svptrue_b32());}
206 static inline svfloat32_t
zero(){
return svdup_f32(0.);}
211 typedef svfloat16_t
vt;
212 typedef float16_t
pt;
216 static inline svbool_t
pg1(){
return svptrue_b16();}
217 static inline svbool_t
pg2(){
return svptrue_pat_b16(SV_VL16);}
218 static inline svbool_t
pg_even(){
return svzip1_b16(svptrue_b16(), svpfalse_b());}
219 static inline svbool_t
pg_odd() {
return svzip1_b16(svpfalse_b(), svptrue_b16());}
220 static inline svfloat16_t
zero(){
return svdup_f16(0.);}
225 typedef svuint32_t
vt;
232 static inline svbool_t
pg1(){
return svptrue_b32();}
233 static inline svbool_t
pg2(){
return svptrue_pat_b32(SV_VL8);}
234 static inline svbool_t
pg_even(){
return svzip1_b32(svptrue_b32(), svpfalse_b());}
235 static inline svbool_t
pg_odd() {
return svzip1_b32(svpfalse_b(), svptrue_b32());}
248 svst1(pg1, out.
v, r_v);
257 svst1(pg1, out.
v, r_v);
268 svst1(pg1, out.
v, r_v);
277 svst1(pg1, out.
v, r_v);
287 svst1(pg1, out.
v, r_v);
294 template <
typename T>
304 template <
typename T>
308 svstnt1(pg1, a, b_v);
315 template <
typename T>
320 svst1(pg1, out.
v, a_v);
326 template <
typename T>
331 svst1(pg1, out.
v, a_v);
342 template <
typename T>
349 svst1(pg1, out.
v, r_v);
356 template <
typename T>
363 svst1(pg1, out.
v, r_v);
370 template <
typename T>
377 typename acle<T>::vt r_v = svmla_x(pg1, c_v, a_v, b_v);
378 svst1(pg1, out.
v, r_v);
382 template <
typename T>
389 svst1(pg1, out.
v, r_v);
396 template <
typename T>
405 typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
407 svst1(pg1, out.
v, r_v);
414 template <
typename T>
423 typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
425 svst1(pg1, out.
v, r_v);
433 template <
typename T>
442 typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
443 r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
445 svst1(pg1, out.
v, r_v);
453 template <
typename T>
462 typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
463 r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
464 svst1(pg1, out.
v, r_v);
472 template <
typename T>
479 svst1(pg1, out.
v, r_v);
487 template <
typename T>
494 typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
495 svst1(pg1, out.
v, r_v);
503 template <
typename T>
512 a_v = svtbl(a_v, tbl_swap_v);
513 typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
514 svst1(pg1, out.
v, r_v);
522 template <
typename T>
531 a_v = svtbl(a_v, tbl_swap_v);
533 typename acle<T>::vt r_v = svneg_m(a_v, pg_even, a_v);
534 svst1(pg1, out.
v, r_v);
562 svst1(pg1s, sa.
v, sa_v);
563 svst1(pg1s, sb.
v, sb_v);
574 svst1(pg1s, ret.
v, r_v);
586 svst1(pg1d, a.
v, a_v);
587 svst1(pg1d, b.
v, b_v);
628 svst1(pg1d, a.
v, a_v);
629 svst1(pg1d, b.
v, b_v);
630 svst1(pg1d, c.
v, c_v);
631 svst1(pg1d, d.
v, d_v);
644 template <
typename T>
650 r1_v = svext(r1_v, a2_v, (uint64_t)
W<T>::c);
652 r2_v = svext(a1_v, r2_v, (uint64_t)
W<T>::c);
653 svst1(pg1, out1.
v, r1_v);
654 svst1(pg1, out2.
v, r2_v);
657 template <
typename T>
673 typename acle<T>::vt a1_v = svtbl(in1_v, tbl_exch1a_v);
674 typename acle<T>::vt a2_v = svtbl(in2_v, tbl_exch1b_v);
677 typename acle<T>::vt out1_v = svtbl(b1_v, tbl_exch1c_v);
678 typename acle<T>::vt out2_v = svtbl(b2_v, tbl_exch1a_v);
680 svst1(pg1, out1.
v, out1_v);
681 svst1(pg1, out2.
v, out2_v);
684 template <
typename T>
701 svst1(pg1, out1.
v, r1_v);
702 svst1(pg1, out2.
v, r2_v);
714 template <
typename T>
720 svst1(pg1, out.
v, r_v);
732 svst1(pg1, out.
v, r_v);
744 svst1(pg1, out.
v, r_v);
756 svst1(pg1, out.
v, r_v);
768 svst1(pg1, out.
v, r_v);
780 svst1(pg1, out.
v, r_v);
798 svst1(pg1, out.
v, r_v);
803 template <
typename T>
837template <
typename Out_type,
typename In_type>
842 printf(
"Error, using wrong Reduce function\n");
855 float a =
svred(pg_even, a_v);
856 float b =
svred(pg_odd, a_v);
858 return Grid::ComplexF(a, b);
867 float a =
svred(pg1, a_v);
879 double a =
svred(pg_even, a_v);
880 double b =
svred(pg_odd, a_v);
882 return Grid::ComplexD(a, b);
890 double a =
svred(pg1, a_v);
927template <
typename S,
typename T>
using ReduceSIMD = Optimization::Reduce<S,T>;
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
void prefetch_HINT_T0(const char *ptr)
Optimization::Reduce< S, T > ReduceSIMD
void v_prefetch0(int size, const char *ptr)
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define NAMESPACE_BEGIN(A)
vec< T > operator()(vec< T > a)
vec< T > operator()(vec< T > a, vec< T > b)
static void Exchange1(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange0(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2)
static void Exchange2(vec< T > &out1, vec< T > &out2, const vec< T > &in1, const vec< T > &in2)
static void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
vec< T > operator()(vec< T > a, vec< T > b)
static vec< T > Permute0(vec< T > in)
static vecd Permute3(vecd in)
static vecd Permute1(vecd in)
static vecf Permute3(vecf in)
static vecd Permute2(vecd in)
static vecf Permute1(vecf in)
static vecf Permute2(vecf in)
static vech StoH(const vecf &sa, const vecf &sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static void HtoD(vech h, vecd &a, vecd &b, vecd &c, vecd &d)
static vech DtoH(vecd a, vecd b, vecd c, vecd d)
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static vec< T > rotate(vec< T > in, int n)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a, vec< T > b)
vec< T > operator()(vec< T > a)
vec< T > operator()(vec< T > a)
vec< T > operator()(T *a)
vec< T > operator()(std::complex< T > *a)
vecf operator()(float a, float b)
vecd operator()(double a)
vecd operator()(double a, double b)
vec< Integer > operator()(Integer a)
void operator()(vec< T > a, T *D)
void operator()(T *a, vec< T > b)
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int c
static constexpr unsigned int r
static constexpr unsigned int r
static constexpr unsigned int c
static svbool_t pg_even()
static vec< uint64_t > tbl_exch1a()
static vec< uint64_t > tbl1()
static vec< uint64_t > tbl_exch1b()
static svfloat64_t zero()
static svbool_t pg_even()
static vec< uint64_t > tbl_swap()
static vec< uint64_t > tbl0()
static vec< uint64_t > tbl_exch1c()
static vec< uint32_t > tbl2()
static vec< uint32_t > tbl_exch1b()
static vec< uint32_t > tbl_exch1a()
static vec< uint32_t > tbl1()
static vec< uint32_t > tbl_exch1c()
static svbool_t pg_even()
static vec< uint32_t > tbl_swap()
static svfloat32_t zero()
static vec< uint32_t > tbl0()
static svbool_t pg_even()
static svfloat16_t zero()