38#if __ARM_FEATURE_SVE_BITS==512
// Fixed-length aliases of the compiler's builtin SVE types. The
// arm_sve_vector_bits(512) attribute pins every alias to a 512-bit vector.

// Predicate (per-lane mask) type.
49typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512)));
// Vector of half-precision (float16) lanes.
50typedef __SVFloat16_t
vech __attribute__((arm_sve_vector_bits(512)));
// Vector of single-precision (float32) lanes.
51typedef __SVFloat32_t
vecf __attribute__((arm_sve_vector_bits(512)));
// Vector of double-precision (float64) lanes.
52typedef __SVFloat64_t
vecd __attribute__((arm_sve_vector_bits(512)));
// Vector of unsigned 32-bit integer lanes.
53typedef __SVUint32_t
veci __attribute__((arm_sve_vector_bits(512)));
// Unsigned 32-bit / 64-bit lane vectors; presumably the index ("look-up
// table") operands for table permutes of float / double data -- TODO confirm
// against the users of lutf / lutd.
54typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512)));
55typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512)));
57#pragma error("Oops. Illegal SVE vector size!?")
80 const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} };
84 const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} };
88 const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} };
92 const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} };
96 const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} };
100 const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} };
103 static inline pred
pg1(){
return svptrue_b64();}
104 static inline pred
pg_even(){
return svzip1_b64(svptrue_b64(), svpfalse_b());}
105 static inline pred
pg_odd() {
return svzip1_b64(svpfalse_b(), svptrue_b64());}
113 const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} };
117 const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} };
121 const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} };
125 const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} };
129 const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } };
133 const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } };
137 const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} };
140 static inline pred
pg1(){
return svptrue_b32();}
141 static inline pred
pg_even(){
return svzip1_b32(svptrue_b32(), svpfalse_b());}
142 static inline pred
pg_odd() {
return svzip1_b32(svpfalse_b(), svptrue_b32());}
147struct acle<uint16_t>{
148 static inline pred
pg1(){
return svptrue_b16();}
149 static inline pred
pg_even(){
return svzip1_b16(svptrue_b16(), svpfalse_b());}
150 static inline pred
pg_odd() {
return svzip1_b16(svpfalse_b(), svptrue_b16());}
157 static inline pred
pg1(){
return svptrue_b32();}
158 static inline pred
pg_even(){
return svzip1_b32(svptrue_b32(), svpfalse_b());}
159 static inline pred
pg_odd() {
return svzip1_b32(svpfalse_b(), svptrue_b32());}
167 vecf a_v = svdup_f32(a);
168 vecf b_v = svdup_f32(b);
169 return svzip1(a_v, b_v);
177 vecd a_v = svdup_f64(a);
178 vecd b_v = svdup_f64(b);
179 return svzip1(a_v, b_v);
228 return svld1(pg1, (
float*)a);
233 return svld1(pg1, (
double*)a);
238 return svld1(pg1, a);
243 return svld1(pg1, a);
248 return svld1(pg1, a);
260 return svadd_x(pg1, a, b);
265 return svadd_x(pg1, a, b);
270 return svadd_x(pg1, a, b);
278 return svsub_x(pg1, a, b);
283 return svsub_x(pg1, a, b);
288 return svsub_x(pg1, a, b);
297 return svmad_x(pg1, b, c, a);
302 return svmad_x(pg1, b, c, a);
307 return svmul_x(pg1, a, b);
312 return svmul_x(pg1, a, b);
317 return svmul_x(pg1, a, b);
327 return svcmla_x(pg1, z_v, a, b, 0);
334 return svcmla_x(pg1, z_v, a, b, 0);
343 return svcmla_x(pg1, c, a, b, 0);
349 return svcmla_x(pg1, c, a, b, 0);
360 vecf r_v = svcmla_x(pg1, z, a, b, 0);
361 return svcmla_x(pg1, r_v, a, b, 90);
368 vecd r_v = svcmla_x(pg1, z, a, b, 0);
369 return svcmla_x(pg1, r_v, a, b, 90);
379 vecf r_v = svcmla_x(pg1, c, a, b, 0);
380 return svcmla_x(pg1, r_v, a, b, 90);
386 vecd r_v = svcmla_x(pg1, c, a, b, 0);
387 return svcmla_x(pg1, r_v, a, b, 90);
395 return svdiv_x(pg1, a, b);
400 return svdiv_x(pg1, a, b);
409 return svneg_m(a, pg_odd, a);
415 return svneg_m(a, pg_odd, a);
426 vecf a_v = svtbl(a, tbl_swap);
428 return svneg_m(a_v, pg_odd, a_v);
436 vecd a_v = svtbl(a, tbl_swap);
438 return svneg_m(a_v, pg_odd, a_v);
449 vecf a_v = svtbl(a, tbl_swap);
451 return svneg_m(a_v, pg_even, a_v);
459 vecd a_v = svtbl(a, tbl_swap);
461 return svneg_m(a_v, pg_even, a_v);
468 vech ha_v = svcvt_f16_x(pg1s, sa);
469 vech hb_v = svcvt_f16_x(pg1s, sb);
470 return svuzp1(ha_v, hb_v);
474 vech ha_v = svzip1(h, h);
475 vech hb_v = svzip2(h, h);
476 sa = svcvt_f32_x(pg1s, ha_v);
477 sb = svcvt_f32_x(pg1s, hb_v);
481 vecf sa_v = svcvt_f32_x(pg1d, a);
482 vecf sb_v = svcvt_f32_x(pg1d, b);
483 return svuzp1(sa_v, sb_v);
487 vecf sa_v = svzip1(s, s);
488 vecf sb_v = svzip2(s, s);
489 a = svcvt_f64_x(pg1d, sa_v);
490 b = svcvt_f64_x(pg1d, sb_v);
495 vech ha_v = svcvt_f16_x(pg1d, a);
496 vech hb_v = svcvt_f16_x(pg1d, b);
497 vech hc_v = svcvt_f16_x(pg1d, c);
498 vech hd_v = svcvt_f16_x(pg1d, d);
499 vech hab_v = svuzp1(ha_v, hb_v);
500 vech hcd_v = svuzp1(hc_v, hd_v);
501 return svuzp1(hab_v, hcd_v);
513 vech sa_v = svzip1(h, h);
514 vech sb_v = svzip2(h, h);
515 vech da_v = svzip1(sa_v, sa_v);
516 vech db_v = svzip2(sa_v, sa_v);
517 vech dc_v = svzip1(sb_v, sb_v);
518 vech dd_v = svzip2(sb_v, sb_v);
519 a = svcvt_f64_x(pg1d, da_v);
520 b = svcvt_f64_x(pg1d, db_v);
521 c = svcvt_f64_x(pg1d, dc_v);
522 d = svcvt_f64_x(pg1d, dd_v);
536 vecf r1_v = svext(in1, in1, (uint64_t)8u);
537 vecf r2_v = svext(in2, in2, (uint64_t)8u);
538 out1 = svext(r1_v, in2, (uint64_t)8u);
539 out2 = svext(in1, r2_v, (uint64_t)8u);
548 vecf a1_v = svtbl(in1, tbl_exch1a);
549 vecf a2_v = svtbl(in2, tbl_exch1b);
550 vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u);
551 vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u);
552 out1 = svtbl(b1_v, tbl_exch1c);
553 out2 = svtbl(b2_v, tbl_exch1a);
560 out1 = svtrn1(in1, in2);
561 out2 = svtrn2(in1, in2);
566 vecd r1_v = svext(in1, in1, (uint64_t)4u);
567 vecd r2_v = svext(in2, in2, (uint64_t)4u);
568 out1 = svext(r1_v, in2, (uint64_t)4u);
569 out2 = svext(in1, r2_v, (uint64_t)4u);
578 vecd a1_v = svtbl(in1, tbl_exch1a);
579 vecd a2_v = svtbl(in2, tbl_exch1b);
580 vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u);
581 vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u);
582 out1 = svtbl(b1_v, tbl_exch1c);
583 out2 = svtbl(b2_v, tbl_exch1a);
586 out1 = svtrn1(in1, in2);
587 out2 = svtrn2(in1, in2);
600 return svext(in, in, (uint64_t)8u);
604 return svtbl(in, tbl_swap);
608 return svtbl(in, tbl_swap);
612 return svtbl(in, tbl_swap);
617 return svext(in, in, (uint64_t)(8u / 2u));
621 return svtbl(in, tbl_swap);
625 return svtbl(in, tbl_swap);
671 return svext(in, in, (uint64_t)n);
674 return svext(in, in, (uint64_t)n);
686template <
typename Out_type,
typename In_type>
691 printf(
"Error, using wrong Reduce function\n");
701 float a =
svred(pg_even, in);
702 float b =
svred(pg_odd, in);
703 return Grid::ComplexF(a, b);
709 return svred(pg1, in);
716 double a =
svred(pg_even, in);
717 double b =
svred(pg_odd, in);
718 return Grid::ComplexD(a, b);
724 return svred(pg1, in);
730 return svred(pg1, in);
754template <
typename S,
typename T>
using ReduceSIMD = Optimization::Reduce<S,T>;
757typedef Optimization::Sum
SumSIMD;
758typedef Optimization::Sub
SubSIMD;
759typedef Optimization::Div
DivSIMD;
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
void prefetch_HINT_T0(const char *ptr)
void v_prefetch0(int size, const char *ptr)
#define NAMESPACE_BEGIN(A)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
static void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2)
static void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2)
static void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2)
vecf operator()(vecf a, vecf b, vecf c)
vecd operator()(vecd a, vecd b, vecd c)
vecf operator()(vecf a, vecf b, vecf c)
vecd operator()(vecd a, vecd b, vecd c)
vecd operator()(vecd a, vecd b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b, vecd c)
veci operator()(veci a, veci b)
vecf operator()(vecf a, vecf b, vecf c)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
static vecd Permute3(vecd in)
static vecd Permute1(vecd in)
static vecf Permute3(vecf in)
static vecd Permute2(vecd in)
static vecf Permute0(vecf in)
static vecf Permute1(vecf in)
static vecd Permute0(vecd in)
static vecf Permute2(vecf in)
static vech StoH(vecf sa, vecf sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static void HtoD(vech h, vecd &a, vecd &b, vecd &c, vecd &d)
static vech DtoH(vecd a, vecd b, vecd c, vecd d)
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static vecf tRotate(vecf in)
static vecd tRotate(vecd in)
static vecd rotate(vecd in, int n)
static vecf rotate(vecf in, int n)
vecf operator()(vecf a, vecf b)
veci operator()(veci a, veci b)
vecd operator()(vecd a, vecd b)
veci operator()(veci a, veci b)
vecf operator()(vecf a, vecf b)
vecd operator()(vecd a, vecd b)
vecd operator()(double *a)
veci operator()(Integer *a)
vecf operator()(Grid::ComplexF *a)
vecd operator()(Grid::ComplexD *a)
vecf operator()(float *a)
vecf operator()(float a, float b)
vecd operator()(double a)
vecd operator()(double a, double b)
veci operator()(Integer a)
void operator()(veci a, Integer *D)
void operator()(vecd a, double *D)
void operator()(vecf a, float *D)
void operator()(double *a, vecd b)
void operator()(float *a, vecf b)