// Broadcast fragments (enclosing operator() headers are not visible here;
// presumably the Vsplat functor overloads - confirm).
// _mm512_set_ps takes its arguments from the highest lane down to lane 0, so
// the last argument (a) lands in lane 0: even lanes hold a, odd lanes hold b.
39 return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
// Replicate a into all 16 single-precision lanes.
43 return _mm512_set1_ps(a);
// Double-precision pair splat: even lanes hold a, odd lanes hold b (8 lanes).
47 return _mm512_set_pd(b,a,b,a,b,a,b,a);
// Replicate a into all 8 double-precision lanes.
51 return _mm512_set1_pd(a);
// Replicate a into all 16 32-bit integer lanes.
55 return _mm512_set1_epi32(a);
// Store the 512-bit integer vector a to the address I.
// _mm512_store_si512 is the aligned store form: I must be 64-byte aligned.
70 _mm512_store_si512((__m512i *)I,a);
// Streaming stores: write vector b to address a with the KNC
// "no-read, non-globally-ordered" store hint (storenrngo).
// NOTE(review): these stores are weakly ordered; any fence required before
// other threads read the data is not visible in this fragment.
79 _mm512_storenrngo_ps(a,b);
// Double-precision counterpart of the streaming store above.
83 _mm512_storenrngo_pd(a,b);
// Build a vector from an array (presumably the Vset functor overloads - the
// headers are not visible). The set intrinsics list the highest lane first,
// so the indices are written in descending order to place a[i] in lane i.
106 return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
107 a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Eight doubles: a[i] goes to lane i.
111 return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Sixteen 32-bit integers: a[i] goes to lane i.
115 return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
116 a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Template parameter list with an output type and an input type.
// Presumably this introduces the generic (unspecialised) Reduce functor -
// the struct header itself is not visible in this fragment.
122 template <
typename Out_type,
typename In_type>
// Prints a diagnostic indicating that the wrong Reduce function was used.
// NOTE(review): whatever follows this message (abort/exit/return) is not visible here.
127 printf(
"Error, using wrong Reduce function\n");
// Lane-wise addition a + b (presumably the Sum functor overloads).
// 16 x float:
142 return _mm512_add_ps(a,b);
// 8 x double:
146 return _mm512_add_pd(a,b);
// 16 x 32-bit integer:
150 return _mm512_add_epi32(a,b);
// Lane-wise subtraction a - b (presumably the Sub functor overloads).
// 16 x float:
157 return _mm512_sub_ps(a,b);
// 8 x double:
161 return _mm512_sub_pd(a,b);
// 16 x 32-bit integer:
165 return _mm512_sub_epi32(a,b);
// Single-precision multiply of lane pairs. Writing each adjacent pair of a as
// (a0, a1) and of b as (b0, b1), the statements below produce
//   (a0*b0 - a1*b1, a0*b1 + a1*b0)
// i.e. a complex product if lanes are interleaved (re, im) - assumed layout,
// confirm against the callers. Local declarations are not visible here.
174 vzero = _mm512_setzero_ps();
// ymm0 = a with adjacent lanes swapped: (a1, a0).
175 ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB);
// Odd lanes take 0 | ymm0 (= a0); even lanes keep a (= a0)  ->  real = (a0, a0).
176 real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)
vzero,(__m512i)ymm0);
// Even lanes take 0 - ymm0 (= -a1); odd lanes keep a (= a1)  ->  imag = (-a1, a1).
177 imag = _mm512_mask_sub_ps(a, 0x5555,
vzero, ymm0);
// ymm1 = (a0*b0, a0*b1).
178 ymm1 = _mm512_mul_ps(
real, b);
// ymm0 = b with adjacent lanes swapped: (b1, b0).
179 ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB);
// ymm0*imag + ymm1 = (-a1*b1 + a0*b0, a1*b0 + a0*b1).
180 return _mm512_fmadd_ps(ymm0,
imag,ymm1);
// Double-precision version of the pairwise multiply: for adjacent lane pairs
// (a0, a1) and (b0, b1) the result is (a0*b0 - a1*b1, a0*b1 + a1*b0).
// The masks are 8 bits wide because there are 8 double lanes.
// Assumes interleaved (re, im) lanes - confirm against the callers.
205 vzero =_mm512_setzero_pd();
// ymm0 = a with adjacent lanes swapped: (a1, a0).
206 ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB);
// Odd lanes take 0 | ymm0 (= a0); even lanes keep a  ->  real = (a0, a0).
207 real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)
vzero,(__m512i) ymm0);
// Even lanes take 0 - ymm0 (= -a1); odd lanes keep a  ->  imag = (-a1, a1).
208 imag = _mm512_mask_sub_pd(a, 0x55,
vzero, ymm0);
// ymm1 = (a0*b0, a0*b1).
209 ymm1 = _mm512_mul_pd(
real, b);
// ymm0 = b with adjacent lanes swapped: (b1, b0).
210 ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB);
// ymm0*imag + ymm1 = (-a1*b1 + a0*b0, a1*b0 + a0*b1).
211 return _mm512_fmadd_pd(ymm0,
imag,ymm1);
// Multiply-accumulate on 16 floats: a <- b*c + a, lane by lane.
// a is taken by reference and updated in place; the product and sum are
// computed by a single fused multiply-add intrinsic.
217 inline void mac(__m512 &a, __m512 b, __m512 c){
218 a= _mm512_fmadd_ps( b, c, a);
// Multiply-accumulate on 8 doubles: a <- b*c + a, lane by lane.
// a is taken by reference and updated in place via a fused multiply-add.
221 inline void mac(__m512d &a, __m512d b, __m512d c){
222 a= _mm512_fmadd_pd( b, c, a);
// Lane-wise (real) multiplication a * b (presumably the Mult functor overloads).
// 16 x float:
227 return _mm512_mul_ps(a,b);
// 8 x double:
231 return _mm512_mul_pd(a,b);
// 16 x 32-bit integer; mullo keeps the low 32 bits of each product.
235 return _mm512_mullo_epi32(a,b);
// Lane-wise division a / b (presumably the Div functor overloads).
// 16 x float:
242 return _mm512_div_ps(a,b);
// 8 x double:
246 return _mm512_div_pd(a,b);
// Negate the odd-indexed lanes and pass the even-indexed lanes through:
// mask 0xaaaa selects lanes 1,3,5,... which become 0 - in; the rest keep in.
// With interleaved (re, im) lanes this is complex conjugation (assumed layout).
254 return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in);
// Double-precision version; 8 lanes, so the odd-lane mask is 0xaa.
258 return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
// Negate the odd lanes, then swap adjacent lanes.
// For each pair (x0, x1): tmp = (x0, -x1), result = (-x1, x0).
// NOTE(review): with interleaved (re, im) lanes, (-im, re) is multiplication
// by +i. The enclosing functor's name is not visible here - confirm that the
// sign matches it (TimesI vs TimesMinusI).
266 __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in);
267 return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);
// Double-precision version of the same negate-then-swap sequence.
271 __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in);
272 return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);
// Swap adjacent lanes, then negate the odd lanes.
// For each pair (x0, x1): tmp = (x1, x0), result = (x1, -x0).
// NOTE(review): with interleaved (re, im) lanes, (im, -re) is multiplication
// by -i. The enclosing functor's name is not visible here - confirm that the
// sign matches it (TimesI vs TimesMinusI).
281 __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);
282 return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
// Double-precision version of the same swap-then-negate sequence.
286 __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);
287 return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
// Lane permutation fragments (which PermuteN each line belongs to is not
// visible here).
// BADC swizzle: within every group of four floats, exchange the lower two
// elements with the upper two.
303 return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
// CDAB swizzle: exchange adjacent floats (0<->1, 2<->3, ...).
306 return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
// Double vector permuted through the float 128-bit-block permute.
// Assuming _MM_SELECT_FOUR_FOUR packs its arguments like _MM_SHUFFLE,
// (1,0,3,2) exchanges the two 256-bit halves - TODO confirm against the macro.
310 return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)
_MM_SELECT_FOUR_FOUR(1,0,3,2));
// BADC swizzle on doubles: within every group of four doubles, exchange the
// lower two elements with the upper two.
313 return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
// CDAB swizzle on doubles: exchange adjacent doubles.
316 return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
326 static inline __m512
rotate(__m512 in,
int n){
348 static inline __m512d
rotate(__m512d in,
int n){
// Compile-time rotation of a 16-float vector by n lanes.
// alignr concatenates its two operands and shifts right by n 32-bit elements;
// passing the same vector twice makes that a rotation: result lane i holds
// input lane (i + n) mod 16. n is a template parameter, as the intrinsic
// needs a compile-time constant count.
362 template<
int n>
static inline __m512
tRotate(__m512 in){
363 return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
// Compile-time rotation of an 8-double vector by n lanes.
// The shift is expressed in 32-bit elements, so one double is two elements
// and the count passed to alignr is 2*n; result lane i holds input lane
// (i + n) mod 8.
366 template<
int n>
static inline __m512d
tRotate(__m512d in){
367 return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
// Horizontal sums (presumably the Reduce specialisations; headers not visible).
// Complex single: first component is the sum of the even lanes (mask 0x5555),
// second component is the sum of the odd lanes (mask 0xAAAA).
380 return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
// Sum of all 16 float lanes.
385 return _mm512_reduce_add_ps(in);
// Complex double: sum of even lanes (0x55) and sum of odd lanes (0xAA).
392 return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
// Sum of all 8 double lanes.
398 return _mm512_reduce_add_pd(in);
// Sum of all 16 32-bit integer lanes.
404 return _mm512_reduce_add_epi32(in);
// Issue software prefetches ahead of a buffer of `size` bytes starting at ptr.
// The loop advances in 64-byte steps; at each step it requests
//   - the address 4096 bytes ahead with the T1 hint, and
//   - the address 512 bytes ahead with the T0 hint.
// Note that the prefetched addresses lie beyond ptr+i, so the first 512 bytes
// of the buffer are not themselves prefetched by this loop.
418 inline void v_prefetch0(
int size,
const char *ptr){
419 for(
int i=0;i<size;i+=64){
420 _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
421 _mm_prefetch(ptr+i+512,_MM_HINT_T0);
// Single prefetch of the address ptr with the T0 hint
// (presumably the body of prefetch_HINT_T0 - header not visible here).
425 _mm_prefetch(ptr,_MM_HINT_T0);
accelerator_inline void vzero(Grid_simd2< S, V > &ret)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define _MM_SELECT_FOUR_FOUR(A, B, C, D)
static INTERNAL_PRECISION F
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Div DivSIMD
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
Optimization::TimesI TimesISIMD
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
Optimization::Vset VsetSIMD
Optimization::Vstore VstoreSIMD
Optimization::Sub SubSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::Vstream VstreamSIMD
__m512d operator()(__m512d in)
__m512 operator()(__m512 in)
__m512d operator()(__m512d a, __m512d b)
__m512 operator()(__m512 a, __m512 b)
__m512 operator()(__m512 a, __m512 b)
__m512d operator()(__m512d a, __m512d b)
__m512i operator()(__m512i a, __m512i b)
__m512d operator()(__m512d a, __m512d b)
void mac(__m512d &a, __m512d b, __m512d c)
__m512 operator()(__m512 a, __m512 b)
void mac(__m512 &a, __m512 b, __m512 c)
static __m512 Permute3(__m512 in)
static __m512d Permute1(__m512d in)
static __m512d Permute3(__m512d in)
static __m512d Permute2(__m512d in)
static __m512 Permute1(__m512 in)
static __m512d Permute0(__m512d in)
static __m512 Permute0(__m512 in)
static __m512 Permute2(__m512 in)
Out_type operator()(In_type in)
accelerator_inline Out_type operator()(In_type in)
static __m512 rotate(__m512 in, int n)
static __m512 tRotate(__m512 in)
static __m512d rotate(__m512d in, int n)
static accelerator_inline vec tRotate(vec in)
static __m512d tRotate(__m512d in)
__m512 operator()(__m512 a, __m512 b)
__m512i operator()(__m512i a, __m512i b)
__m512d operator()(__m512d a, __m512d b)
__m512d operator()(__m512d a, __m512d b)
__m512i operator()(__m512i a, __m512i b)
__m512 operator()(__m512 a, __m512 b)
__m512d operator()(__m512d in, __m512d ret)
__m512 operator()(__m512 in, __m512 ret)
__m512 operator()(__m512 in, __m512 ret)
__m512d operator()(__m512d in, __m512d ret)
__m512 operator()(Grid::ComplexF *a)
__m512i operator()(Integer *a)
__m512d operator()(double *a)
__m512 operator()(float *a)
__m512d operator()(Grid::ComplexD *a)
__m512 operator()(float a, float b)
__m512 operator()(float a)
__m512d operator()(double a)
__m512d operator()(double a, double b)
__m512i operator()(Integer a)
void operator()(__m512d a, double *D)
void operator()(__m512 a, float *F)
void operator()(__m512i a, Integer *I)
void operator()(double *a, __m512d b)
void operator()(float *a, __m512 b)