62 return _mm_set_ps(b,a,b,a);
66 return _mm_set_ps(a,a,a,a);
70 return _mm_set_pd(b,a);
74 return _mm_set_pd(a,a);
78 return _mm_set1_epi32(a);
93 _mm_store_si128((__m128i *)I,a);
118 return _mm_set_pd(a[0].
imag(),a[0].
real());
122 return _mm_set_ps(a[3],a[2],a[1],a[0]);
126 return _mm_set_pd(a[1],a[0]);
130 return _mm_set_epi32(a[3],a[2],a[1],a[0]);
136template <
typename Out_type,
typename In_type>
141 printf(
"Error, using wrong Reduce function\n");
153 return _mm_add_ps(a,b);
157 return _mm_add_pd(a,b);
161 return _mm_add_epi32(a,b);
168 return _mm_sub_ps(a,b);
172 return _mm_sub_pd(a,b);
176 return _mm_sub_epi32(a,b);
184 return _mm_mul_ps(ymm0,b);
188 ymm0 = _mm_shuffle_pd(a,a,0x0);
189 return _mm_mul_pd(ymm0,b);
195 return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
198 __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
199 return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
206 __m128 ymm0,ymm1,ymm2;
208 ymm0 = _mm_mul_ps(ymm0,b);
211 ymm1 = _mm_mul_ps(ymm1,ymm2);
212 return _mm_addsub_ps(ymm0,ymm1);
216 __m128d ymm0,ymm1,ymm2;
217 ymm0 = _mm_shuffle_pd(a,a,0x0);
218 ymm0 = _mm_mul_pd(ymm0,b);
219 ymm1 = _mm_shuffle_pd(b,b,0x1);
220 ymm2 = _mm_shuffle_pd(a,a,0x3);
221 ymm1 = _mm_mul_pd(ymm1,ymm2);
222 return _mm_addsub_pd(ymm0,ymm1);
228 inline void mac(__m128 &a, __m128 b, __m128 c){
229 a= _mm_add_ps(_mm_mul_ps(b,c),a);
232 inline void mac(__m128d &a, __m128d b, __m128d c){
233 a= _mm_add_pd(_mm_mul_pd(b,c),a);
238 return _mm_mul_ps(a,b);
242 return _mm_mul_pd(a,b);
246 return _mm_mullo_epi32(a,b);
253 return _mm_div_ps(a,b);
257 return _mm_div_pd(a,b);
265 return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
269 return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));
277 __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in);
282 __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in);
283 return _mm_shuffle_pd(tmp,tmp,0x1);
291 return _mm_addsub_ps(_mm_setzero_ps(),tmp);
295 __m128d tmp = _mm_shuffle_pd(in,in,0x1);
296 return _mm_addsub_pd(_mm_setzero_pd(),tmp);
316 return _mm_shuffle_pd(in,in,0x1);
// Element-granular wrappers around _mm_alignr_epi8: n counts 32-bit (epi32)
// or 64-bit (epi64) elements and is converted to a byte offset taken modulo 16.
// Every macro parameter is parenthesised so that an expression argument such
// as `k+1` is scaled as a whole rather than being split by operator precedence
// (unparenthesised, `k+1` would expand to `k+1*4`).
#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8((a),(b),(((n)*4)%16))
#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8((a),(b),(((n)*8)%16))
335 __m128i ret=(__m128i)_mm_setzero_ps();
336 float *fp = (
float *)&f;
345 __m128 ret=_mm_setzero_ps();
346 float *fp = (
float *)&ret;
// Grid-prefixed names for the single<->half precision packed conversion
// intrinsics; here they forward directly to the compiler-provided
// _mm_cvtps_ph / _mm_cvtph_ps with no wrapper logic.
// NOTE(review): presumably other builds substitute a software fallback under
// the same Grid_ names -- confirm against the surrounding #if logic.
355#define Grid_mm_cvtps_ph _mm_cvtps_ph
356#define Grid_mm_cvtph_ps _mm_cvtph_ps
359 static inline __m128i
StoH (__m128 a,__m128 b) {
365 static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
370 static inline __m128
DtoS (__m128d a,__m128d b) {
371 __m128 sa = _mm_cvtpd_ps(a);
372 __m128 sb = _mm_cvtpd_ps(b);
376 static inline void StoD (__m128 s,__m128d &a,__m128d &b) {
381 static inline __m128i
DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
387 static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) {
397 static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
401 static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
407 static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
411 static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
416 static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
417 out1= _mm_shuffle_pd(in1,in2,0x0);
418 out2= _mm_shuffle_pd(in1,in2,0x3);
420 static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
424 static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
428 static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
436 static inline __m128
rotate(__m128 in,
int n){
445 static inline __m128d
rotate(__m128d in,
int n){
465 v1= Optimization::Permute::Permute0(in);
466 v1= _mm_add_ps(v1,in);
468 return Grid::ComplexF(conv.
f[0],conv.
f[1]);
474 v1= Optimization::Permute::Permute0(in);
475 v1= _mm_add_ps(v1,in);
476 v2= Optimization::Permute::Permute1(v1);
477 v1 = _mm_add_ps(v1,v2);
486 return Grid::ComplexD(conv.
f[0],conv.
f[1]);
493 v1 = Optimization::Permute::Permute0(in);
494 v1 = _mm_add_pd(v1,in);
502 __m128i v1 = _mm_hadd_epi32(in, in);
503 __m128i v2 = _mm_hadd_epi32(v1, v1);
504 return _mm_cvtsi128_si32(v2);
518 _mm_prefetch(ptr,_MM_HINT_T0);
526template <
typename S,
typename T>
using ReduceSIMD = Optimization::Reduce<S,T>;
529typedef Optimization::Sum
SumSIMD;
530typedef Optimization::Sub
SubSIMD;
531typedef Optimization::Div
DivSIMD;
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define _my_alignr_epi64(a, b, n)
void prefetch_HINT_T0(const char *ptr)
void v_prefetch0(int size, const char *ptr)
#define _my_alignr_epi32(a, b, n)
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define NAMESPACE_BEGIN(A)
#define _MM_SELECT_FOUR_FOUR(A, B, C, D)
static INTERNAL_PRECISION F
__m128d operator()(__m128d in)
__m128 operator()(__m128 in)
__m128 operator()(__m128 a, __m128 b)
__m128d operator()(__m128d a, __m128d b)
static void Exchange0(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
static void Exchange0(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
static void Exchange2(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
static void Exchange2(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
static void Exchange1(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
static void Exchange1(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
static void Exchange3(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
static void Exchange3(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
__m128d operator()(__m128d a, __m128d b, __m128d c)
__m128 operator()(__m128 a, __m128 b, __m128 c)
__m128 operator()(__m128 a, __m128 b)
__m128d operator()(__m128d a, __m128d b)
__m128 operator()(__m128 a, __m128 b)
__m128d operator()(__m128d a, __m128d b)
__m128d operator()(__m128d a, __m128d b)
void mac(__m128d &a, __m128d b, __m128d c)
__m128i operator()(__m128i a, __m128i b)
__m128 operator()(__m128 a, __m128 b)
void mac(__m128 &a, __m128 b, __m128 c)
static __m128d Permute1(__m128d in)
static __m128 Permute0(__m128 in)
static __m128d Permute2(__m128d in)
static __m128d Permute0(__m128d in)
static __m128 Permute2(__m128 in)
static __m128 Permute3(__m128 in)
static __m128 Permute1(__m128 in)
static __m128d Permute3(__m128d in)
static __m128 DtoS(__m128d a, __m128d b)
static vech StoH(const vecf &sa, const vecf &sb)
static void StoD(__m128 s, __m128d &a, __m128d &b)
static __m128i StoH(__m128 a, __m128 b)
static void HtoD(__m128i h, __m128d &a, __m128d &b, __m128d &c, __m128d &d)
static void HtoS(__m128i h, __m128 &sa, __m128 &sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static __m128i DtoH(__m128d a, __m128d b, __m128d c, __m128d d)
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static __m128d rotate(__m128d in, int n)
static __m128 rotate(__m128 in, int n)
static __m128 tRotate(__m128 in)
static __m128d tRotate(__m128d in)
__m128i operator()(__m128i a, __m128i b)
__m128d operator()(__m128d a, __m128d b)
__m128 operator()(__m128 a, __m128 b)
__m128 operator()(__m128 a, __m128 b)
__m128d operator()(__m128d a, __m128d b)
__m128i operator()(__m128i a, __m128i b)
__m128d operator()(__m128d in)
__m128 operator()(__m128 in)
__m128d operator()(__m128d in)
__m128 operator()(__m128 in)
__m128i operator()(Integer *a)
__m128 operator()(float *a)
__m128d operator()(double *a)
__m128 operator()(Grid::ComplexF *a)
__m128d operator()(Grid::ComplexD *a)
__m128 operator()(float a)
__m128d operator()(double a)
__m128i operator()(Integer a)
__m128 operator()(float a, float b)
__m128d operator()(double a, double b)
void operator()(__m128i a, Integer *I)
void operator()(__m128 a, float *F)
void operator()(__m128d a, double *D)
void operator()(double *a, __m128d b)
void operator()(float *a, __m128 b)