Grid 0.7.0
Grid_sse4.h
Go to the documentation of this file.
/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/simd/Grid_sse4.h

 Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

 See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
// Time-stamp: <2015-06-16 23:27:54 neo>
//----------------------------------------------------------------------
#include <cassert>
#include <cstdio>
#include <cstdlib>

#include <immintrin.h>
#include <pmmintrin.h>
40
42NAMESPACE_BEGIN(Optimization);
43
// Overlay of a 128-bit single-precision register with a caller-chosen type:
// both members share the same storage.
template<class vtype>
union uconv {
  __m128 f;   // raw SSE register view
  vtype  v;   // the same storage viewed as vtype
};
49
// Lane access for a float register: f[0] is the lowest lane of v.
union u128f {
  __m128 v;
  float  f[4];
};
// Lane access for a double register: f[0] is the low lane of v, f[1] the high lane.
union u128d {
  __m128d v;
  double  f[2];
};
58
59struct Vsplat{
60 //Complex float
61 inline __m128 operator()(float a, float b){
62 return _mm_set_ps(b,a,b,a);
63 }
64 // Real float
65 inline __m128 operator()(float a){
66 return _mm_set_ps(a,a,a,a);
67 }
68 //Complex double
69 inline __m128d operator()(double a, double b){
70 return _mm_set_pd(b,a);
71 }
72 //Real double
73 inline __m128d operator()(double a){
74 return _mm_set_pd(a,a);
75 }
76 //Integer
77 inline __m128i operator()(Integer a){
78 return _mm_set1_epi32(a);
79 }
80};
81
82struct Vstore{
83 //Float
84 inline void operator()(__m128 a, float* F){
85 _mm_store_ps(F,a);
86 }
87 //Double
88 inline void operator()(__m128d a, double* D){
89 _mm_store_pd(D,a);
90 }
91 //Integer
92 inline void operator()(__m128i a, Integer* I){
93 _mm_store_si128((__m128i *)I,a);
94 }
95
96};
97
// Non-temporal (streaming) store of a register to memory.
// Note the argument order is (destination, value), the reverse of Vstore.
struct Vstream{
  // Float
  inline void operator()(float * a, __m128 b){
    _mm_stream_ps(a,b);
  }
  // Double
  inline void operator()(double * a, __m128d b){
    _mm_stream_pd(a,b);
  }
};
110
111struct Vset{
112 // Complex float
113 inline __m128 operator()(Grid::ComplexF *a){
114 return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
115 }
116 // Complex double
117 inline __m128d operator()(Grid::ComplexD *a){
118 return _mm_set_pd(a[0].imag(),a[0].real());
119 }
120 // Real float
121 inline __m128 operator()(float *a){
122 return _mm_set_ps(a[3],a[2],a[1],a[0]);
123 }
124 // Real double
125 inline __m128d operator()(double *a){
126 return _mm_set_pd(a[1],a[0]);
127 }
128 // Integer
129 inline __m128i operator()(Integer *a){
130 return _mm_set_epi32(a[3],a[2],a[1],a[0]);
131 }
132
133
134};
135
// Horizontal reduction of a register to a scalar.
// The output type has to be part of the class template so that it can be
// selected by the caller; the useful cases are the explicit specializations.
// This primary template is the fallback for an unsupported (Out_type, In_type)
// pair: it reports the error and terminates the program.
template <typename Out_type, typename In_type>
struct Reduce{
  inline Out_type operator()(In_type in){
    printf("Error, using wrong Reduce function\n");
    exit(1);
    return 0; // not reached; keeps the function well-formed
  }
};
146
// Arithmetic operations

// Lane-wise addition a + b.
struct Sum{
  // Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_add_ps(a,b);
  }
  // Complex/Real double
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_add_pd(a,b);
  }
  // Integer (32-bit lanes)
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_add_epi32(a,b);
  }
};
164
// Lane-wise subtraction a - b.
struct Sub{
  // Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_sub_ps(a,b);
  }
  // Complex/Real double
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_sub_pd(a,b);
  }
  // Integer (32-bit lanes)
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_sub_epi32(a,b);
  }
};
179
180struct MultRealPart{
181 inline __m128 operator()(__m128 a, __m128 b){
182 __m128 ymm0;
183 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
184 return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
185 }
186 inline __m128d operator()(__m128d a, __m128d b){
187 __m128d ymm0;
188 ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
189 return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
190 }
191};
192struct MaddRealPart{
193 inline __m128 operator()(__m128 a, __m128 b, __m128 c){
194 __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
195 return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
196 }
197 inline __m128d operator()(__m128d a, __m128d b, __m128d c){
198 __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
199 return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
200 }
201};
202
203struct MultComplex{
204 // Complex float
205 inline __m128 operator()(__m128 a, __m128 b){
206 __m128 ymm0,ymm1,ymm2;
207 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
208 ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
209 ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
210 ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
211 ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
212 return _mm_addsub_ps(ymm0,ymm1);
213 }
214 // Complex double
215 inline __m128d operator()(__m128d a, __m128d b){
216 __m128d ymm0,ymm1,ymm2;
217 ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
218 ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
219 ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
220 ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
221 ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
222 return _mm_addsub_pd(ymm0,ymm1);
223 }
224};
225
// Lane-wise (real) multiplication, plus a multiply-accumulate helper.
struct Mult{

  // a += b*c, single precision (separate multiply and add, not a fused op)
  inline void mac(__m128 &a, __m128 b, __m128 c){
    a= _mm_add_ps(_mm_mul_ps(b,c),a);
  }

  // a += b*c, double precision
  inline void mac(__m128d &a, __m128d b, __m128d c){
    a= _mm_add_pd(_mm_mul_pd(b,c),a);
  }

  // Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_mul_ps(a,b);
  }
  // Real double
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_mul_pd(a,b);
  }
  // Integer: keeps the low 32 bits of each 32x32 product
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_mullo_epi32(a,b);
  }
};
249
// Lane-wise (real) division a / b.
struct Div{
  // Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_div_ps(a,b);
  }
  // Real double
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_div_pd(a,b);
  }
};
260
261
262struct Conj{
263 // Complex single
264 inline __m128 operator()(__m128 in){
265 return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
266 }
267 // Complex double
268 inline __m128d operator()(__m128d in){
269 return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
270 }
271 // do not define for integer input
272};
273
274struct TimesMinusI{
275 //Complex single
276 inline __m128 operator()(__m128 in){
277 __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
278 return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
279 }
280 //Complex double
281 inline __m128d operator()(__m128d in){
282 __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
283 return _mm_shuffle_pd(tmp,tmp,0x1);
284 }
285};
286
287struct TimesI{
288 //Complex single
289 inline __m128 operator()(__m128 in){
290 __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
291 return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
292 }
293 //Complex double
294 inline __m128d operator()(__m128d in){
295 __m128d tmp = _mm_shuffle_pd(in,in,0x1);
296 return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
297 }
298};
299
300struct Permute{
301
302 static inline __m128 Permute0(__m128 in){
303 return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
304 };
305 static inline __m128 Permute1(__m128 in){
306 return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
307 };
308 static inline __m128 Permute2(__m128 in){
309 return in;
310 };
311 static inline __m128 Permute3(__m128 in){
312 return in;
313 };
314
315 static inline __m128d Permute0(__m128d in){ //AB -> BA
316 return _mm_shuffle_pd(in,in,0x1);
317 };
318 static inline __m128d Permute1(__m128d in){
319 return in;
320 };
321 static inline __m128d Permute2(__m128d in){
322 return in;
323 };
324 static inline __m128d Permute3(__m128d in){
325 return in;
326 };
327};
328
// _mm_alignr_epi8 with the shift given in 32-bit / 64-bit elements instead of
// bytes; the byte count is reduced modulo 16. Arguments are parenthesized so
// that an expression such as `k+1` for n scales as a whole.
#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8((a),(b),(((n)*4)%16))
#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8((a),(b),(((n)*8)%16))
331
// Half <-> single precision conversion of four values.
// With SFW_FP16 defined the conversion is done lane by lane through the
// project's software routines; otherwise the names map straight onto the
// hardware intrinsics. The second argument is accepted only so both variants
// share a call signature; the software versions ignore it.
#ifdef SFW_FP16

static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
  (void)discard;
  __m128i ret = _mm_setzero_si128();
  float *fp = (float *)&f;
  Grid_half *hp = (Grid_half *)&ret;
  // converts lanes 0..3 of f into the first four Grid_half slots of ret;
  // whatever ret holds beyond those four slots stays zero
  for (int lane = 0; lane < 4; lane++) {
    hp[lane] = sfw_float_to_half(fp[lane]);
  }
  return ret;
}
static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
  (void)discard;
  __m128 ret = _mm_setzero_ps();
  float *fp = (float *)&ret;
  Grid_half *hp = (Grid_half *)&h;
  // converts the first four Grid_half slots of h into lanes 0..3 of ret
  for (int lane = 0; lane < 4; lane++) {
    fp[lane] = sfw_half_to_float(hp[lane]);
  }
  return ret;
}
#else
#define Grid_mm_cvtps_ph _mm_cvtps_ph
#define Grid_mm_cvtph_ps _mm_cvtph_ps
#endif
358struct PrecisionChange {
359 static inline __m128i StoH (__m128 a,__m128 b) {
360 __m128i ha = Grid_mm_cvtps_ph(a,0);
361 __m128i hb = Grid_mm_cvtps_ph(b,0);
362 __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
363 return h;
364 }
365 static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
366 sa = Grid_mm_cvtph_ps(h,0);
367 h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
368 sb = Grid_mm_cvtph_ps(h,0);
369 }
370 static inline __m128 DtoS (__m128d a,__m128d b) {
371 __m128 sa = _mm_cvtpd_ps(a);
372 __m128 sb = _mm_cvtpd_ps(b);
373 __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
374 return s;
375 }
376 static inline void StoD (__m128 s,__m128d &a,__m128d &b) {
377 a = _mm_cvtps_pd(s);
378 s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2);
379 b = _mm_cvtps_pd(s);
380 }
381 static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
382 __m128 sa,sb;
383 sa = DtoS(a,b);
384 sb = DtoS(c,d);
385 return StoH(sa,sb);
386 }
387 static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) {
388 __m128 sa,sb;
389 HtoS(h,sa,sb);
390 StoD(sa,a,b);
391 StoD(sb,c,d);
392 }
393};
394
395struct Exchange{
396 // 3210 ordering
397 static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
398 out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
399 out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
400 };
401 static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
402 out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
403 out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
404 out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
405 out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
406 };
407 static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
408 assert(0);
409 return;
410 };
411 static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
412 assert(0);
413 return;
414 };
415
416 static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
417 out1= _mm_shuffle_pd(in1,in2,0x0);
418 out2= _mm_shuffle_pd(in1,in2,0x3);
419 };
420 static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
421 assert(0);
422 return;
423 };
424 static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
425 assert(0);
426 return;
427 };
428 static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
429 assert(0);
430 return;
431 };
432};
433
434struct Rotate{
435
436 static inline __m128 rotate(__m128 in,int n){
437 switch(n){
438 case 0: return tRotate<0>(in);break;
439 case 1: return tRotate<1>(in);break;
440 case 2: return tRotate<2>(in);break;
441 case 3: return tRotate<3>(in);break;
442 default: assert(0);
443 }
444 }
445 static inline __m128d rotate(__m128d in,int n){
446 switch(n){
447 case 0: return tRotate<0>(in);break;
448 case 1: return tRotate<1>(in);break;
449 default: assert(0);
450 }
451 }
452
453 template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
454 template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };
455
456};
458// Some Template specialization
459
460
461//Complex float Reduce
462template<>
463inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
464 __m128 v1; // two complex
465 v1= Optimization::Permute::Permute0(in);
466 v1= _mm_add_ps(v1,in);
467 u128f conv; conv.v=v1;
468 return Grid::ComplexF(conv.f[0],conv.f[1]);
469}
470//Real float Reduce
471template<>
472inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
473 __m128 v1,v2; // quad single
474 v1= Optimization::Permute::Permute0(in);
475 v1= _mm_add_ps(v1,in);
476 v2= Optimization::Permute::Permute1(v1);
477 v1 = _mm_add_ps(v1,v2);
478 u128f conv; conv.v=v1;
479 return conv.f[0];
480}
481
482//Complex double Reduce
483template<>
484inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
485 u128d conv; conv.v = in;
486 return Grid::ComplexD(conv.f[0],conv.f[1]);
487}
488
489//Real double Reduce
490template<>
491inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
492 __m128d v1;
493 v1 = Optimization::Permute::Permute0(in);
494 v1 = _mm_add_pd(v1,in);
495 u128d conv; conv.v = v1;
496 return conv.f[0];
497}
498
499//Integer Reduce
500template<>
502 __m128i v1 = _mm_hadd_epi32(in, in);
503 __m128i v2 = _mm_hadd_epi32(v1, v1);
504 return _mm_cvtsi128_si32(v2);
505}
506NAMESPACE_END(Optimization);
507
// Here assign types
typedef __m128i SIMD_Htype;  // Half precision type (halves carried in an integer register)
typedef __m128  SIMD_Ftype;  // Single precision type
typedef __m128d SIMD_Dtype;  // Double precision type
typedef __m128i SIMD_Itype;  // Integer type
514
// prefetch utilities

// Intentionally empty in this backend: both arguments are ignored.
inline void v_prefetch0(int size, const char *ptr){}
// Issue a prefetch with the T0 locality hint for the cache line containing ptr.
// A prefetch is only a hint; it does not modify memory.
inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr,_MM_HINT_T0);
}
520
521// Function name aliases
522typedef Optimization::Vsplat VsplatSIMD;
523typedef Optimization::Vstore VstoreSIMD;
524typedef Optimization::Vset VsetSIMD;
525typedef Optimization::Vstream VstreamSIMD;
526template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
527
528// Arithmetic operations
529typedef Optimization::Sum SumSIMD;
530typedef Optimization::Sub SubSIMD;
531typedef Optimization::Div DivSIMD;
532typedef Optimization::Mult MultSIMD;
533typedef Optimization::MultComplex MultComplexSIMD;
534typedef Optimization::MultRealPart MultRealPartSIMD;
535typedef Optimization::MaddRealPart MaddRealPartSIMD;
536typedef Optimization::Conj ConjSIMD;
537typedef Optimization::TimesMinusI TimesMinusISIMD;
538typedef Optimization::TimesI TimesISIMD;
539
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define _my_alignr_epi64(a, b, n)
Definition Grid_sse4.h:330
#define Grid_mm_cvtps_ph
Definition Grid_sse4.h:355
void prefetch_HINT_T0(const char *ptr)
Definition Grid_sse4.h:517
void v_prefetch0(int size, const char *ptr)
Definition Grid_sse4.h:516
#define Grid_mm_cvtph_ps
Definition Grid_sse4.h:356
#define _my_alignr_epi32(a, b, n)
Definition Grid_sse4.h:329
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
#define _MM_SELECT_FOUR_FOUR(A, B, C, D)
Definition Simd.h:48
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
__m128d operator()(__m128d in)
Definition Grid_sse4.h:268
__m128 operator()(__m128 in)
Definition Grid_sse4.h:264
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:252
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:256
static void Exchange0(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
Definition Grid_sse4.h:416
static void Exchange0(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
Definition Grid_sse4.h:397
static void Exchange2(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
Definition Grid_sse4.h:407
static void Exchange2(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
Definition Grid_sse4.h:424
static void Exchange1(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
Definition Grid_sse4.h:401
static void Exchange1(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
Definition Grid_sse4.h:420
static void Exchange3(__m128d &out1, __m128d &out2, __m128d in1, __m128d in2)
Definition Grid_sse4.h:428
static void Exchange3(__m128 &out1, __m128 &out2, __m128 in1, __m128 in2)
Definition Grid_sse4.h:411
__m128d operator()(__m128d a, __m128d b, __m128d c)
Definition Grid_sse4.h:197
__m128 operator()(__m128 a, __m128 b, __m128 c)
Definition Grid_sse4.h:193
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:205
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:215
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:181
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:186
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:241
void mac(__m128d &a, __m128d b, __m128d c)
Definition Grid_sse4.h:232
__m128i operator()(__m128i a, __m128i b)
Definition Grid_sse4.h:245
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:237
void mac(__m128 &a, __m128 b, __m128 c)
Definition Grid_sse4.h:228
static __m128d Permute1(__m128d in)
Definition Grid_sse4.h:318
static __m128 Permute0(__m128 in)
Definition Grid_sse4.h:302
static __m128d Permute2(__m128d in)
Definition Grid_sse4.h:321
static __m128d Permute0(__m128d in)
Definition Grid_sse4.h:315
static __m128 Permute2(__m128 in)
Definition Grid_sse4.h:308
static __m128 Permute3(__m128 in)
Definition Grid_sse4.h:311
static __m128 Permute1(__m128 in)
Definition Grid_sse4.h:305
static __m128d Permute3(__m128d in)
Definition Grid_sse4.h:324
static __m128 DtoS(__m128d a, __m128d b)
Definition Grid_sse4.h:370
static vech StoH(const vecf &sa, const vecf &sb)
static void StoD(__m128 s, __m128d &a, __m128d &b)
Definition Grid_sse4.h:376
static __m128i StoH(__m128 a, __m128 b)
Definition Grid_sse4.h:359
static void HtoD(__m128i h, __m128d &a, __m128d &b, __m128d &c, __m128d &d)
Definition Grid_sse4.h:387
static void HtoS(__m128i h, __m128 &sa, __m128 &sb)
Definition Grid_sse4.h:365
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static __m128i DtoH(__m128d a, __m128d b, __m128d c, __m128d d)
Definition Grid_sse4.h:381
static void HtoS(vech h, vecf &sa, vecf &sb)
Out_type operator()(In_type in)
Definition Grid_sse4.h:140
static vec< T > tRotate(vec< T > in)
static __m128d rotate(__m128d in, int n)
Definition Grid_sse4.h:445
static __m128 rotate(__m128 in, int n)
Definition Grid_sse4.h:436
static __m128 tRotate(__m128 in)
Definition Grid_sse4.h:453
static __m128d tRotate(__m128d in)
Definition Grid_sse4.h:454
__m128i operator()(__m128i a, __m128i b)
Definition Grid_sse4.h:175
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:171
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:167
__m128 operator()(__m128 a, __m128 b)
Definition Grid_sse4.h:152
__m128d operator()(__m128d a, __m128d b)
Definition Grid_sse4.h:156
__m128i operator()(__m128i a, __m128i b)
Definition Grid_sse4.h:160
__m128d operator()(__m128d in)
Definition Grid_sse4.h:294
__m128 operator()(__m128 in)
Definition Grid_sse4.h:289
__m128d operator()(__m128d in)
Definition Grid_sse4.h:281
__m128 operator()(__m128 in)
Definition Grid_sse4.h:276
__m128i operator()(Integer *a)
Definition Grid_sse4.h:129
__m128 operator()(float *a)
Definition Grid_sse4.h:121
__m128d operator()(double *a)
Definition Grid_sse4.h:125
__m128 operator()(Grid::ComplexF *a)
Definition Grid_sse4.h:113
__m128d operator()(Grid::ComplexD *a)
Definition Grid_sse4.h:117
__m128 operator()(float a)
Definition Grid_sse4.h:65
__m128d operator()(double a)
Definition Grid_sse4.h:73
__m128i operator()(Integer a)
Definition Grid_sse4.h:77
__m128 operator()(float a, float b)
Definition Grid_sse4.h:61
__m128d operator()(double a, double b)
Definition Grid_sse4.h:69
void operator()(__m128i a, Integer *I)
Definition Grid_sse4.h:92
void operator()(__m128 a, float *F)
Definition Grid_sse4.h:84
void operator()(__m128d a, double *D)
Definition Grid_sse4.h:88
void operator()(double *a, __m128d b)
Definition Grid_sse4.h:104
void operator()(float *a, __m128 b)
Definition Grid_sse4.h:100
double f[2]
Definition Grid_neon.h:62
float64x2_t v
Definition Grid_neon.h:61
float32x4_t v
Definition Grid_neon.h:57
float f[4]
Definition Grid_neon.h:58
__m256 f
Definition Grid_avx.h:46
vtype v
Definition Grid_avx.h:47