Grid 0.7.0
Grid_avx512.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_avx512.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: neo <cossu@post.kek.jp>
11Author: paboyle <paboyle@ph.ed.ac.uk>
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27 See the full license in the file "LICENSE" in the top level distribution directory
28*************************************************************************************/
29/* END LEGAL */
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
31
33NAMESPACE_BEGIN(Optimization);
34
// Type-punning union: view an AVX-512 single-precision vector as 16 scalar floats.
union u512f {
  __m512 v;
  float f[16];
};
39
// Type-punning union: view an AVX-512 double-precision vector as 8 scalar doubles.
union u512d {
  __m512d v;
  double f[8];
};
44
// Broadcast scalar value(s) across all lanes of a 512-bit vector.
struct Vsplat{
  // Complex float: fill 8 complex lanes with (a + ib); _mm512_set_ps takes
  // arguments highest lane first, so a lands in each even (real) slot.
  inline __m512 operator()(float a, float b){
    return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
  }
  // Real float: broadcast a to all 16 lanes
  inline __m512 operator()(float a){
    return _mm512_set1_ps(a);
  }
  // Complex double: fill 4 complex lanes with (a + ib)
  inline __m512d operator()(double a, double b){
    return _mm512_set_pd(b,a,b,a,b,a,b,a);
  }
  // Real double: broadcast a to all 8 lanes
  inline __m512d operator()(double a){
    return _mm512_set1_pd(a);
  }
  // Integer: broadcast a to all 16 32-bit lanes
  inline __m512i operator()(Integer a){
    return _mm512_set1_epi32(a);
  }
};
67
// Store a full 512-bit vector to memory.
// NOTE(review): these are the *aligned* store intrinsics, so the destination
// pointer must be 64-byte aligned — confirm callers guarantee this.
struct Vstore{
  //Float
  inline void operator()(__m512 a, float* F){
    _mm512_store_ps(F,a);
  }
  //Double
  inline void operator()(__m512d a, double* D){
    _mm512_store_pd(D,a);
  }
  //Integer
  inline void operator()(__m512i a, Integer* I){
    _mm512_store_si512((__m512i *)I,a);
  }

};
83
// Non-temporal (streaming) store of a full vector: writes bypass the cache,
// useful for large write-only buffers. Destination must be 64-byte aligned.
struct Vstream{
  //Float
  inline void operator()(float * a, __m512 b){
    _mm512_stream_ps(a,b);
    //    _mm512_store_ps(a,b); // cached alternative, kept for reference
  }
  //Double
  inline void operator()(double * a, __m512d b){
    _mm512_stream_pd(a,b);
    //    _mm512_store_pd(a,b); // cached alternative, kept for reference
  }

};
97
// Fill a vector from an array in memory, element a[0] into the lowest lane.
// _mm512_set_* takes its arguments highest lane first, hence the reversed
// argument order below.
struct Vset{
  // Complex float: 8 complex values -> 16 lanes, (real,imag) interleaved
  inline __m512 operator()(Grid::ComplexF *a){
    return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
                         a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
                         a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
                         a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
  }
  // Complex double: 4 complex values -> 8 lanes
  inline __m512d operator()(Grid::ComplexD *a){
    return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
                         a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
  }
  // Real float
  inline __m512 operator()(float *a){
    return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
                          a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
  }
  // Real double
  inline __m512d operator()(double *a){
    return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
  }
  // Integer
  inline __m512i operator()(Integer *a){
    return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
                             a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
  }

};
127
// Horizontal reduction of a SIMD vector to a scalar.
// Only the explicit specialisations further down are meaningful; this
// generic fallback aborts at run time so that any unspecialised
// instantiation is caught immediately.
template <typename Out_type, typename In_type>
struct Reduce{
  //Need templated class to overload output type
  //General form must generate error if compiled
  inline Out_type operator()(In_type in){
    // Diagnostics belong on stderr, not stdout (was printf).
    fprintf(stderr, "Error, using wrong Reduce function\n");
    exit(1);
    return 0; // unreachable; keeps the compiler's missing-return check happy
  }
};
138
139
141// Arithmetic operations
// Lane-wise addition for all supported vector types.
struct Sum{
  //Complex/Real float
  inline __m512 operator()(__m512 a, __m512 b){
    return _mm512_add_ps(a,b);
  }
  //Complex/Real double
  inline __m512d operator()(__m512d a, __m512d b){
    return _mm512_add_pd(a,b);
  }
  //Integer (32-bit lanes)
  inline __m512i operator()(__m512i a, __m512i b){
    return _mm512_add_epi32(a,b);
  }
};
157
// Lane-wise subtraction (a - b) for all supported vector types.
struct Sub{
  //Complex/Real float
  inline __m512 operator()(__m512 a, __m512 b){
    return _mm512_sub_ps(a,b);
  }
  //Complex/Real double
  inline __m512d operator()(__m512d a, __m512d b){
    return _mm512_sub_pd(a,b);
  }
  //Integer (32-bit lanes)
  inline __m512i operator()(__m512i a, __m512i b){
    return _mm512_sub_epi32(a,b);
  }
};
172
173// Note, we can beat the shuf overhead in chain with two temporaries
174// Ar Ai , Br Bi, Ai Ar // one shuf
175//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
176//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
177// add tmpi,shuf(tmpi)
178// sub tmpr,shuf(tmpi)
179// shuf(tmpr,tmpi). // Could drop/trade for write mask
180
181// Gives
182// 2mul,4 mac +add+sub = 8 flop type insns
183// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
184
// Multiply b by the real part of a only: per complex pair the result is
// (ar*br, ar*bi).
struct MultRealPart{
  inline __m512 operator()(__m512 a, __m512 b){
    __m512 ymm0;
    ymm0  = _mm512_moveldup_ps(a); // ymm0 <- ar ar, (duplicate even lanes)
    return _mm512_mul_ps(ymm0,b);  // ymm0 <- ar bi, ar br
  }
  inline __m512d operator()(__m512d a, __m512d b){
    __m512d ymm0;
    ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 (duplicate even lanes)
    return _mm512_mul_pd(ymm0,b);       // ymm0 <- ar bi, ar br
  }
};
// Fused multiply-add variant of MultRealPart: c + Re(a)*b in one rounding.
struct MaddRealPart{
  inline __m512 operator()(__m512 a, __m512 b, __m512 c){
    __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, (duplicate even lanes)
    return _mm512_fmadd_ps( ymm0, b, c);
  }
  inline __m512d operator()(__m512d a, __m512d b, __m512d c){
    __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); // duplicate even (real) lanes
    return _mm512_fmadd_pd( ymm0, b, c);
  }
};
207
// Complex multiply a*b on interleaved (re,im) lane pairs.
struct MultComplex{
  // Complex float
  inline __m512 operator()(__m512 a, __m512 b){
    // dup, dup, perm, mul, madd
    __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
    __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
    a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
    return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
  }
  // Complex double
  inline __m512d operator()(__m512d a, __m512d b){
    __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 ); // Ar Ar (duplicate even lanes)
    __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF ); // Ai Ai (duplicate odd lanes)
    a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) ); // Ai*Bi, Ai*Br
    return _mm512_fmaddsub_pd( a_real, b, a_imag );  // ArBr-AiBi , ArBi+AiBr
  }
};
225
// Lane-wise multiply; also provides fused multiply-accumulate (mac).
struct Mult{

  // a += b*c with a single rounding (FMA)
  inline void mac(__m512 &a, __m512 b, __m512 c){
    a= _mm512_fmadd_ps( b, c, a);
  }
  inline void mac(__m512d &a, __m512d b, __m512d c){
    a= _mm512_fmadd_pd( b, c, a);
  }
  // Real float
  inline __m512 operator()(__m512 a, __m512 b){
    return _mm512_mul_ps(a,b);
  }
  // Real double
  inline __m512d operator()(__m512d a, __m512d b){
    return _mm512_mul_pd(a,b);
  }
  // Integer: low 32 bits of each 32x32 product
  inline __m512i operator()(__m512i a, __m512i b){
    return _mm512_mullo_epi32(a,b);
  }
};
247
// Lane-wise division (floating point only; no integer form is provided).
struct Div{
  // Real float
  inline __m512 operator()(__m512 a, __m512 b){
    return _mm512_div_ps(a,b);
  }
  // Real double
  inline __m512d operator()(__m512d a, __m512d b){
    return _mm512_div_pd(a,b);
  }
};
258
259
// Complex conjugate: negate the imaginary (odd) lanes via a masked
// subtract from zero; real (even) lanes pass through unchanged.
struct Conj{
  // Complex single: mask 0xaaaa selects the 8 odd (imaginary) lanes
  inline __m512 operator()(__m512 in){
    return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
  }
  // Complex double: mask 0xaa selects the 4 odd (imaginary) lanes
  inline __m512d operator()(__m512d in){
    return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
  }
  // do not define for integer input
};
271
// Multiply by -i: (re,im) -> (im,-re). Swap each (re,im) pair, then negate
// the new imaginary (odd) lanes.
struct TimesMinusI{
  //Complex single
  inline __m512 operator()(__m512 in){
    //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
    //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // swap re/im in each pair
    return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);       // negate odd (imag) lanes
  }
  //Complex double
  inline __m512d operator()(__m512d in){
    //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
    //return _mm512_shuffle_pd(tmp,tmp,0x55);
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);                   // swap re/im in each pair
    return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);   // negate odd (imag) lanes
  }
};
288
// Multiply by +i: (re,im) -> (-im,re). Swap each (re,im) pair, then negate
// the new real (even) lanes.
struct TimesI{
  //Complex single
  inline __m512 operator()(__m512 in){
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // swap re/im in each pair
    return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);       // negate even (real) lanes
  }
  //Complex double
  inline __m512d operator()(__m512d in){
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);                   // swap re/im in each pair
    return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);   // negate even (real) lanes
  }

};
302
303// Gpermute utilities consider coalescing into 1 Gpermute
// Gpermute utilities consider coalescing into 1 Gpermute
// Lane permutations at successively finer granularity: PermuteN swaps
// blocks of decreasing size (0 coarsest). NOTE(review): for double,
// Permute3 would act below element width, so it is the identity.
struct Permute{

  static inline __m512 Permute0(__m512 in){
    return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // swap 256-bit halves
  };
  static inline __m512 Permute1(__m512 in){
    return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // swap 128-bit lanes in each half
  };
  static inline __m512 Permute2(__m512 in){
    return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // swap 64-bit pairs
  };
  static inline __m512 Permute3(__m512 in){
    return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // swap 32-bit neighbours
  };

  static inline __m512d Permute0(__m512d in){
    return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // swap 256-bit halves
  };
  static inline __m512d Permute1(__m512d in){
    return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // swap 128-bit lanes in each half
  };
  static inline __m512d Permute2(__m512d in){
    return _mm512_shuffle_pd(in,in,0x55); // swap 64-bit neighbours
  };
  static inline __m512d Permute3(__m512d in){
    return in; // identity: 32-bit granularity does not exist for double
  };

};
333#define USE_FP16
// Precision conversions. Naming: H = half (32 fp16 values packed in a
// __m512i), S = single (__m512, 16 floats), D = double (__m512d, 8 doubles).
// Each XtoY packs two (or four) inputs into outputs of half the precision,
// and the YtoX direction unpacks them again.
struct PrecisionChange {
  // Pack two single-precision vectors into one vector of 32 fp16 values.
  static inline __m512i StoH (__m512 a,__m512 b) {
    __m512i h;
#ifdef USE_FP16
    __m256i ha = _mm512_cvtps_ph(a,0); // rounding imm 0: round to nearest even
    __m256i hb = _mm512_cvtps_ph(b,0);
    // The casts below only reinterpret bits so the 256-bit halves can be
    // placed with the f64x4 insert instruction.
    h =(__m512i) _mm512_castps256_ps512((__m256)ha);
    h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1);
#else
    assert(0);
#endif
    return h;
  }

  // Unpack 32 fp16 values into two single-precision vectors.
  static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
    sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0));
    sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1));
#else
    assert(0);
#endif
  }

  // Pack two double-precision vectors into one single-precision vector.
  static inline __m512 DtoS (__m512d a,__m512d b) {
    __m256 sa = _mm512_cvtpd_ps(a);
    __m256 sb = _mm512_cvtpd_ps(b);
    __m512 s = _mm512_castps256_ps512(sa);
    s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); // bit-cast only
    return s;
  }

  // Unpack one single-precision vector into two double-precision vectors.
  static inline void StoD (__m512 s,__m512d &a,__m512d &b) {
    a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0));
    b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1));
  }

  // Pack four double-precision vectors into one half-precision vector.
  static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
    __m512 sa,sb;
    sa = DtoS(a,b);
    sb = DtoS(c,d);
    return StoH(sa,sb);
  }

  // Unpack one half-precision vector into four double-precision vectors.
  static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) {
    __m512 sa,sb;
    HtoS(h,sa,sb);
    StoD(sa,a,b);
    StoD(sb,c,d);
  }
};
384// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
385// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
386// The operation is its own inverse
// Exchange blocks of elements between two vectors at successively finer
// granularity (Exchange0 coarsest: 128-bit blocks for float, down to
// Exchange3: single elements). Per the note above, each operation is its
// own inverse. Exchange3 for double would act below the complex-pair
// width and is therefore not defined (aborts).
struct Exchange{
  // 3210 ordering
  static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); // low 128-bit blocks of both
    out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); // high 128-bit blocks of both
  };
  static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); // even 128-bit blocks
    out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); // odd 128-bit blocks
    out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
  };
  static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); // low 64-bit pairs
    out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); // high 64-bit pairs
  };
  static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); // even elements
    out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); // odd elements
    out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
  };

  static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); // low 128-bit blocks of both
    out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); // high 128-bit blocks of both
  };
  static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); // even 128-bit blocks
    out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); // odd 128-bit blocks
    out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
  };
  static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    out1 = _mm512_shuffle_pd(in1,in2,0x00); // even elements
    out2 = _mm512_shuffle_pd(in1,in2,0xFF); // odd elements
  };
  static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    assert(0); // no 32-bit granularity for double precision
    return;
  };
};
429
430
// Rotate the lanes of a vector by n elements: result lane i receives lane
// (i+n) mod N (N = 16 for single precision, 8 for double). valignd/valignq
// require an immediate shift, so the run-time n is dispatched through a
// switch onto the templated immediate forms.
struct Rotate{

  static inline __m512 rotate(__m512 in,int n){
    switch(n){
    case 0: return tRotate<0>(in);break;
    case 1: return tRotate<1>(in);break;
    case 2: return tRotate<2>(in);break;
    case 3: return tRotate<3>(in);break;
    case 4: return tRotate<4>(in);break;
    case 5: return tRotate<5>(in);break;
    case 6: return tRotate<6>(in);break;
    case 7: return tRotate<7>(in);break;

    case 8 : return tRotate<8>(in);break;
    case 9 : return tRotate<9>(in);break;
    case 10: return tRotate<10>(in);break;
    case 11: return tRotate<11>(in);break;
    case 12: return tRotate<12>(in);break;
    case 13: return tRotate<13>(in);break;
    case 14: return tRotate<14>(in);break;
    case 15: return tRotate<15>(in);break;
    default:
      assert(0);  // invalid rotation count
      return in;  // unreachable; avoids falling off a non-void function
                  // (undefined behaviour) when compiled with NDEBUG
    }
  }
  static inline __m512d rotate(__m512d in,int n){
    switch(n){
    case 0: return tRotate<0>(in);break;
    case 1: return tRotate<1>(in);break;
    case 2: return tRotate<2>(in);break;
    case 3: return tRotate<3>(in);break;
    case 4: return tRotate<4>(in);break;
    case 5: return tRotate<5>(in);break;
    case 6: return tRotate<6>(in);break;
    case 7: return tRotate<7>(in);break;
    default:
      assert(0);  // invalid rotation count
      return in;  // unreachable; see comment above
    }
  }

  // Immediate-shift rotations; the casts reinterpret bits so the integer
  // valign instructions can be used on floating-point data.
  template<int n> static inline __m512 tRotate(__m512 in){
    return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
  };

  template<int n> static inline __m512d tRotate(__m512d in){
    return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
  };

};
478
480// Some Template specialization
481
482// Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
//Complex float Reduce
// Sum of all 8 complex lanes: even lanes (mask 0x5555) accumulate the real
// part, odd lanes (mask 0xAAAA) the imaginary part.
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
  return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
// Horizontal sum of all 16 single-precision lanes.
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
  return _mm512_reduce_add_ps(in);
}
493
//Complex double Reduce
// Sum of all 4 complex lanes: even lanes (mask 0x55) accumulate the real
// part, odd lanes (mask 0xAA) the imaginary part.
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
  return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
499
//Real double Reduce
// Horizontal sum of all 8 double-precision lanes.
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
  return _mm512_reduce_add_pd(in);
}
505
506//Integer Reduce
507template<>
509 return _mm512_reduce_add_epi32(in);
510}
511
512NAMESPACE_END(Optimization);
513
// Here assign types

typedef __m512i SIMD_Htype; // Half precision type (packed fp16, see PrecisionChange)
typedef __m512  SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
521
// prefetch
// Walk `size` bytes from ptr one 64-byte cache line at a time, issuing a
// near prefetch (ptr+512, hint T0: all cache levels) and a far prefetch
// (ptr+4096, hint T1: outer levels) ahead of the sweep.
inline void v_prefetch0(int size, const char *ptr){
  for(int i=0;i<size;i+=64){ //  Define L1 linesize above
    _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
    _mm_prefetch(ptr+i+512,_MM_HINT_T0);
  }
}
// Prefetch a single cache line at ptr into all cache levels (hint T0).
inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr,_MM_HINT_T0);
}
532
// Function name aliases: map the Optimization kernels above onto the
// generic *SIMD names consumed by the architecture-independent layer.
typedef Optimization::Vsplat   VsplatSIMD;
typedef Optimization::Vstore   VstoreSIMD;
typedef Optimization::Vset     VsetSIMD;
typedef Optimization::Vstream  VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

// Arithmetic operations
typedef Optimization::Sum          SumSIMD;
typedef Optimization::Sub          SubSIMD;
typedef Optimization::Mult         MultSIMD;
typedef Optimization::Div          DivSIMD;
typedef Optimization::MultComplex  MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj         ConjSIMD;
typedef Optimization::TimesMinusI  TimesMinusISIMD;
typedef Optimization::TimesI       TimesISIMD;
551
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
void prefetch_HINT_T0(const char *ptr)
void v_prefetch0(int size, const char *ptr)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
#define _MM_SELECT_FOUR_FOUR(A, B, C, D)
Definition Simd.h:48
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
__m512 operator()(__m512 in)
__m512d operator()(__m512d in)
__m512 operator()(__m512 a, __m512 b)
__m512d operator()(__m512d a, __m512d b)
static void Exchange2(__m512d &out1, __m512d &out2, __m512d in1, __m512d in2)
static void Exchange2(__m512 &out1, __m512 &out2, __m512 in1, __m512 in2)
static void Exchange0(__m512d &out1, __m512d &out2, __m512d in1, __m512d in2)
static void Exchange1(__m512 &out1, __m512 &out2, __m512 in1, __m512 in2)
static void Exchange3(__m512d &out1, __m512d &out2, __m512d in1, __m512d in2)
static void Exchange3(__m512 &out1, __m512 &out2, __m512 in1, __m512 in2)
static void Exchange0(__m512 &out1, __m512 &out2, __m512 in1, __m512 in2)
static void Exchange1(__m512d &out1, __m512d &out2, __m512d in1, __m512d in2)
__m512d operator()(__m512d a, __m512d b, __m512d c)
__m512 operator()(__m512 a, __m512 b, __m512 c)
__m512 operator()(__m512 a, __m512 b)
__m512d operator()(__m512d a, __m512d b)
__m512 operator()(__m512 a, __m512 b)
__m512d operator()(__m512d a, __m512d b)
__m512 operator()(__m512 a, __m512 b)
void mac(__m512d &a, __m512d b, __m512d c)
void mac(__m512 &a, __m512 b, __m512 c)
__m512d operator()(__m512d a, __m512d b)
__m512i operator()(__m512i a, __m512i b)
static __m512d Permute0(__m512d in)
static __m512d Permute1(__m512d in)
static __m512 Permute2(__m512 in)
static __m512d Permute2(__m512d in)
static __m512 Permute3(__m512 in)
static __m512d Permute3(__m512d in)
static __m512 Permute1(__m512 in)
static __m512 Permute0(__m512 in)
static vech StoH(const vecf &sa, const vecf &sb)
static void HtoD(__m512i h, __m512d &a, __m512d &b, __m512d &c, __m512d &d)
static __m512i DtoH(__m512d a, __m512d b, __m512d c, __m512d d)
static void HtoS(__m512i h, __m512 &sa, __m512 &sb)
static void StoD(vecf s, vecd &a, vecd &b)
static vecf DtoS(vecd a, vecd b)
static __m512i StoH(__m512 a, __m512 b)
static __m512 DtoS(__m512d a, __m512d b)
static void HtoS(vech h, vecf &sa, vecf &sb)
static void StoD(__m512 s, __m512d &a, __m512d &b)
Out_type operator()(In_type in)
static vec< T > tRotate(vec< T > in)
static __m512 rotate(__m512 in, int n)
static __m512 tRotate(__m512 in)
static __m512d rotate(__m512d in, int n)
static __m512d tRotate(__m512d in)
__m512 operator()(__m512 a, __m512 b)
__m512i operator()(__m512i a, __m512i b)
__m512d operator()(__m512d a, __m512d b)
__m512 operator()(__m512 a, __m512 b)
__m512i operator()(__m512i a, __m512i b)
__m512d operator()(__m512d a, __m512d b)
__m512d operator()(__m512d in)
__m512 operator()(__m512 in)
__m512 operator()(__m512 in)
__m512d operator()(__m512d in)
__m512 operator()(float *a)
__m512d operator()(Grid::ComplexD *a)
__m512i operator()(Integer *a)
__m512d operator()(double *a)
__m512 operator()(Grid::ComplexF *a)
__m512d operator()(double a, double b)
Definition Grid_avx512.h:55
__m512d operator()(double a)
Definition Grid_avx512.h:59
__m512 operator()(float a)
Definition Grid_avx512.h:51
__m512i operator()(Integer a)
Definition Grid_avx512.h:63
__m512 operator()(float a, float b)
Definition Grid_avx512.h:47
void operator()(__m512 a, float *F)
Definition Grid_avx512.h:70
void operator()(__m512d a, double *D)
Definition Grid_avx512.h:74
void operator()(__m512i a, Integer *I)
Definition Grid_avx512.h:78
void operator()(float *a, __m512 b)
Definition Grid_avx512.h:86
void operator()(double *a, __m512d b)
Definition Grid_avx512.h:91
__m512d v
Definition Grid_avx512.h:41
double f[8]
Definition Grid_avx512.h:42
float f[16]
Definition Grid_avx512.h:37
__m512 v
Definition Grid_avx512.h:36