Grid 0.7.0
Grid_imci.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_imci.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: paboyle <paboyle@ph.ed.ac.uk>
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License along
23 with this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25
26 See the full license in the file "LICENSE" in the top level distribution directory
27 *************************************************************************************/
28 /* END LEGAL */
29
30#include <immintrin.h>
31#include <zmmintrin.h>
32
33namespace Grid{
34namespace Optimization {
35
36 struct Vsplat{
37 //Complex float
38 inline __m512 operator()(float a, float b){
39 return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
40 }
41 // Real float
42 inline __m512 operator()(float a){
43 return _mm512_set1_ps(a);
44 }
45 //Complex double
46 inline __m512d operator()(double a, double b){
47 return _mm512_set_pd(b,a,b,a,b,a,b,a);
48 }
49 //Real double
50 inline __m512d operator()(double a){
51 return _mm512_set1_pd(a);
52 }
53 //Integer
54 inline __m512i operator()(Integer a){
55 return _mm512_set1_epi32(a);
56 }
57 };
58
59 struct Vstore{
60 //Float
61 inline void operator()(__m512 a, float* F){
62 _mm512_store_ps(F,a);
63 }
64 //Double
65 inline void operator()(__m512d a, double* D){
66 _mm512_store_pd(D,a);
67 }
68 //Integer
69 inline void operator()(__m512i a, Integer* I){
70 _mm512_store_si512((__m512i *)I,a);
71 }
72
73 };
74
75
76 struct Vstream{
77 //Float
78 inline void operator()(float * a, __m512 b){
79 _mm512_storenrngo_ps(a,b);
80 }
81 //Double
82 inline void operator()(double * a, __m512d b){
83 _mm512_storenrngo_pd(a,b);
84 }
85
86
87 };
88
89
90
91 struct Vset{
92 // Complex float
93 inline __m512 operator()(Grid::ComplexF *a){
94 return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
95 a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
96 a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
97 a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
98 }
99 // Complex double
100 inline __m512d operator()(Grid::ComplexD *a){
101 return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
102 a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
103 }
104 // Real float
105 inline __m512 operator()(float *a){
106 return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
107 a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
108 }
109 // Real double
110 inline __m512d operator()(double *a){
111 return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
112 }
113 // Integer
114 inline __m512i operator()(Integer *a){
115 return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
116 a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
117 }
118
119
120 };
121
122 template <typename Out_type, typename In_type>
123 struct Reduce{
124 //Need templated class to overload output type
125 //General form must generate error if compiled
126 inline Out_type operator()(In_type in){
127 printf("Error, using wrong Reduce function\n");
128 exit(1);
129 return 0;
130 }
131 };
132
133
134
135
137 // Arithmetic operations
139 struct Sum{
140 //Complex/Real float
141 inline __m512 operator()(__m512 a, __m512 b){
142 return _mm512_add_ps(a,b);
143 }
144 //Complex/Real double
145 inline __m512d operator()(__m512d a, __m512d b){
146 return _mm512_add_pd(a,b);
147 }
148 //Integer
149 inline __m512i operator()(__m512i a, __m512i b){
150 return _mm512_add_epi32(a,b);
151 }
152 };
153
154 struct Sub{
155 //Complex/Real float
156 inline __m512 operator()(__m512 a, __m512 b){
157 return _mm512_sub_ps(a,b);
158 }
159 //Complex/Real double
160 inline __m512d operator()(__m512d a, __m512d b){
161 return _mm512_sub_pd(a,b);
162 }
163 //Integer
164 inline __m512i operator()(__m512i a, __m512i b){
165 return _mm512_sub_epi32(a,b);
166 }
167 };
168
169
170 struct MultComplex{
171 // Complex float
172 inline __m512 operator()(__m512 a, __m512 b){
173 __m512 vzero,ymm0,ymm1,real, imag;
174 vzero = _mm512_setzero_ps();
175 ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
176 real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
177 imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
178 ymm1 = _mm512_mul_ps(real, b);
179 ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
180 return _mm512_fmadd_ps(ymm0,imag,ymm1);
181 }
182 // Complex double
183 inline __m512d operator()(__m512d a, __m512d b){
184 /* This is from
185 * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
186 * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
187 * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
188 * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
189 * booktitle = {Proceedings of the International Conference on Supercomputing},
190 * series = {ICS '11},
191 * year = {2011},
192 * isbn = {978-1-4503-0102-2},
193 * location = {Tucson, Arizona, USA},
194 * pages = {265--274},
195 * numpages = {10},
196 * url = {http://doi.acm.org/10.1145/1995896.1995938},
197 * doi = {10.1145/1995896.1995938},
198 * acmid = {1995938},
199 * publisher = {ACM},
200 * address = {New York, NY, USA},
201 * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
202 * }
203 */
204 __m512d vzero,ymm0,ymm1,real,imag;
205 vzero =_mm512_setzero_pd();
206 ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
207 real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
208 imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
209 ymm1 = _mm512_mul_pd(real, b);
210 ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
211 return _mm512_fmadd_pd(ymm0,imag,ymm1);
212 }
213 };
214
215 struct Mult{
216
217 inline void mac(__m512 &a, __m512 b, __m512 c){
218 a= _mm512_fmadd_ps( b, c, a);
219 }
220
221 inline void mac(__m512d &a, __m512d b, __m512d c){
222 a= _mm512_fmadd_pd( b, c, a);
223 }
224
225 // Real float
226 inline __m512 operator()(__m512 a, __m512 b){
227 return _mm512_mul_ps(a,b);
228 }
229 // Real double
230 inline __m512d operator()(__m512d a, __m512d b){
231 return _mm512_mul_pd(a,b);
232 }
233 // Integer
234 inline __m512i operator()(__m512i a, __m512i b){
235 return _mm512_mullo_epi32(a,b);
236 }
237 };
238
239 struct Div{
240 // Real float
241 inline __m512 operator()(__m512 a, __m512 b){
242 return _mm512_div_ps(a,b);
243 }
244 // Real double
245 inline __m512d operator()(__m512d a, __m512d b){
246 return _mm512_div_pd(a,b);
247 }
248 };
249
250
251 struct Conj{
252 // Complex single
253 inline __m512 operator()(__m512 in){
254 return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
255 }
256 // Complex double
257 inline __m512d operator()(__m512d in){
258 return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
259 }
260 // do not define for integer input
261 };
262
263 struct TimesMinusI{
264 //Complex single
265 inline __m512 operator()(__m512 in, __m512 ret){
266 __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
267 return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
268 }
269 //Complex double
270 inline __m512d operator()(__m512d in, __m512d ret){
271 __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
272 return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
273 }
274
275
276 };
277
278 struct TimesI{
279 //Complex single
280 inline __m512 operator()(__m512 in, __m512 ret){
281 __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
282 return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
283 }
284 //Complex double
285 inline __m512d operator()(__m512d in, __m512d ret){
286 __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
287 return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
288 }
289
290
291 };
292
293
294 struct Permute{
295
296 static inline __m512 Permute0(__m512 in){
297 return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
298 };
299 static inline __m512 Permute1(__m512 in){
300 return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
301 };
302 static inline __m512 Permute2(__m512 in){
303 return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
304 };
305 static inline __m512 Permute3(__m512 in){
306 return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
307 };
308
309 static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
310 return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
311 };
312 static inline __m512d Permute1(__m512d in){
313 return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
314 };
315 static inline __m512d Permute2(__m512d in){
316 return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
317 };
318 static inline __m512d Permute3(__m512d in){
319 return in;
320 };
321
322 };
323
324 struct Rotate{
325
326 static inline __m512 rotate(__m512 in,int n){
327 switch(n){
328 case 0: return tRotate<0>(in);break;
329 case 1: return tRotate<1>(in);break;
330 case 2: return tRotate<2>(in);break;
331 case 3: return tRotate<3>(in);break;
332 case 4: return tRotate<4>(in);break;
333 case 5: return tRotate<5>(in);break;
334 case 6: return tRotate<6>(in);break;
335 case 7: return tRotate<7>(in);break;
336
337 case 8 : return tRotate<8>(in);break;
338 case 9 : return tRotate<9>(in);break;
339 case 10: return tRotate<10>(in);break;
340 case 11: return tRotate<11>(in);break;
341 case 12: return tRotate<12>(in);break;
342 case 13: return tRotate<13>(in);break;
343 case 14: return tRotate<14>(in);break;
344 case 15: return tRotate<15>(in);break;
345 default: assert(0);
346 }
347 }
348 static inline __m512d rotate(__m512d in,int n){
349 switch(n){
350 case 0: return tRotate<0>(in);break;
351 case 1: return tRotate<1>(in);break;
352 case 2: return tRotate<2>(in);break;
353 case 3: return tRotate<3>(in);break;
354 case 4: return tRotate<4>(in);break;
355 case 5: return tRotate<5>(in);break;
356 case 6: return tRotate<6>(in);break;
357 case 7: return tRotate<7>(in);break;
358 default: assert(0);
359 }
360 }
361
362 template<int n> static inline __m512 tRotate(__m512 in){
363 return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
364 };
365
366 template<int n> static inline __m512d tRotate(__m512d in){
367 return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
368 };
369
370 };
371
372
373
375 // Some Template specialization
376
377 //Complex float Reduce
378 template<>
379 inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
380 return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
381 }
382 //Real float Reduce
383 template<>
384 inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
385 return _mm512_reduce_add_ps(in);
386 }
387
388
389 //Complex double Reduce
390 template<>
391 inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
392 return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
393 }
394
395 //Real double Reduce
396 template<>
397 inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
398 return _mm512_reduce_add_pd(in);
399 }
400
401 //Integer Reduce
402 template<>
404 return _mm512_reduce_add_epi32(in);
405 }
406
407
408}
409
411// Here assign types
412
413 typedef __m512 SIMD_Ftype; // Single precision type
414 typedef __m512d SIMD_Dtype; // Double precision type
415 typedef __m512i SIMD_Itype; // Integer type
416
417 // prefecth
418 inline void v_prefetch0(int size, const char *ptr){
419 for(int i=0;i<size;i+=64){ // Define L1 linesize above
420 _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
421 _mm_prefetch(ptr+i+512,_MM_HINT_T0);
422 }
423 }
424 inline void prefetch_HINT_T0(const char *ptr){
425 _mm_prefetch(ptr,_MM_HINT_T0);
426 }
427
428
429
430 // Function name aliases
435 template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
436
437
438 // Arithmetic operations
447
448}
accelerator_inline void vzero(Grid_simd2< S, V > &ret)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
uint32_t Integer
Definition Simd.h:58
#define _MM_SELECT_FOUR_FOUR(A, B, C, D)
Definition Simd.h:48
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Div DivSIMD
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
Optimization::TimesI TimesISIMD
GpuVectorRD SIMD_Dtype
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
GpuVectorRF SIMD_Ftype
GpuVectorI SIMD_Itype
Optimization::Vset VsetSIMD
Optimization::Vstore VstoreSIMD
Optimization::Sub SubSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::Vstream VstreamSIMD
__m512d operator()(__m512d in)
Definition Grid_imci.h:257
__m512 operator()(__m512 in)
Definition Grid_imci.h:253
__m512d operator()(__m512d a, __m512d b)
Definition Grid_imci.h:245
__m512 operator()(__m512 a, __m512 b)
Definition Grid_imci.h:241
__m512 operator()(__m512 a, __m512 b)
Definition Grid_imci.h:172
__m512d operator()(__m512d a, __m512d b)
Definition Grid_imci.h:183
__m512i operator()(__m512i a, __m512i b)
Definition Grid_imci.h:234
__m512d operator()(__m512d a, __m512d b)
Definition Grid_imci.h:230
void mac(__m512d &a, __m512d b, __m512d c)
Definition Grid_imci.h:221
__m512 operator()(__m512 a, __m512 b)
Definition Grid_imci.h:226
void mac(__m512 &a, __m512 b, __m512 c)
Definition Grid_imci.h:217
static __m512 Permute3(__m512 in)
Definition Grid_imci.h:305
static __m512d Permute1(__m512d in)
Definition Grid_imci.h:312
static __m512d Permute3(__m512d in)
Definition Grid_imci.h:318
static __m512d Permute2(__m512d in)
Definition Grid_imci.h:315
static __m512 Permute1(__m512 in)
Definition Grid_imci.h:299
static __m512d Permute0(__m512d in)
Definition Grid_imci.h:309
static __m512 Permute0(__m512 in)
Definition Grid_imci.h:296
static __m512 Permute2(__m512 in)
Definition Grid_imci.h:302
Out_type operator()(In_type in)
Definition Grid_imci.h:126
accelerator_inline Out_type operator()(In_type in)
static __m512 rotate(__m512 in, int n)
Definition Grid_imci.h:326
static __m512 tRotate(__m512 in)
Definition Grid_imci.h:362
static __m512d rotate(__m512d in, int n)
Definition Grid_imci.h:348
static accelerator_inline vec tRotate(vec in)
static __m512d tRotate(__m512d in)
Definition Grid_imci.h:366
__m512 operator()(__m512 a, __m512 b)
Definition Grid_imci.h:156
__m512i operator()(__m512i a, __m512i b)
Definition Grid_imci.h:164
__m512d operator()(__m512d a, __m512d b)
Definition Grid_imci.h:160
__m512d operator()(__m512d a, __m512d b)
Definition Grid_imci.h:145
__m512i operator()(__m512i a, __m512i b)
Definition Grid_imci.h:149
__m512 operator()(__m512 a, __m512 b)
Definition Grid_imci.h:141
__m512d operator()(__m512d in, __m512d ret)
Definition Grid_imci.h:285
__m512 operator()(__m512 in, __m512 ret)
Definition Grid_imci.h:280
__m512 operator()(__m512 in, __m512 ret)
Definition Grid_imci.h:265
__m512d operator()(__m512d in, __m512d ret)
Definition Grid_imci.h:270
__m512 operator()(Grid::ComplexF *a)
Definition Grid_imci.h:93
__m512i operator()(Integer *a)
Definition Grid_imci.h:114
__m512d operator()(double *a)
Definition Grid_imci.h:110
__m512 operator()(float *a)
Definition Grid_imci.h:105
__m512d operator()(Grid::ComplexD *a)
Definition Grid_imci.h:100
__m512 operator()(float a, float b)
Definition Grid_imci.h:38
__m512 operator()(float a)
Definition Grid_imci.h:42
__m512d operator()(double a)
Definition Grid_imci.h:50
__m512d operator()(double a, double b)
Definition Grid_imci.h:46
__m512i operator()(Integer a)
Definition Grid_imci.h:54
void operator()(__m512d a, double *D)
Definition Grid_imci.h:65
void operator()(__m512 a, float *F)
Definition Grid_imci.h:61
void operator()(__m512i a, Integer *I)
Definition Grid_imci.h:69
void operator()(double *a, __m512d b)
Definition Grid_imci.h:82
void operator()(float *a, __m512 b)
Definition Grid_imci.h:78