Grid 0.7.0
Grid_generic.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_generic.h
6
7 Copyright (C) 2015
8 Copyright (C) 2017
9
10Author: Antonin Portelli <antonin.portelli@me.com>
11 Andrew Lawson <andrew.lawson1991@gmail.com>
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27 See the full license in the file "LICENSE" in the top level distribution directory
28*************************************************************************************/
29/* END LEGAL */
30
31#include "Grid_generic_types.h"
32
34NAMESPACE_BEGIN(Optimization);
35
36struct Vsplat{
37 // Complex
38 template <typename T>
40 vec<T> out;
41
42 VECTOR_FOR(i, W<T>::r, 2)
43 {
44 out.v[i] = a;
45 out.v[i+1] = b;
46 }
47
48 return out;
49 }
50
51 // Real
52 template <typename T>
54 vec<T> out;
55
56 VECTOR_FOR(i, W<T>::r, 1)
57 {
58 out.v[i] = a;
59 }
60
61 return out;
62 }
63};
64
65struct Vstore{
66 // Real
67 template <typename T>
69 *((vec<T> *)D) = a;
70 }
71};
72
73struct Vstream{
74 // Real
75 template <typename T>
77 *((vec<T> *)a) = b;
78 }
79};
80
81struct Vset{
82 // Complex
83 template <typename T>
85 vec<T> out;
86
87 VECTOR_FOR(i, W<T>::c, 1)
88 {
89 out.v[2*i] = a[i].real();
90 out.v[2*i+1] = a[i].imag();
91 }
92
93 return out;
94 }
95
96 // Real
97 template <typename T>
99 vec<T> out;
100
101 out = *((vec<T> *)a);
102
103 return out;
104 }
105};
106
108// Arithmetic operations
110struct Sum{
111 // Complex/Real
112 template <typename T>
114 vec<T> out;
115
116 VECTOR_FOR(i, W<T>::r, 1)
117 {
118 out.v[i] = a.v[i] + b.v[i];
119 }
120
121 return out;
122 }
123};
124
125struct Sub{
126 // Complex/Real
127 template <typename T>
129 vec<T> out;
130
131 VECTOR_FOR(i, W<T>::r, 1)
132 {
133 out.v[i] = a.v[i] - b.v[i];
134 }
135
136 return out;
137 }
138};
139
140struct Mult{
141 // Real
142 template <typename T>
144 vec<T> out;
145
146 VECTOR_FOR(i, W<T>::r, 1)
147 {
148 out.v[i] = a.v[i]*b.v[i];
149 }
150
151 return out;
152 }
153};
154
155#define cmul(a, b, c, i) \
156 c[i] = a[i]*b[i] - a[i+1]*b[i+1]; \
157 c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
158
159struct MultRealPart{
160 template <typename T>
162 vec<T> out;
163
164 VECTOR_FOR(i, W<T>::c, 1)
165 {
166 out.v[2*i] = a.v[2*i]*b.v[2*i];
167 out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
168 }
169 return out;
170 }
171};
172
173struct MaddRealPart{
174 template <typename T>
176 vec<T> out;
177
178 VECTOR_FOR(i, W<T>::c, 1)
179 {
180 out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i];
181 out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1];
182 }
183 return out;
184 }
185};
186
187struct MultComplex{
188 // Complex
189 template <typename T>
191 vec<T> out;
192
193 VECTOR_FOR(i, W<T>::c, 1)
194 {
195 cmul(a.v, b.v, out.v, 2*i);
196 }
197
198 return out;
199 }
200};
201
202#undef cmul
203
204struct Div{
205 // Real
206 template <typename T>
208 vec<T> out;
209
210 VECTOR_FOR(i, W<T>::r, 1)
211 {
212 out.v[i] = a.v[i]/b.v[i];
213 }
214
215 return out;
216 }
217};
218
219#define conj(a, b, i) \
220 b[i] = a[i]; \
221 b[i+1] = -a[i+1];
222
223struct Conj{
224 // Complex
225 template <typename T>
227 vec<T> out;
228
229 VECTOR_FOR(i, W<T>::c, 1)
230 {
231 conj(a.v, out.v, 2*i);
232 }
233
234 return out;
235 }
236};
237
238#undef conj
239
240#define timesmi(a, b, i) \
241 b[i] = a[i+1]; \
242 b[i+1] = -a[i];
243
244struct TimesMinusI{
245 // Complex
246 template <typename T>
248 vec<T> out;
249
250 VECTOR_FOR(i, W<T>::c, 1)
251 {
252 timesmi(a.v, out.v, 2*i);
253 }
254
255 return out;
256 }
257};
258
259#undef timesmi
260
261#define timesi(a, b, i) \
262 b[i] = -a[i+1]; \
263 b[i+1] = a[i];
264
265struct TimesI{
266 // Complex
267 template <typename T>
269 vec<T> out;
270
271 VECTOR_FOR(i, W<T>::c, 1)
272 {
273 timesi(a.v, out.v, 2*i);
274 }
275
276 return out;
277 }
278};
279
280#undef timesi
281
282struct PrecisionChange {
283 static accelerator_inline vech StoH (const vecf &a,const vecf &b) {
284 vech ret;
285 const int nf = W<float>::r;
286#ifdef USE_FP16
287 vech *ha = (vech *)&a;
288 vech *hb = (vech *)&b;
289 // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; }
290 // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
291 VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; }
292 VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
293#else
294 VECTOR_FOR(i, nf,1){ ret.v[i]=0; }
295 assert(0);
296#endif
297 return ret;
298 }
299 static accelerator_inline void HtoS (vech h,vecf &sa,vecf &sb) {
300#ifdef USE_FP16
301 const int nf = W<float>::r;
302 const int nh = W<uint16_t>::r;
303 vech *ha = (vech *)&sa;
304 vech *hb = (vech *)&sb;
305 VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; }
306 // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];}
307 // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
308 VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
309 VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
310#else
311 assert(0);
312#endif
313 }
315 const int nd = W<double>::r;
316 vecf ret;
317 VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; }
318 VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; }
319 return ret;
320 }
321 static accelerator_inline void StoD (vecf s,vecd &a,vecd &b) {
322 const int nd = W<double>::r;
323 VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; }
324 VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; }
325 }
327 vecf sa,sb;
328 sa = DtoS(a,b);
329 sb = DtoS(c,d);
330 return StoH(sa,sb);
331 }
332 static accelerator_inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
333 vecf sa,sb;
334 HtoS(h,sa,sb);
335 StoD(sa,a,b);
336 StoD(sb,c,d);
337 }
338};
339
341// Exchange support
342struct Exchange{
343
344 template <typename T,int n>
345 static accelerator_inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
346 const int w = W<T>::r;
347 unsigned int mask = w >> (n + 1);
348 // std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl;
349 VECTOR_FOR(i, w, 1) {
350 int j1 = i&(~mask);
351 if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
352 else { out1.v[i]=in2.v[j1];}
353 int j2 = i|mask;
354 if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
355 else { out2.v[i]=in2.v[j2];}
356 }
357 }
358 template <typename T>
359 static accelerator_inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
360 ExchangeN<T,0>(out1,out2,in1,in2);
361 };
362 template <typename T>
363 static accelerator_inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
364 ExchangeN<T,1>(out1,out2,in1,in2);
365 };
366 template <typename T>
367 static accelerator_inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
368 ExchangeN<T,2>(out1,out2,in1,in2);
369 };
370 template <typename T>
371 static accelerator_inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
372 ExchangeN<T,3>(out1,out2,in1,in2);
373 };
374};
375
376
378// Some Template specialization
379#define perm(a, b, n, w) \
380 unsigned int _mask = w >> (n + 1); \
381 VECTOR_FOR(i, w, 1) \
382 { \
383 b[i] = a[i^_mask]; \
384 }
385
386#define DECL_PERMUTE_N(n) \
387 template <typename T> \
388 static accelerator_inline vec<T> Permute##n(vec<T> in) { \
389 vec<T> out; \
390 perm(in.v, out.v, n, W<T>::r); \
391 return out; \
392 }
393
394struct Permute{
399};
400
401#undef perm
402#undef DECL_PERMUTE_N
403
404#define rot(a, b, n, w) \
405 VECTOR_FOR(i, w, 1) \
406 { \
407 b[i] = a[(i + n)%w]; \
408 }
409
410struct Rotate{
411
412 template <int n, typename T> static accelerator_inline vec<T> tRotate(vec<T> in){
413 return rotate(in, n);
414 }
415
416 template <typename T>
418 vec<T> out;
419
420 rot(in.v, out.v, n, W<T>::r);
421
422 return out;
423 }
424};
425
426#undef rot
427
428#define acc(v, a, off, step, n) \
429 for (unsigned int i = off; i < n; i += step) \
430 { \
431 a += v[i]; \
432 }
433
434template <typename Out_type, typename In_type>
435struct Reduce{
436 //Need templated class to overload output type
437 //General form must generate error if compiled
438 accelerator_inline Out_type operator()(In_type in){
439 printf("Error, using wrong Reduce function\n");
440 exit(1);
441 return 0;
442 }
443};
444
445//Complex float Reduce
446template <>
448 float a = 0.f, b = 0.f;
449
450 acc(in.v, a, 0, 2, W<float>::r);
451 acc(in.v, b, 1, 2, W<float>::r);
452
453 return Grid::ComplexF(a, b);
454}
455
456//Real float Reduce
457template<>
459 float a = 0.;
460
461 acc(in.v, a, 0, 1, W<float>::r);
462
463 return a;
464}
465
466//Complex double Reduce
467template<>
469 double a = 0., b = 0.;
470
471 acc(in.v, a, 0, 2, W<double>::r);
472 acc(in.v, b, 1, 2, W<double>::r);
473
474 return Grid::ComplexD(a, b);
475}
476
477//Real double Reduce
478template<>
480 double a = 0.f;
481
482 acc(in.v, a, 0, 1, W<double>::r);
483
484 return a;
485}
486
487//Integer Reduce
488template<>
490 Integer a = 0;
491
492 acc(in.v, a, 0, 1, W<Integer>::r);
493
494 return a;
495}
496
497#undef acc // EIGEN compatibility
498NAMESPACE_END(Optimization)
499
500
501// Here assign types
502
503typedef Optimization::vech SIMD_Htype; // Reduced precision type
504typedef Optimization::vecf SIMD_Ftype; // Single precision type
505typedef Optimization::vecd SIMD_Dtype; // Double precision type
506typedef Optimization::veci SIMD_Itype; // Integer type
507
508// prefetch utilities
509accelerator_inline void v_prefetch0(int size, const char *ptr){};
510accelerator_inline void prefetch_HINT_T0(const char *ptr){};
511
512// Function name aliases
513typedef Optimization::Vsplat VsplatSIMD;
514typedef Optimization::Vstore VstoreSIMD;
515typedef Optimization::Vset VsetSIMD;
516typedef Optimization::Vstream VstreamSIMD;
517template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
518
519// Arithmetic operations
520typedef Optimization::Sum SumSIMD;
521typedef Optimization::Sub SubSIMD;
522typedef Optimization::Div DivSIMD;
523typedef Optimization::Mult MultSIMD;
524typedef Optimization::MultComplex MultComplexSIMD;
525typedef Optimization::MultRealPart MultRealPartSIMD;
526typedef Optimization::MaddRealPart MaddRealPartSIMD;
527typedef Optimization::Conj ConjSIMD;
528typedef Optimization::TimesMinusI TimesMinusISIMD;
529typedef Optimization::TimesI TimesISIMD;
530
532
#define accelerator_inline
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
Optimization::Reduce< S, T > ReduceSIMD
vec< double > vecd
vec< Integer > veci
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
vec< float > vecf
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::Div DivSIMD
vec< uint16_t > vech
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
accelerator_inline void v_prefetch0(int size, const char *ptr)
#define timesmi(a, b, i)
#define rot(a, b, n, w)
#define timesi(a, b, i)
#define acc(v, a, off, step, n)
#define conj(a, b, i)
accelerator_inline void prefetch_HINT_T0(const char *ptr)
#define cmul(a, b, c, i)
#define VECTOR_FOR(i, w, inc)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
accelerator_inline vec< T > operator()(vec< T > a)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
static accelerator_inline void Exchange1(vec< T > &out1, vec< T > &out2, vec< T > &in1, vec< T > &in2)
static accelerator_inline void Exchange3(vec< T > &out1, vec< T > &out2, vec< T > &in1, vec< T > &in2)
static accelerator_inline void Exchange2(vec< T > &out1, vec< T > &out2, vec< T > &in1, vec< T > &in2)
static accelerator_inline void ExchangeN(vec< T > &out1, vec< T > &out2, vec< T > &in1, vec< T > &in2)
static accelerator_inline void Exchange0(vec< T > &out1, vec< T > &out2, vec< T > &in1, vec< T > &in2)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b, vec< T > c)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
DECL_PERMUTE_N(2)
DECL_PERMUTE_N(1)
DECL_PERMUTE_N(0)
DECL_PERMUTE_N(3)
static vech StoH(const vecf &sa, const vecf &sb)
static accelerator_inline vech StoH(const vecf &a, const vecf &b)
static void StoD(vecf s, vecd &a, vecd &b)
static accelerator_inline void HtoD(vech h, vecd &a, vecd &b, vecd &c, vecd &d)
static accelerator_inline vecf DtoS(vecd a, vecd b)
static accelerator_inline void HtoS(vech h, vecf &sa, vecf &sb)
static vecf DtoS(vecd a, vecd b)
static accelerator_inline void StoD(vecf s, vecd &a, vecd &b)
static void HtoS(vech h, vecf &sa, vecf &sb)
static accelerator_inline vech DtoH(vecd a, vecd b, vecd c, vecd d)
accelerator_inline Out_type operator()(In_type in)
Out_type operator()(In_type in)
static vec< T > rotate(vec< T > in, int n)
static accelerator_inline vec< T > tRotate(vec< T > in)
static accelerator_inline vec< T > rotate(vec< T > in, int n)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
accelerator_inline vec< T > operator()(vec< T > a, vec< T > b)
accelerator_inline vec< T > operator()(vec< T > a)
accelerator_inline vec< T > operator()(vec< T > a)
accelerator_inline vec< T > operator()(T *a)
accelerator_inline vec< T > operator()(std::complex< T > *a)
accelerator_inline vec< T > operator()(T a, T b)
accelerator_inline vec< T > operator()(T a)
accelerator_inline void operator()(vec< T > a, T *D)
accelerator_inline void operator()(T *a, vec< T > b)
T v[W< T >::r]