Grid 0.7.0
Simd.h
Go to the documentation of this file.
1/*************************************************************************************
2
3Grid physics library, www.github.com/paboyle/Grid
4
5Source file: ./lib/Simd.h
6
7Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: neo <cossu@post.kek.jp>
11Author: paboyle <paboyle@ph.ed.ac.uk>
12
13This program is free software; you can redistribute it and/or modify
14it under the terms of the GNU General Public License as published by
15the Free Software Foundation; either version 2 of the License, or
16(at your option) any later version.
17
18This program is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
22
23You should have received a copy of the GNU General Public License along
24with this program; if not, write to the Free Software Foundation, Inc.,
2551 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27See the full license in the file "LICENSE" in the top level distribution
28directory
29*************************************************************************************/
30/* END LEGAL */
31#ifndef GRID_SIMD_H
32#define GRID_SIMD_H
33
34#if defined(GRID_CUDA) || defined(GRID_HIP)
35#include <thrust/complex.h>
36#endif
37
39// Define scalar and vector floating point types
40//
41// Scalar: RealF, RealD, ComplexF, ComplexD
42//
43// Vector: vRealF, vRealD, vComplexF, vComplexD
44//
45// Vector types are arch dependent
47
48#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
49#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
50#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
51#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
52#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
53
54#define RotateBit (0x100)
55
57
58typedef uint32_t Integer;
59
60typedef float RealF;
61typedef double RealD;
62#ifdef GRID_DEFAULT_PRECISION_DOUBLE
63typedef RealD Real;
64#else
65typedef RealF Real;
66#endif
67
68#if defined(GRID_CUDA) || defined(GRID_HIP)
69typedef thrust::complex<RealF> ComplexF;
70typedef thrust::complex<RealD> ComplexD;
71typedef thrust::complex<Real> Complex;
72typedef thrust::complex<uint16_t> ComplexH;
73template<class T> using complex = thrust::complex<T>;
74
75accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(thrust::pow(r,(double)y)); }
76accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(thrust::pow(r,(float)y)); }
77#else
78typedef std::complex<RealF> ComplexF;
79typedef std::complex<RealD> ComplexD;
80typedef std::complex<Real> Complex;
81typedef std::complex<uint16_t> ComplexH; // Hack
82template<class T> using complex = std::complex<T>;
83
84accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(std::pow(r,y)); }
85accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(std::pow(r,y)); }
86#endif
87
88//accelerator_inline RealD pow(const RealD& r,RealD y){ return(std::pow(r,y)); }
89//accelerator_inline RealD sqrt(const RealD & r){ return std::sqrt(r); }
90
91// This comes from ::pow already from math.h and CUDA
92// Calls either Grid::pow for complex, or std::pow for real
93// Problem is CUDA math_functions is exposing ::pow, and I can't define
94
95using std::abs;
96using std::pow;
97using std::sqrt;
98using std::log;
99using std::exp;
100using std::sin;
101using std::cos;
102using std::asin;
103using std::acos;
104
105
110
111accelerator_inline RealF adj(const RealF & r){ return r; }
112accelerator_inline RealD adj(const RealD & r){ return r; }
115
116accelerator_inline RealF real(const RealF & r){ return r; }
117accelerator_inline RealD real(const RealD & r){ return r; }
118accelerator_inline RealF real(const ComplexF & r){ return r.real(); }
119accelerator_inline RealD real(const ComplexD & r){ return r.real(); }
120
121accelerator_inline RealF imag(const ComplexF & r){ return r.imag(); }
122accelerator_inline RealD imag(const ComplexD & r){ return r.imag(); }
123
126accelerator_inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
127accelerator_inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
128
131accelerator_inline RealD Reduce(const RealD& r){ return r; }
132accelerator_inline RealF Reduce(const RealF& r){ return r; }
133
134accelerator_inline RealD toReal(const ComplexD& r){ return r.real(); }
135accelerator_inline RealF toReal(const ComplexF& r){ return r.real(); }
136accelerator_inline RealD toReal(const RealD& r){ return r; }
137accelerator_inline RealF toReal(const RealF& r){ return r; }
138
140//Provide support functions for basic real and complex data types required by Grid
141//Single and double precision versions. Should be able to template this once only.
143accelerator_inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
144accelerator_inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
145accelerator_inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
146accelerator_inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
147// conjugate already supported for complex
148
149accelerator_inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
150accelerator_inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
151accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
152accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
153
154//conjugate already supported for complex
155accelerator_inline ComplexF timesI(const ComplexF &r) { return(ComplexF(-r.imag(),r.real()));}
156accelerator_inline ComplexD timesI(const ComplexD &r) { return(ComplexD(-r.imag(),r.real()));}
157accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));}
158accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));}
159//accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
160//accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
161//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
162//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
163
164// define projections to real and imaginay parts
165accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
166accelerator_inline ComplexD projReal(const ComplexD &r){return( ComplexD(r.real(), 0.0));}
167accelerator_inline ComplexF projImag(const ComplexF &r){return (ComplexF(r.imag(), 0.0 ));}
168accelerator_inline ComplexD projImag(const ComplexD &r){return (ComplexD(r.imag(), 0.0));}
169
170// define auxiliary functions for complex computations
171accelerator_inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
172accelerator_inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
175
176accelerator_inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
177accelerator_inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
178accelerator_inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
179accelerator_inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
180
181accelerator_inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
182accelerator_inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
183accelerator_inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
184accelerator_inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
185
188accelerator_inline void vstream(RealF &l, const RealF &r){ l=r;}
189accelerator_inline void vstream(RealD &l, const RealD &r){ l=r;}
190
193
194class Zero{};
195//static Zero Zero();
196template<class itype> accelerator_inline void zeroit(itype &arg) { arg=Zero();};
197template<> accelerator_inline void zeroit(ComplexF &arg){ arg=0; };
198template<> accelerator_inline void zeroit(ComplexD &arg){ arg=0; };
199template<> accelerator_inline void zeroit(RealF &arg) { arg=0; };
200template<> accelerator_inline void zeroit(RealD &arg) { arg=0; };
201
202// More limited Integer support
204accelerator_inline void mac (Integer * __restrict__ y,const Integer * __restrict__ a,const Integer *__restrict__ x){ *y = (*a) * (*x)+(*y); }
205accelerator_inline void mult(Integer * __restrict__ y,const Integer * __restrict__ l,const Integer *__restrict__ r){ *y = (*l) * (*r); }
206accelerator_inline void sub (Integer * __restrict__ y,const Integer * __restrict__ l,const Integer *__restrict__ r){ *y = (*l) - (*r); }
207accelerator_inline void add (Integer * __restrict__ y,const Integer * __restrict__ l,const Integer *__restrict__ r){ *y = (*l) + (*r); }
208accelerator_inline void vstream(Integer &l, const RealD &r){ l=r;}
209template<> accelerator_inline void zeroit(Integer &arg) { arg=0; };
210
213//accelerator_inline Integer abs (Integer &a) { return a%y;}
214
216// Permute
217// Permute 0 every ABCDEFGH -> BA DC FE HG
218// Permute 1 every ABCDEFGH -> CD AB GH EF
219// Permute 2 every ABCDEFGH -> EFGH ABCD
220// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
221// Permute 4 possible on half precision @512bit vectors.
222//
223// Defined inside SIMD specialization files
225template<class VectorSIMD>
226accelerator_inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
227
229
233
235
236// Default precision is wired to double
237typedef vRealD vReal;
239
240inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
241 int nn=vComplexF::Nsimd();
242 std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
243 vstore(o,&buf[0]);
244 stream<<"<";
245 for(int i=0;i<nn;i++){
246 stream<<buf[i];
247 if(i<nn-1) stream<<",";
248 }
249 stream<<">";
250 return stream;
251}
252
253inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
254 int nn=vComplexD::Nsimd();
255 std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
256 vstore(o,&buf[0]);
257 stream<<"<";
258 for(int i=0;i<nn;i++){
259 stream<<buf[i];
260 if(i<nn-1) stream<<",";
261 }
262 stream<<">";
263 return stream;
264}
265inline std::ostream& operator<< (std::ostream& stream, const vComplexD2 &o){
266 stream<<"<";
267 stream<<o.v[0];
268 stream<<o.v[1];
269 stream<<">";
270 return stream;
271}
272
273inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
274 int nn=vRealF::Nsimd();
275 std::vector<RealF,alignedAllocator<RealF> > buf(nn);
276 vstore(o,&buf[0]);
277 stream<<"<";
278 for(int i=0;i<nn;i++){
279 stream<<buf[i];
280 if(i<nn-1) stream<<",";
281 }
282 stream<<">";
283 return stream;
284}
285
286inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
287 int nn=vRealD::Nsimd();
288 std::vector<RealD,alignedAllocator<RealD> > buf(nn);
289 vstore(o,&buf[0]);
290 stream<<"<";
291 for(int i=0;i<nn;i++){
292 stream<<buf[i];
293 if(i<nn-1) stream<<",";
294 }
295 stream<<">";
296 return stream;
297}
298inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
299 int nn=vInteger::Nsimd();
300 std::vector<Integer,alignedAllocator<Integer> > buf(nn);
301 vstore(o,&buf[0]);
302 stream<<"<";
303 for(int i=0;i<nn;i++){
304 stream<<buf[i];
305 if(i<nn-1) stream<<",";
306 }
307 stream<<">";
308 return stream;
309}
310
312#endif
#define accelerator_inline
Grid_simd2< complex< double >, vComplexD > vComplexD2
#define perm(a, b, n, w)
#define conj(a, b, i)
Defines templated class Grid_simd to deal with inner vector types.
Grid_simd< complex< float >, SIMD_Ftype > vComplexF
Grid_simd< float, SIMD_Ftype > vRealF
Grid_simd< complex< double >, SIMD_Dtype > vComplexD
Grid_simd< Integer, SIMD_Itype > vInteger
Grid_simd< double, SIMD_Dtype > vRealD
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
accelerator_inline RealF imag(const ComplexF &r)
Definition Simd.h:121
accelerator_inline RealD toReal(const ComplexD &r)
Definition Simd.h:134
accelerator_inline ComplexD pow(const ComplexD &r, RealD y)
Definition Simd.h:84
accelerator_inline void vstream(ComplexF &l, const ComplexF &r)
Definition Simd.h:186
accelerator_inline Integer mod(Integer a, Integer y)
Definition Simd.h:211
accelerator_inline ComplexF timesMinusI(const ComplexF &r)
Definition Simd.h:157
accelerator_inline ComplexF timesI(const ComplexF &r)
Definition Simd.h:155
accelerator_inline RealF conjugate(const RealF &r)
Definition Simd.h:106
accelerator_inline void mult(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
Definition Simd.h:144
std::complex< uint16_t > ComplexH
Definition Simd.h:81
uint32_t Integer
Definition Simd.h:58
std::complex< T > complex
Definition Simd.h:82
accelerator_inline void mac(ComplexD *__restrict__ y, const ComplexD *__restrict__ a, const ComplexD *__restrict__ x)
Definition Simd.h:143
accelerator_inline void sub(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
Definition Simd.h:145
std::ostream & operator<<(std::ostream &stream, const vComplexF &o)
Definition Simd.h:240
RealF Real
Definition Simd.h:65
vComplexD vComplex
Definition Simd.h:238
accelerator_inline void add(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
Definition Simd.h:146
accelerator_inline ComplexD innerProduct(const ComplexD &l, const ComplexD &r)
Definition Simd.h:124
accelerator_inline RealF adj(const RealF &r)
Definition Simd.h:111
std::complex< RealF > ComplexF
Definition Simd.h:78
float RealF
Definition Simd.h:60
accelerator_inline Integer div(Integer a, Integer y)
Definition Simd.h:212
vRealD vReal
Definition Simd.h:237
accelerator_inline ComplexF projImag(const ComplexF &r)
Definition Simd.h:167
std::complex< RealD > ComplexD
Definition Simd.h:79
accelerator_inline void zeroit(itype &arg)
Definition Simd.h:196
accelerator_inline void Gpermute(VectorSIMD &y, const VectorSIMD &b, int perm)
accelerator_inline RealF real(const RealF &r)
Definition Simd.h:116
accelerator_inline ComplexF projReal(const ComplexF &r)
Definition Simd.h:165
accelerator_inline ComplexD toComplex(const RealD &in)
Definition Simd.h:191
std::complex< Real > Complex
Definition Simd.h:80
double RealD
Definition Simd.h:61
accelerator_inline ComplexD Reduce(const ComplexD &r)
Definition Simd.h:129
Vector_type v[nvec]
static accelerator_inline constexpr int Nsimd(void)
Definition Simd.h:194