Grid 0.7.0
Grid_gpu_vec.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_gpu.h
6
7 Copyright (C) 2018
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26 *************************************************************************************/
27 /* END LEGAL */
28//----------------------------------------------------------------------
33//----------------------------------------------------------------------
34
35#ifdef GRID_CUDA
36#include <cuda_fp16.h>
37#endif
38#ifdef GRID_HIP
39#include <hip/hip_fp16.h>
40#endif
41#if !defined(GRID_CUDA) && !defined(GRID_HIP)
// Host-only stand-ins for the vendor GPU types. This branch is compiled only
// when neither GRID_CUDA nor GRID_HIP is defined, i.e. when the vendor fp16
// headers above are not included.
namespace Grid {
  // 16-bit storage for a half-precision value.
  struct half    { uint16_t x; };
  // Two-component aggregates; members are named x then y.
  struct half2   { half   x; half   y; };
  struct float2  { float  x; float  y; };
  struct double2 { double x; double y; };
}
48#endif
49
50
51namespace Grid {
52
53
54
55typedef struct Half2_t { half x; half y; } Half2;
56
57#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
58
59template<class pair>
61public:
62 pair z;
63 typedef decltype(z.x) Real;
64public:
66 accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
68 accelerator_inline Real real(void) const { return z.x; };
69 accelerator_inline Real imag(void) const { return z.y; };
70 accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; };
72 *this = (*this) * r;
73 return *this;
74 }
76 *this = (*this) + r;
77 return *this;
78 }
80 *this = (*this) - r;
81 return *this;
82 }
84 GpuComplex r ;
85 r.z.x = lhs.z.x + rhs.z.x;
86 r.z.y = lhs.z.y + rhs.z.y;
87 return r;
88 }
90 GpuComplex r ;
91 r.z.x = lhs.z.x - rhs.z.x;
92 r.z.y = lhs.z.y - rhs.z.y;
93 return r;
94 }
96 GpuComplex r ;
97 r.z.x= lhs.z.x*rhs.z.x - lhs.z.y*rhs.z.y; // rr-ii
98 r.z.y= lhs.z.x*rhs.z.y + lhs.z.y*rhs.z.x; // ri+ir
99 return r;
100 }
102 {
103 GpuComplex ret;
104 ret.z.x = l.z.x*r.z.x;
105 ret.z.y = l.z.x*r.z.y;
106 return ret;
107 }
108 friend std::ostream& operator<< (std::ostream& stream, const GpuComplex o){
109 stream << "("<< o.z.x << ","<< o.z.y <<")";
110 return stream;
111 }
112};
113
// Fixed-width vector of `Count` lanes, each of type `Lane`, stored as a plain
// array so the struct stays an aggregate with no user-provided members.
//
//   v      lane storage, valid indices 0 .. N-1
//   N      lane count, usable in constant expressions
//   datum  type of a single lane
//
// Notes on the two changes relative to the previous spelling:
//  * N is `static constexpr` rather than `static const`. In C++17 a constexpr
//    static data member is implicitly inline, so odr-using it (for example
//    binding it to a `const int&` parameter such as std::min's) no longer
//    needs an out-of-class definition that this header never supplied.
//  * The template parameters are no longer spelled `_N` / `_datum`: an
//    identifier starting with an underscore followed by an upper-case letter
//    is reserved for the implementation, and some C library headers define
//    `_N` as a macro.
template<int Count, class Lane>
struct GpuVector {
  Lane v[Count];
  static constexpr int N = Count;
  typedef Lane datum;
};
120
121
122template<int N,class datum>
125 for(int i=0;i<N;i++) {
126 ret.v[i] = l.v[i]*r.v[i];
127 }
128 return ret;
129}
130template<int N,class datum>
133 for(int i=0;i<N;i++) {
134 ret.v[i] = l.v[i]-r.v[i];
135 }
136 return ret;
137}
138template<int N,class datum>
141 for(int i=0;i<N;i++) {
142 ret.v[i] = l.v[i]+r.v[i];
143 }
144 return ret;
145}
146template<int N,class datum>
149 for(int i=0;i<N;i++) {
150 ret.v[i] = l.v[i]/r.v[i];
151 }
152 return ret;
153}
154
155constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
156constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
157constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
158constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
159constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
160constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
161constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
162
166
174
179
180accelerator_inline float half2float(half h)
181{
182 float f;
183#if defined(GRID_CUDA) || defined(GRID_HIP)
184 f = __half2float(h);
185#else
186 Grid_half hh;
187 hh.x = h.x;
188 f= sfw_half_to_float(hh);
189#endif
190 return f;
191}
193{
194 half h;
195#if defined(GRID_CUDA) || defined(GRID_HIP)
196 h = __float2half(f);
197#else
198 Grid_half hh = sfw_float_to_half(f);
199 h.x = hh.x;
200#endif
201 return h;
202}
203
204namespace Optimization {
205
206 struct Vsplat{
207 //Complex float
209 GpuVectorCF ret;
210 for(int i=0;i<GpuVectorCF::N;i++){
211 ret.v[i] = typename GpuVectorCF::datum(a,b);
212 }
213 return ret;
214 }
215 // Real float
217 GpuVectorRF ret;
218 for(int i=0;i<GpuVectorRF::N;i++){
219 ret.v[i] = typename GpuVectorRF::datum(a);
220 }
221 return ret;
222 }
223 //Complex double
225 GpuVectorCD ret;
226 for(int i=0;i<GpuVectorCD::N;i++){
227 ret.v[i] = typename GpuVectorCD::datum(a,b);
228 }
229 return ret;
230 }
231 //Real double
233 GpuVectorRD ret;
234 for(int i=0;i<GpuVectorRD::N;i++){
235 ret.v[i] = typename GpuVectorRD::datum(a);
236 }
237 return ret;
238 }
239 //Integer
241 GpuVectorI ret;
242 for(int i=0;i<GpuVectorI::N;i++){
243 ret.v[i] = typename GpuVectorI::datum(a);
244 }
245 return ret;
246 }
247 };
248
249 struct Vstore{
250 template<int N,class datum,class P>
253 *vF = a;
254 }
255 };
256
257 struct Vstream{
258 template<int N,class datum, class P>
263 };
264
265 struct Vset{
266 // Complex float
268 typedef GpuVectorCF vec;
269 vec ret;
270 for(int i=0;i<vec::N;i++){
271 ret.v[i] = vec::datum(a[i].real(),a[i].imag());
272 }
273 return ret;
274 }
275 // Complex double
277 typedef GpuVectorCD vec;
278 vec ret;
279 for(int i=0;i<vec::N;i++){
280 ret.v[i] = vec::datum(a[i].real(),a[i].imag());
281 }
282 return ret;
283 }
284 // Real float
286 typedef GpuVectorRF vec;
287 vec ret;
288 for(int i=0;i<vec::N;i++){
289 ret.v[i] = vec::datum(a[i]);
290 }
291 return ret;
292 }
293 // Real double
295 typedef GpuVectorRD vec;
296 vec ret;
297 for(int i=0;i<vec::N;i++){
298 ret.v[i] = vec::datum(a[i]);
299 }
300 return ret;
301 }
302 // Integer
304 typedef GpuVectorI vec;
305 vec ret;
306 for(int i=0;i<vec::N;i++){
307 ret.v[i] = vec::datum(a[i]);
308 }
309 return ret;
310 }
311 };
312
313 template <typename Out_type, typename In_type>
314 struct Reduce{
315 //Need templated class to overload output type
316 //General form must generate error if compiled
317 accelerator_inline Out_type operator()(In_type in){
318 printf("Error, using wrong Reduce function\n");
319 exit(1);
320 return 0;
321 }
322 };
323
325 // Arithmetic operations
327 struct Sum{
328 //Real float
344 };
345
346 struct Sub{
362 };
363
364 struct MultRealPart{
366 typedef GpuVectorCF vec;
367 vec ret;
368 for(int i=0;i<vec::N;i++){
369 ret.v[i] = real_mult(a.v[i],b.v[i]);
370 }
371 return ret;
372 }
374 typedef GpuVectorCD vec;
375 vec ret;
376 for(int i=0;i<vec::N;i++){
377 ret.v[i] = real_mult(a.v[i],b.v[i]);
378 }
379 return ret;
380 }
381 };
382
383 struct MaddRealPart{
385 typedef GpuVectorCF vec;
386 vec ret;
387 for(int i=0;i<vec::N;i++){
388 ret.v[i] = real_mult(a.v[i],b.v[i]) +c.v[i];
389 }
390 return ret;
391 }
393 typedef GpuVectorCD vec;
394 vec ret;
395 for(int i=0;i<vec::N;i++){
396 ret.v[i] = real_mult(a.v[i],b.v[i]) +c.v[i];
397 }
398 return ret;
399 }
400 };
401
402 struct MultComplex{
403
410 };
411
412 struct Mult{
414 a= a+b*c;
415 }
417 a= a+b*c;
418 }
419 // Real float
423 // Real double
430 };
431
432 struct Div{
433 // Real float
443
444 // Danger -- element-wise divide for complex, not complex division.
445 // See Grid_vector_types.h lines around 735, applied after "toReal"
447 GpuVectorCF ret;
448 for(int i=0;i< GpuVectorCF::N;i++){
449 ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
450 ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
451 }
452 return ret;
453 }
455 GpuVectorCD ret;
456 for(int i=0;i< GpuVectorCD::N;i++){
457 ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
458 ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
459 }
460 return ret;
461 }
462 };
463
464
465 struct Conj{
466 // Complex single
468 typedef GpuVectorCF vec;
469 vec ret;
470 for(int i=0;i<vec::N;i++){
471 ret.v[i].z.x = in.v[i].z.x;
472 ret.v[i].z.y =-in.v[i].z.y;
473 }
474 return ret;
475 }
477 typedef GpuVectorCD vec;
478 vec ret;
479 for(int i=0;i<vec::N;i++){
480 ret.v[i].z.x = in.v[i].z.x;
481 ret.v[i].z.y =-in.v[i].z.y;
482 }
483 return ret;
484 }
485 };
486
487 struct TimesMinusI{
488 //Complex single
490 typedef GpuVectorCF vec;
491 vec ret;
492 for(int i=0;i<vec::N;i++){
493 ret.v[i].z.x = in.v[i].z.y;
494 ret.v[i].z.y =-in.v[i].z.x;
495 }
496 return ret;
497 }
499 typedef GpuVectorCD vec;
500 vec ret;
501 for(int i=0;i<vec::N;i++){
502 ret.v[i].z.x = in.v[i].z.y;
503 ret.v[i].z.y =-in.v[i].z.x;
504 }
505 return ret;
506 }
507 };
508
509 struct TimesI{
510 //Complex single
512 typedef GpuVectorCF vec;
513 vec ret;
514 for(int i=0;i<vec::N;i++){
515 ret.v[i].z.x =-in.v[i].z.y;
516 ret.v[i].z.y = in.v[i].z.x;
517 }
518 return ret;
519 }
521 typedef GpuVectorCD vec;
522 vec ret;
523 for(int i=0;i<vec::N;i++){
524 ret.v[i].z.x =-in.v[i].z.y;
525 ret.v[i].z.y = in.v[i].z.x;
526 }
527 return ret;
528 }
529 };
530
531 struct Permute{
532
533 template <int n,typename vec>
535 vec out;
536 unsigned int _mask = vec::N >> (n + 1);
537 for(int i=0;i<vec::N;i++) {
538 out.v[i] = in.v[i^_mask];
539 }
540 return out;
541 }
542
543 template <typename vec> static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec>(in); }
544 template <typename vec> static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec>(in); }
545 template <typename vec> static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec>(in); }
546 template <typename vec> static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec>(in); }
547
548 };
549
550 struct PrecisionChange {
551
553 // Single / Half
556 int N = GpuVectorCF::N;
557 GpuVectorCH h;
558 for(int i=0;i<N;i++) {
559 h.v[i ].z.x = float2half(a.v[i].z.x);
560 h.v[i ].z.y = float2half(a.v[i].z.y);
561 h.v[i+N].z.x = float2half(b.v[i].z.x);
562 h.v[i+N].z.y = float2half(b.v[i].z.y);
563 }
564 return h;
565 }
567 int N = GpuVectorCF::N;
568 for(int i=0;i<N;i++) {
569 sa.v[i].z.x = half2float(h.v[i ].z.x);
570 sa.v[i].z.y = half2float(h.v[i ].z.y);
571 sb.v[i].z.x = half2float(h.v[i+N].z.x);
572 sb.v[i].z.y = half2float(h.v[i+N].z.y);
573 }
574 }
576 int N = GpuVectorRF::N;
577 GpuVectorRH h;
578 for(int i=0;i<N;i++) {
579 h.v[i ] = float2half(a.v[i]);
580 h.v[i+N] = float2half(b.v[i]);
581 }
582 return h;
583 }
585 int N = GpuVectorRF::N;
586 for(int i=0;i<N;i++) {
587 sa.v[i] = half2float(h.v[i ]);
588 sb.v[i] = half2float(h.v[i+N]);
589 }
590 }
591
593 // Double Single
596 int N = GpuVectorCD::N;
597 GpuVectorCF h;
598 for(int i=0;i<N;i++) {
599 h.v[i ].z.x = a.v[i].z.x;
600 h.v[i ].z.y = a.v[i].z.y;
601 h.v[i+N].z.x = b.v[i].z.x;
602 h.v[i+N].z.y = b.v[i].z.y;
603 }
604 return h;
605 }
606
608 int N = GpuVectorCD::N;
609 for(int i=0;i<N;i++) {
610 sa.v[i].z.x = h.v[i ].z.x;
611 sa.v[i].z.y = h.v[i ].z.y;
612 sb.v[i].z.x = h.v[i+N].z.x;
613 sb.v[i].z.y = h.v[i+N].z.y;
614 }
615 }
616
618 int N = GpuVectorRD::N;
619 GpuVectorRF h;
620 for(int i=0;i<N;i++) {
621 h.v[i ] = a.v[i];
622 h.v[i+N] = b.v[i];
623 }
624 return h;
625 }
626
628 int N = GpuVectorRD::N;
629 for(int i=0;i<N;i++) {
630 sa.v[i] = h.v[i ];
631 sb.v[i] = h.v[i+N];
632 }
633 }
634
636 // Double Half
639 GpuVectorCF sa,sb;
640 sa = DtoS(a,b);
641 sb = DtoS(c,d);
642 return StoH(sa,sb);
643 }
645 GpuVectorCF sa,sb;
646 HtoS(h,sa,sb);
647 StoD(sa,a,b);
648 StoD(sb,c,d);
649 }
651 GpuVectorRF sa,sb;
652 sa = DtoS(a,b);
653 sb = DtoS(c,d);
654 return StoH(sa,sb);
655 }
657 GpuVectorRF sa,sb;
658 HtoS(h,sa,sb);
659 StoD(sa,a,b);
660 StoD(sb,c,d);
661 }
662 };
663
664struct Exchange{
665
666 template <typename vec,int n>
667 static accelerator_inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){
668 unsigned int mask = vec::N >> (n + 1);
669 for(int i=0;i<vec::N;i++) {
670 int j1 = i&(~mask);
671 if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
672 else { out1.v[i]=in2.v[j1];}
673 int j2 = i|mask;
674 if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
675 else { out2.v[i]=in2.v[j2];}
676 }
677 }
678 template <typename vec>
679 static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){
680 ExchangeN<vec,0>(out1,out2,in1,in2);
681 };
682 template <typename vec>
683 static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){
684 ExchangeN<vec,1>(out1,out2,in1,in2);
685 };
686 template <typename vec>
687 static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){
688 ExchangeN<vec,2>(out1,out2,in1,in2);
689 };
690 template <typename vec>
691 static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){
692 ExchangeN<vec,3>(out1,out2,in1,in2);
693 };
694
695};
696
697struct Rotate{
698
699 template <int n, typename vec> static accelerator_inline vec tRotate(vec in){
700 return rotate(in, n);
701 }
702
703 template <typename vec>
705 vec out;
706 for(int i=0;i<vec::N;i++){
707 out.v[i] = in.v[(i + n)%vec::N];
708 }
709 return out;
710 }
711
712 typedef GpuVectorRH SIMD_Htype; // Half precision type
713 typedef GpuVectorRF SIMD_Ftype; // Single precision type
714 typedef GpuVectorRD SIMD_Dtype; // Double precision type
715 typedef GpuVectorI SIMD_Itype; // Integer type
716
717 typedef GpuVectorCH SIMD_CHtype; // Half precision type
718 typedef GpuVectorCF SIMD_CFtype; // Single precision type
719 typedef GpuVectorCD SIMD_CDtype; // Double precision type
720
725 static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n){ return rotate_template(in,n/2);} // Measure in complex not float
728
729};
730
732// Some Template specialization
733
734 //Complex float Reduce
735 template<>
736 accelerator_inline Grid::ComplexF
738 {
739 GpuComplexF greduce = in.v[0];
740 for(int i=1;i<GpuVectorCF::N;i++) {
741 greduce = greduce+in.v[i];
742 }
743 Grid::ComplexF ret(greduce.z.x,greduce.z.y);
744 return ret;
745 }
746
747 template<>
748 accelerator_inline Grid::ComplexD
750 {
751 GpuComplexD greduce = in.v[0];
752 for(int i=1;i<GpuVectorCD::N;i++) {
753 greduce = greduce+in.v[i];
754 }
755 Grid::ComplexD ret(greduce.z.x,greduce.z.y);
756 return ret;
757 }
758
759 // Real
760 template<>
761 accelerator_inline Grid::RealF
763 {
764 RealF ret = in.v[0];
765 for(int i=1;i<GpuVectorRF::N;i++) {
766 ret = ret+in.v[i];
767 }
768 return ret;
769 }
770
771 template<>
772 accelerator_inline Grid::RealD
774 {
775 RealD ret = in.v[0];
776 for(int i=1;i<GpuVectorRD::N;i++) {
777 ret = ret+in.v[i];
778 }
779 return ret;
780 }
781
782 template<>
785 {
786 Integer ret = in.v[0];
787 for(int i=1;i<GpuVectorI::N;i++) {
788 ret = ret+in.v[i];
789 }
790 return ret;
791 }
792
793}// End Optimization
794
796// Here assign types
798 typedef GpuVectorRH SIMD_Htype; // Single precision type
799 typedef GpuVectorRF SIMD_Ftype; // Single precision type
800 typedef GpuVectorRD SIMD_Dtype; // Double precision type
801 typedef GpuVectorI SIMD_Itype; // Integer type
802
803 typedef GpuVectorCH SIMD_CHtype; // Single precision type
804 typedef GpuVectorCF SIMD_CFtype; // Single precision type
805 typedef GpuVectorCD SIMD_CDtype; // Double precision type
806
807 // prefetch utilities
808 accelerator_inline void v_prefetch0(int size, const char *ptr){};
809 accelerator_inline void prefetch_HINT_T0(const char *ptr){};
810
811 // Function name aliases
816 template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
817
818 // Arithmetic operations
829
830}
#define accelerator_inline
#define accelerator
accelerator_inline Grid_simd2< S, V > real_mult(Grid_simd2< S, V > a, Grid_simd2< S, V > b)
#define COALESCE_GRANULARITY
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
uint32_t Integer
Definition Simd.h:58
float RealF
Definition Simd.h:60
double RealD
Definition Simd.h:61
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
accelerator_inline GpuComplex()=default
accelerator_inline GpuComplex & operator-=(const GpuComplex &r)
accelerator_inline GpuComplex & operator+=(const GpuComplex &r)
accelerator_inline Real real(void) const
accelerator_inline Real imag(void) const
friend accelerator_inline GpuComplex operator*(const GpuComplex &lhs, const GpuComplex &rhs)
accelerator_inline GpuComplex(const GpuComplex &zz)
friend std::ostream & operator<<(std::ostream &stream, const GpuComplex o)
accelerator_inline GpuComplex & operator=(const Zero &zz)
friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs, const GpuComplex &rhs)
friend accelerator_inline GpuComplex operator-(const GpuComplex &lhs, const GpuComplex &rhs)
decltype(z.x) Real
accelerator_inline GpuComplex & operator*=(const GpuComplex &r)
friend accelerator_inline GpuComplex real_mult(const GpuComplex &l, const GpuComplex &r)
accelerator_inline GpuComplex(Real re, Real im)
Definition Simd.h:194
accelerator_inline half float2half(float f)
constexpr int NSIMD_ComplexF
Optimization::Reduce< S, T > ReduceSIMD
Optimization::MaddRealPart MaddRealPartSIMD
struct Grid::Half2_t Half2
Optimization::Div DivSIMD
accelerator GpuVector< N, datum > operator/(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuComplexVector< NSIMD_ComplexF, float > GpuVectorCF
GpuComplex< double2 > GpuComplexD
GpuComplex< Half2 > GpuComplexH
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r)
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
constexpr int NSIMD_Integer
constexpr int NSIMD_ComplexH
GpuComplex< float2 > GpuComplexF
GpuVector< NSIMD_RealF, float > GpuVectorRF
Optimization::MultRealPart MultRealPartSIMD
Optimization::TimesI TimesISIMD
constexpr int NSIMD_ComplexD
GpuVectorCD SIMD_CDtype
GpuVectorRD SIMD_Dtype
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
accelerator_inline GpuComplexF timesI(const GpuComplexF &r)
GpuComplexVector< NSIMD_ComplexH, half > GpuVectorCH
GpuVectorRF SIMD_Ftype
GpuVectorCH SIMD_CHtype
accelerator_inline float half2float(half h)
constexpr int NSIMD_RealH
GpuVectorI SIMD_Itype
Optimization::Vset VsetSIMD
GpuVector< NSIMD_RealD, double > GpuVectorRD
Optimization::Vstore VstoreSIMD
GpuVector< NSIMD_RealH, half > GpuVectorRH
constexpr int NSIMD_RealF
Optimization::Sub SubSIMD
GpuVectorRH SIMD_Htype
Optimization::TimesMinusI TimesMinusISIMD
GpuVector< NSIMD_Integer, Integer > GpuVectorI
accelerator GpuVector< N, datum > operator*(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuVectorCF SIMD_CFtype
accelerator GpuVector< N, datum > operator+(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
Optimization::Vstream VstreamSIMD
accelerator GpuVector< N, datum > operator-(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
constexpr int NSIMD_RealD
GpuComplexVector< NSIMD_ComplexD, double > GpuVectorCD
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
static accelerator_inline void ExchangeN(GpuVector< _N, _datum > &out1, GpuVector< _N, _datum > &out2, GpuVector< _N, _datum > &in1, GpuVector< _N, _datum > &in2)
static accelerator_inline void Exchange1(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange3(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void ExchangeN(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange0(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange2(vec &out1, vec &out2, vec &in1, vec &in2)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b, GpuVectorCF c)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c)
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline vec Permute0(vec in)
static accelerator_inline vec Permute1(vec in)
static accelerator_inline vec PermuteN(vec in)
static accelerator_inline vec Permute2(vec in)
static accelerator_inline GpuVector< _N, _datum > PermuteN(GpuVector< _N, _datum > &in)
static accelerator_inline vec Permute3(vec in)
static accelerator_inline GpuVectorRH DtoH(GpuVectorRD a, GpuVectorRD b, GpuVectorRD c, GpuVectorRD d)
static accelerator_inline GpuVectorCH DtoH(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c, GpuVectorCD d)
static accelerator_inline void HtoS(GpuVectorCH h, GpuVectorCF &sa, GpuVectorCF &sb)
static accelerator_inline GpuVectorRF DtoS(GpuVectorRD a, GpuVectorRD b)
static accelerator_inline void HtoD(GpuVectorRH h, GpuVectorRD &a, GpuVectorRD &b, GpuVectorRD &c, GpuVectorRD &d)
static accelerator_inline void HtoS(GpuVectorRH h, GpuVectorRF &sa, GpuVectorRF &sb)
static accelerator_inline GpuVectorCF DtoS(GpuVectorCD a, GpuVectorCD b)
static accelerator_inline void HtoD(GpuVectorCH h, GpuVectorCD &a, GpuVectorCD &b, GpuVectorCD &c, GpuVectorCD &d)
static accelerator_inline void StoD(GpuVectorRF h, GpuVectorRD &sa, GpuVectorRD &sb)
static accelerator_inline GpuVectorRH StoH(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuVectorCH StoH(GpuVectorCF a, GpuVectorCF b)
static accelerator_inline void StoD(GpuVectorCF h, GpuVectorCD &sa, GpuVectorCD &sb)
accelerator_inline Out_type operator()(In_type in)
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n)
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n)
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n)
static accelerator_inline GpuComplexVector< _N, _datum > rotate_template(GpuComplexVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n)
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n)
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n)
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n)
static accelerator_inline vec rotate_template(vec in, int n)
static accelerator_inline vec tRotate(vec in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a)
accelerator_inline GpuVectorRD operator()(double *a)
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a)
accelerator_inline GpuVectorRF operator()(float *a)
accelerator_inline GpuVectorI operator()(Integer *a)
accelerator_inline GpuVectorCF operator()(float a, float b)
accelerator_inline GpuVectorI operator()(Integer a)
accelerator_inline GpuVectorRD operator()(double a)
accelerator_inline GpuVectorCD operator()(double a, double b)
accelerator_inline GpuVectorRF operator()(float a)
accelerator_inline void operator()(GpuVector< N, datum > a, P *Fp)
accelerator_inline void operator()(P *F, GpuVector< N, datum > a)
T v[W< T >::r]