Grid 0.7.0
Grid_gpu_rrii.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_gpu.h
6
7 Copyright (C) 2021
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26 *************************************************************************************/
27 /* END LEGAL */
28//----------------------------------------------------------------------
30//----------------------------------------------------------------------
31
33// fp16
35#ifdef GRID_CUDA
36#include <cuda_fp16.h>
37#endif
38#ifdef GRID_HIP
39#include <hip/hip_fp16.h>
40#endif
41#if !defined(GRID_HIP) && !defined(GRID_CUDA)
// Software fallback: no GPU toolkit fp16 header is available, so provide a
// minimal 16-bit payload type named `half` (raw bit pattern only, no
// arithmetic — conversions go through the sfw_* software routines).
42namespace Grid {
43 typedef struct { uint16_t x;} half;
44}
45#endif
46namespace Grid {
48 {
49 float f;
50#if defined(GRID_CUDA) || defined(GRID_HIP)
51 f = __half2float(h);
52#else
53 Grid_half hh;
54 hh.x = h.x;
55 f= sfw_half_to_float(hh);
56#endif
57 return f;
58 }
60 {
61 half h;
62#if defined(GRID_CUDA) || defined(GRID_HIP)
63 h = __float2half(f);
64#else
66 h.x = hh.x;
67#endif
68 return h;
69 }
70}
71
72
73#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
74
75namespace Grid {
76
78// Real vector
//////////////////////////////////////////////////////////////////
// Real SIMD vector for coalesced GPU access: _N lanes of scalar
// type _datum held in one contiguous array.
//////////////////////////////////////////////////////////////////
template<int _N, class _datum>
struct GpuVector {
  static const int N = _N;   // number of lanes in the vector
  using datum = _datum;      // scalar type of a single lane
  _datum rrrr[_N];           // lane storage (real parts only)
};
86template<int N,class datum>
89 for(int i=0;i<N;i++) {
90 ret.rrrr[i] = l.rrrr[i]*r.rrrr[i];
91 }
92 return ret;
93}
94template<int N,class datum>
97 for(int i=0;i<N;i++) {
98 ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
99 }
100 return ret;
101}
102template<int N,class datum>
105 for(int i=0;i<N;i++) {
106 ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
107 }
108 return ret;
109}
110template<int N,class datum>
113 for(int i=0;i<N;i++) {
114 ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
115 }
116 return ret;
117}
118
120// Complex vector
122template<int _N, class _datum>
124 _datum rrrr[_N];
125 _datum iiii[_N];
126 static const int N = _N;
127 typedef _datum datum;
128};
129template<int N,class datum>
132 for(int i=0;i<N;i++) {
133 ret.rrrr[i] = l.rrrr[i]*r.rrrr[i] - l.iiii[i]*r.iiii[i];
134 ret.iiii[i] = l.rrrr[i]*r.iiii[i] + l.iiii[i]*r.rrrr[i];
135 }
136 return ret;
137}
138template<int N,class datum>
141 for(int i=0;i<N;i++) {
142 ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
143 ret.iiii[i] = l.iiii[i]-r.iiii[i];
144 }
145 return ret;
146}
147template<int N,class datum>
150 for(int i=0;i<N;i++) {
151 ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
152 ret.iiii[i] = l.iiii[i]+r.iiii[i];
153 }
154 return ret;
155}
156template<int N,class datum>
159 for(int i=0;i<N;i++) {
160 ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
161 ret.iiii[i] = l.iiii[i]/r.iiii[i];
162 }
163 return ret;
164}
165
167// SIMD counts
169
// Scalar lanes per GPU vector: one coalesced transaction
// (COALESCE_GRANULARITY bytes) divided by the scalar size.
// In this rrii layout the complex vectors keep separate rrrr/iiii
// arrays, so each complex lane count equals the real one for the
// same precision (every lane carries one r and one i element).
170constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
171constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half);
172constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
173constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float);
174constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
175constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double);
177
185
186namespace Optimization {
187
188 struct Vsplat{
189 //Complex float
191 GpuVectorCF ret;
192 for(int i=0;i<GpuVectorCF::N;i++){
193 ret.rrrr[i] = typename GpuVectorCF::datum(a);
194 ret.iiii[i] = typename GpuVectorCF::datum(b);
195 }
196 return ret;
197 }
198 // Real float
200 GpuVectorRF ret;
201 for(int i=0;i<GpuVectorRF::N;i++){
202 ret.rrrr[i] = typename GpuVectorRF::datum(a);
203 }
204 return ret;
205 }
206 //Complex double
208 GpuVectorCD ret;
209 for(int i=0;i<GpuVectorCD::N;i++){
210 ret.rrrr[i] = typename GpuVectorCD::datum(a);
211 ret.iiii[i] = typename GpuVectorCD::datum(b);
212 }
213 return ret;
214 }
215 //Real double
217 GpuVectorRD ret;
218 for(int i=0;i<GpuVectorRD::N;i++){
219 ret.rrrr[i] = typename GpuVectorRD::datum(a);
220 }
221 return ret;
222 }
223 //Integer
225 GpuVectorI ret;
226 for(int i=0;i<GpuVectorI::N;i++){
227 ret.rrrr[i] = typename GpuVectorI::datum(a);
228 }
229 return ret;
230 }
231 };
232
233 struct Vstore{
234 template<int N,class datum,class P>
237 *vF = a;
238 }
239 template<int N,class datum,class P>
244 };
245
246 struct Vstream{
247 template<int N,class datum, class P>
252 template<int N,class datum, class P>
257 };
258
259 struct Vset{
260 // Complex float
262 typedef GpuVectorCF vec;
263 vec ret;
264 for(int i=0;i<vec::N;i++){
265 ret.rrrr[i] = vec::datum(a[i].real());
266 ret.iiii[i] = vec::datum(a[i].imag());
267 }
268 return ret;
269 }
270 // Complex double
272 typedef GpuVectorCD vec;
273 vec ret;
274 for(int i=0;i<vec::N;i++){
275 ret.rrrr[i] = vec::datum(a[i].real());
276 ret.iiii[i] = vec::datum(a[i].imag());
277 }
278 return ret;
279 }
280 // Real float
282 typedef GpuVectorRF vec;
283 vec ret;
284 for(int i=0;i<vec::N;i++){
285 ret.rrrr[i] = vec::datum(a[i]);
286 }
287 return ret;
288 }
289 // Real double
291 typedef GpuVectorRD vec;
292 vec ret;
293 for(int i=0;i<vec::N;i++){
294 ret.rrrr[i] = vec::datum(a[i]);
295 }
296 return ret;
297 }
298 // Integer
300 typedef GpuVectorI vec;
301 vec ret;
302 for(int i=0;i<vec::N;i++){
303 ret.rrrr[i] = vec::datum(a[i]);
304 }
305 return ret;
306 }
307 };
308
 309 template <typename Out_type, typename In_type>
 310 struct Reduce{
 311 //Need templated class to overload output type
 312 //General form must generate error if compiled
 // NOTE(review): despite the comment above, the failure is detected at run
 // time (printf + exit(1)), not compile time — an unspecialised
 // instantiation compiles fine and aborts only when invoked. The per-type
 // specialisations later in this file provide the real reductions.
 313 accelerator_inline Out_type operator()(In_type in){
 314 printf("Error, using wrong Reduce function\n");
 315 exit(1);
 316 return 0;
 317 }
 318 };
319
321 // Arithmetic operations
341
359
362 typedef GpuVectorCF vec;
363 vec ret;
364 for(int i=0;i<vec::N;i++){
365 ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
366 ret.iiii[i] = a.rrrr[i]*b.iiii[i];
367 }
368 return ret;
369 }
371 typedef GpuVectorCD vec;
372 vec ret;
373 for(int i=0;i<vec::N;i++){
374 ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
375 ret.iiii[i] = a.rrrr[i]*b.iiii[i];
376 }
377 return ret;
378 }
379 };
380
383 typedef GpuVectorCF vec;
384 vec ret;
385 for(int i=0;i<vec::N;i++){
386 ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
387 ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
388 }
389 return ret;
390 }
392 typedef GpuVectorCD vec;
393 vec ret;
394 for(int i=0;i<vec::N;i++){
395 ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
396 ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
397 }
398 return ret;
399 }
400 };
401
411
412 struct Mult{
414 a= a+b*c;
415 }
417 a= a+b*c;
418 }
419 // Real float
423 // Real double
430 };
431
432 struct Div{
433 // Real float
443
 444 // Danger -- element wise divide for complex, not complex div.
445 // See Grid_vector_types.h lines around 735, applied after "toReal"
452 };
453
454
455 struct Conj{
456 // Complex single
458 typedef GpuVectorCF vec;
459 vec ret;
460 for(int i=0;i<vec::N;i++){
461 ret.rrrr[i] = in.rrrr[i];
462 ret.iiii[i] =-in.iiii[i];
463 }
464 return ret;
465 }
467 typedef GpuVectorCD vec;
468 vec ret;
469 for(int i=0;i<vec::N;i++){
470 ret.rrrr[i] = in.rrrr[i];
471 ret.iiii[i] =-in.iiii[i];
472 }
473 return ret;
474 }
475 };
476
478 //Complex single
480 typedef GpuVectorCF vec;
481 vec ret;
482 for(int i=0;i<vec::N;i++){
483 ret.rrrr[i] = in.iiii[i];
484 ret.iiii[i] =-in.rrrr[i];
485 }
486 return ret;
487 }
489 typedef GpuVectorCD vec;
490 vec ret;
491 for(int i=0;i<vec::N;i++){
492 ret.rrrr[i] = in.iiii[i];
493 ret.iiii[i] =-in.rrrr[i];
494 }
495 return ret;
496 }
497 };
498
499 struct TimesI{
500 //Complex single
502 typedef GpuVectorCF vec;
503 vec ret;
504 for(int i=0;i<vec::N;i++){
505 ret.rrrr[i] =-in.iiii[i];
506 ret.iiii[i] = in.rrrr[i];
507 }
508 return ret;
509 }
511 typedef GpuVectorCD vec;
512 vec ret;
513 for(int i=0;i<vec::N;i++){
514 ret.rrrr[i] =-in.iiii[i];
515 ret.iiii[i] = in.rrrr[i];
516 }
517 return ret;
518 }
519 };
520
521 struct Permute{
522
523 template <int n,int _N, class _datum >
526 vec out;
527 unsigned int _mask = vec::N >> (n + 1);
528 for(int i=0;i<vec::N;i++) {
529 out.rrrr[i] = in.rrrr[i^_mask];
530 }
531 return out;
532 }
533 template <int n,int _N, class _datum >
536 vec out;
537 unsigned int _mask = vec::N >> (n + 1);
538 for(int i=0;i<vec::N;i++) {
539 out.rrrr[i] = in.rrrr[i^_mask];
540 out.iiii[i] = in.iiii[i^_mask];
541 }
542 return out;
543 }
544
 // Permute0..Permute3 forward to PermuteN<n>; the crossing distance is
 // mask = N >> (n+1), so Permute0 exchanges the two half-vectors and each
 // higher level halves the distance (Permute3 exchanges lanes N/16 apart).
 545 template <typename vec> static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec::N,typename vec::datum>(in); }
 546 template <typename vec> static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec::N,typename vec::datum>(in); }
 547 template <typename vec> static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec::N,typename vec::datum>(in); }
 548 template <typename vec> static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec::N,typename vec::datum>(in); }
549
550 };
551
553
555 // Single / Half
558 int N = GpuVectorCF::N;
559 GpuVectorCH h;
560 for(int i=0;i<N;i++) {
561 h.rrrr[i ] = float2half(a.rrrr[i]);
562 h.iiii[i ] = float2half(a.iiii[i]);
563 h.rrrr[i+N] = float2half(b.rrrr[i]);
564 h.iiii[i+N] = float2half(b.iiii[i]);
565 }
566 return h;
567 }
569 int N = GpuVectorCF::N;
570 for(int i=0;i<N;i++) {
571 sa.rrrr[i] = half2float(h.rrrr[i ]);
572 sa.iiii[i] = half2float(h.iiii[i ]);
573 sb.rrrr[i] = half2float(h.rrrr[i+N]);
574 sb.iiii[i] = half2float(h.iiii[i+N]);
575 }
576 }
578 int N = GpuVectorRF::N;
579 GpuVectorRH h;
580 for(int i=0;i<N;i++) {
581 h.rrrr[i ] = float2half(a.rrrr[i]);
582 h.rrrr[i+N] = float2half(b.rrrr[i]);
583 }
584 return h;
585 }
587 int N = GpuVectorRF::N;
588 for(int i=0;i<N;i++) {
589 sa.rrrr[i] = half2float(h.rrrr[i ]);
590 sb.rrrr[i] = half2float(h.rrrr[i+N]);
591 }
592 }
593
595 // Double Single
598 int N = GpuVectorCD::N;
599 GpuVectorCF h;
600 for(int i=0;i<N;i++) {
601 h.rrrr[i ] = a.rrrr[i];
602 h.iiii[i ] = a.iiii[i];
603 h.rrrr[i+N] = b.rrrr[i];
604 h.iiii[i+N] = b.iiii[i];
605 }
606 return h;
607 }
608
610 int N = GpuVectorCD::N;
611 for(int i=0;i<N;i++) {
612 sa.rrrr[i] = h.rrrr[i ];
613 sa.iiii[i] = h.iiii[i ];
614 sb.rrrr[i] = h.rrrr[i+N];
615 sb.iiii[i] = h.iiii[i+N];
616 }
617 }
618
620 int N = GpuVectorRD::N;
621 GpuVectorRF h;
622 for(int i=0;i<N;i++) {
623 h.rrrr[i ] = a.rrrr[i];
624 h.rrrr[i+N] = b.rrrr[i];
625 }
626 return h;
627 }
628
630 int N = GpuVectorRD::N;
631 for(int i=0;i<N;i++) {
632 sa.rrrr[i] = h.rrrr[i ];
633 sb.rrrr[i] = h.rrrr[i+N];
634 }
635 }
636
638 // Double Half
641 GpuVectorCF sa,sb;
642 sa = DtoS(a,b);
643 sb = DtoS(c,d);
644 return StoH(sa,sb);
645 }
647 GpuVectorCF sa,sb;
648 HtoS(h,sa,sb);
649 StoD(sa,a,b);
650 StoD(sb,c,d);
651 }
653 GpuVectorRF sa,sb;
654 sa = DtoS(a,b);
655 sb = DtoS(c,d);
656 return StoH(sa,sb);
657 }
659 GpuVectorRF sa,sb;
660 HtoS(h,sa,sb);
661 StoD(sa,a,b);
662 StoD(sb,c,d);
663 }
664 };
665
666struct Exchange{
667
668 template <int n,int _N, class _datum >
673 {
675 unsigned int mask = vec::N >> (n + 1);
676 for(int i=0;i<vec::N;i++) {
677 int j1 = i&(~mask);
678 if ( (i&mask) == 0 ) { out1.rrrr[i]=in1.rrrr[j1];}
679 else { out1.rrrr[i]=in2.rrrr[j1];}
680 int j2 = i|mask;
681 if ( (i&mask) == 0 ) { out2.rrrr[i]=in1.rrrr[j2];}
682 else { out2.rrrr[i]=in2.rrrr[j2];}
683 }
684 }
685 template <int n,int _N, class _datum >
690 {
692 unsigned int mask = vec::N >> (n + 1);
693 for(int i=0;i<vec::N;i++) {
694 int j1 = i&(~mask);
695 if ( (i&mask) == 0 ) {
696 out1.rrrr[i]=in1.rrrr[j1];
697 out1.iiii[i]=in1.iiii[j1];
698 }
699 else {
700 out1.rrrr[i]=in2.rrrr[j1];
701 out1.iiii[i]=in2.iiii[j1];
702 }
703 int j2 = i|mask;
704 if ( (i&mask) == 0 ) {
705 out2.rrrr[i]=in1.rrrr[j2];
706 out2.iiii[i]=in1.iiii[j2];
707 }
708 else {
709 out2.rrrr[i]=in2.rrrr[j2];
710 out2.iiii[i]=in2.iiii[j2];
711 }
712 }
713 }
714 template <typename vec>
715 static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){
716 ExchangeN<0>(out1,out2,in1,in2);
717 };
718 template <typename vec>
719 static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){
720 ExchangeN<1>(out1,out2,in1,in2);
721 };
722 template <typename vec>
723 static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){
724 ExchangeN<2>(out1,out2,in1,in2);
725 };
726 template <typename vec>
727 static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){
728 ExchangeN<3>(out1,out2,in1,in2);
729 };
730
731};
732
733struct Rotate{
734
 // Compile-time interface: forwards the template rotation amount n to the
 // runtime rotate() overloads declared below.
 735 template <int n, typename vec> static accelerator_inline vec tRotate(vec in){
 736 return rotate(in, n);
 737 }
738
739 template <int _N, class _datum >
741 {
743 vec out;
744 for(int i=0;i<vec::N;i++){
745 out.rrrr[i] = in.rrrr[(i + n)%vec::N];
746 out.iiii[i] = in.iiii[(i + n)%vec::N];
747 }
748 return out;
749 }
750
751 template <int _N, class _datum >
753 {
755 vec out;
756 for(int i=0;i<vec::N;i++){
757 out.rrrr[i] = in.rrrr[(i + n)%vec::N];
758 }
759 return out;
760 }
761
 762 typedef GpuVectorRH SIMD_Htype; // Half precision real type
 763 typedef GpuVectorRF SIMD_Ftype; // Single precision real type
 764 typedef GpuVectorRD SIMD_Dtype; // Double precision real type
 765 typedef GpuVectorI SIMD_Itype; // Integer type
 766
 767 typedef GpuVectorCH SIMD_CHtype; // Half precision complex type
 768 typedef GpuVectorCF SIMD_CFtype; // Single precision complex type
 769 typedef GpuVectorCD SIMD_CDtype; // Double precision complex type
770
775 static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n){ return rotate_template(in,n/2);} // Measure in complex not float
778
779};
780
782// Some Template specialization
783
784 //Complex float Reduce
785 template<>
786 accelerator_inline Grid::ComplexF
788 {
789 Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]);
790 for(int i=1;i<GpuVectorCF::N;i++) {
791 greduce = greduce+Grid::ComplexF(in.rrrr[i],in.iiii[i]);
792 }
793 return greduce;
794 }
795
796 template<>
797 accelerator_inline Grid::ComplexD
799 {
800 Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]);
801 for(int i=1;i<GpuVectorCD::N;i++) {
802 greduce = greduce+ Grid::ComplexD(in.rrrr[i],in.iiii[i]);
803 }
804 return greduce;
805 }
806
807 // Real
808 template<>
809 accelerator_inline Grid::RealF
811 {
812 RealF ret = in.rrrr[0];
813 for(int i=1;i<GpuVectorRF::N;i++) {
814 ret = ret+in.rrrr[i];
815 }
816 return ret;
817 }
818
819 template<>
820 accelerator_inline Grid::RealD
822 {
823 RealD ret = in.rrrr[0];
824 for(int i=1;i<GpuVectorRD::N;i++) {
825 ret = ret+in.rrrr[i];
826 }
827 return ret;
828 }
829
830 template<>
833 {
834 Integer ret = in.rrrr[0];
835 for(int i=1;i<GpuVectorI::N;i++) {
836 ret = ret+in.rrrr[i];
837 }
838 return ret;
839 }
840
841}// End optimization
842
844// Here assign types
 // Map the GPU rrii vector types onto the generic SIMD type names used by
 // the rest of Grid.
 846 typedef GpuVectorRH SIMD_Htype; // Half precision real type
 847 typedef GpuVectorRF SIMD_Ftype; // Single precision real type
 848 typedef GpuVectorRD SIMD_Dtype; // Double precision real type
 849 typedef GpuVectorI SIMD_Itype; // Integer type
 850
 851 typedef GpuVectorCH SIMD_CHtype; // Half precision complex type
 852 typedef GpuVectorCF SIMD_CFtype; // Single precision complex type
 853 typedef GpuVectorCD SIMD_CDtype; // Double precision complex type
854
855 // prefetch utilities
856 accelerator_inline void v_prefetch0(int size, const char *ptr){};
857 accelerator_inline void prefetch_HINT_T0(const char *ptr){};
858
 859 // Function name aliases
 // ReduceSIMD<S,T>: horizontal-reduction functor, dispatching to the
 // per-vector-type Reduce specialisations in Optimization above.
 864 template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
865
866 // Arithmetic operations
877
878}
#define accelerator_inline
#define accelerator
#define COALESCE_GRANULARITY
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_half sfw_float_to_half(float ff)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
uint32_t Integer
Definition Simd.h:58
float RealF
Definition Simd.h:60
double RealD
Definition Simd.h:61
static INTERNAL_PRECISION F
Definition Zolotarev.cc:230
accelerator_inline half float2half(float f)
constexpr int NSIMD_ComplexF
Optimization::Reduce< S, T > ReduceSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::Div DivSIMD
accelerator GpuVector< N, datum > operator/(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuComplexVector< NSIMD_ComplexF, float > GpuVectorCF
Optimization::MultComplex MultComplexSIMD
Optimization::Conj ConjSIMD
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
accelerator_inline void prefetch_HINT_T0(const char *ptr)
constexpr int NSIMD_Integer
constexpr int NSIMD_ComplexH
GpuVector< NSIMD_RealF, float > GpuVectorRF
Optimization::MultRealPart MultRealPartSIMD
Optimization::TimesI TimesISIMD
constexpr int NSIMD_ComplexD
GpuVectorCD SIMD_CDtype
GpuVectorRD SIMD_Dtype
Optimization::Mult MultSIMD
accelerator_inline void v_prefetch0(int size, const char *ptr)
GpuComplexVector< NSIMD_ComplexH, half > GpuVectorCH
GpuVectorRF SIMD_Ftype
GpuVectorCH SIMD_CHtype
accelerator_inline float half2float(half h)
constexpr int NSIMD_RealH
GpuVectorI SIMD_Itype
Optimization::Vset VsetSIMD
GpuVector< NSIMD_RealD, double > GpuVectorRD
Optimization::Vstore VstoreSIMD
GpuVector< NSIMD_RealH, half > GpuVectorRH
constexpr int NSIMD_RealF
Optimization::Sub SubSIMD
GpuVectorRH SIMD_Htype
Optimization::TimesMinusI TimesMinusISIMD
GpuVector< NSIMD_Integer, Integer > GpuVectorI
accelerator GpuVector< N, datum > operator*(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
GpuVectorCF SIMD_CFtype
accelerator GpuVector< N, datum > operator+(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
Optimization::Vstream VstreamSIMD
accelerator GpuVector< N, datum > operator-(const GpuVector< N, datum > l, const GpuVector< N, datum > r)
constexpr int NSIMD_RealD
GpuComplexVector< NSIMD_ComplexD, double > GpuVectorCD
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
static accelerator_inline void ExchangeN(GpuVector< _N, _datum > &out1, GpuVector< _N, _datum > &out2, GpuVector< _N, _datum > &in1, GpuVector< _N, _datum > &in2)
static accelerator_inline void Exchange1(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange3(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange0(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void Exchange2(vec &out1, vec &out2, vec &in1, vec &in2)
static accelerator_inline void ExchangeN(GpuComplexVector< _N, _datum > &out1, GpuComplexVector< _N, _datum > &out2, GpuComplexVector< _N, _datum > &in1, GpuComplexVector< _N, _datum > &in2)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b, GpuVectorCF c)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c)
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuComplexVector< _N, _datum > PermuteN(GpuComplexVector< _N, _datum > &in)
static accelerator_inline vec Permute0(vec in)
static accelerator_inline vec Permute1(vec in)
static accelerator_inline vec Permute2(vec in)
static accelerator_inline GpuVector< _N, _datum > PermuteN(GpuVector< _N, _datum > &in)
static accelerator_inline vec Permute3(vec in)
static accelerator_inline GpuVectorRH DtoH(GpuVectorRD a, GpuVectorRD b, GpuVectorRD c, GpuVectorRD d)
static accelerator_inline GpuVectorCH DtoH(GpuVectorCD a, GpuVectorCD b, GpuVectorCD c, GpuVectorCD d)
static accelerator_inline void HtoS(GpuVectorCH h, GpuVectorCF &sa, GpuVectorCF &sb)
static accelerator_inline GpuVectorRF DtoS(GpuVectorRD a, GpuVectorRD b)
static accelerator_inline void HtoD(GpuVectorRH h, GpuVectorRD &a, GpuVectorRD &b, GpuVectorRD &c, GpuVectorRD &d)
static accelerator_inline void HtoS(GpuVectorRH h, GpuVectorRF &sa, GpuVectorRF &sb)
static accelerator_inline GpuVectorCF DtoS(GpuVectorCD a, GpuVectorCD b)
static accelerator_inline void HtoD(GpuVectorCH h, GpuVectorCD &a, GpuVectorCD &b, GpuVectorCD &c, GpuVectorCD &d)
static accelerator_inline void StoD(GpuVectorRF h, GpuVectorRD &sa, GpuVectorRD &sb)
static accelerator_inline GpuVectorRH StoH(GpuVectorRF a, GpuVectorRF b)
static accelerator_inline GpuVectorCH StoH(GpuVectorCF a, GpuVectorCF b)
static accelerator_inline void StoD(GpuVectorCF h, GpuVectorCD &sa, GpuVectorCD &sb)
accelerator_inline Out_type operator()(In_type in)
static accelerator_inline GpuVector< _N, _datum > rotate_template(GpuVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n)
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n)
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n)
static accelerator_inline GpuComplexVector< _N, _datum > rotate_template(GpuComplexVector< _N, _datum > &in, int n)
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n)
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n)
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n)
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n)
static accelerator_inline vec tRotate(vec in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b)
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b)
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b)
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCF operator()(GpuVectorCF in)
accelerator_inline GpuVectorCD operator()(GpuVectorCD in)
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a)
accelerator_inline GpuVectorRD operator()(double *a)
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a)
accelerator_inline GpuVectorRF operator()(float *a)
accelerator_inline GpuVectorI operator()(Integer *a)
accelerator_inline GpuVectorCF operator()(float a, float b)
accelerator_inline GpuVectorI operator()(Integer a)
accelerator_inline GpuVectorRD operator()(double a)
accelerator_inline GpuVectorCD operator()(double a, double b)
accelerator_inline GpuVectorRF operator()(float a)
accelerator_inline void operator()(GpuComplexVector< N, datum > a, P *Fp)
accelerator_inline void operator()(GpuVector< N, datum > a, P *Fp)
accelerator_inline void operator()(P *F, GpuVector< N, datum > a)
accelerator_inline void operator()(P *F, GpuComplexVector< N, datum > a)
uint16_t x