Grid 0.7.0
Grid_vector_types.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Grid_vector_types.h
6
7 Copyright (C) 2015
8
9Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
10Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
11Author: Peter Boyle <paboyle@ph.ed.ac.uk>
12Author: neo <cossu@post.kek.jp>
13Author: paboyle <paboyle@ph.ed.ac.uk>
14Author: Michael Marshall <michael.marshall@ed.ac.au>
15
16 This program is free software; you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation; either version 2 of the License, or
19 (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License along
27 with this program; if not, write to the Free Software Foundation, Inc.,
28 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
29
30 See the full license in the file "LICENSE" in the top level distribution directory
31*************************************************************************************/
32/* END LEGAL */
33//---------------------------------------------------------------------------
37// Time-stamp: <2015-07-10 17:45:33 neo>
38//---------------------------------------------------------------------------
39#ifndef GRID_VECTOR_TYPES
40#define GRID_VECTOR_TYPES
41
42// PAB - Lifted and adapted from Eigen, which is GPL V2
43struct Grid_half {
45 accelerator Grid_half(uint16_t raw) : x(raw) {}
46 uint16_t x;
47};
// Bit-level punning helper between a float and its raw IEEE-754 encoding.
// NOTE(review): reading the member other than the one last written is UB in
// ISO C++ but is a documented extension on GCC/Clang, which this relies on.
union FP32 {
  unsigned int u;  // raw 32-bit pattern
  float f;         // same bits viewed as a float
};
  // NOTE(review): the enclosing signature line is not visible in this view;
  // per the member index this is the body of
  //   accelerator_inline float sfw_half_to_float(Grid_half h)
  // i.e. the Eigen-derived software fp16 -> fp32 conversion.
  const FP32 magic = { 113 << 23 };               // 2^-14 (smallest fp16 normal) as fp32
  const FP32 shifted_exp = { 0x7c00 << 13 };      // exponent mask after shift
  FP32 o;
  o.u = (h.x & 0x7fff) << 13;           // exponent/mantissa bits, sign stripped
  unsigned int exp = shifted_exp.u & o.u;  // just the exponent
  o.u += (127 - 15) << 23;              // rebias exponent: fp16 bias 15 -> fp32 bias 127
  // handle exponent special cases
  if (exp == shifted_exp.u) {           // Inf/NaN?
    o.u += (128 - 16) << 23;            // extra exp adjust: push to fp32 Inf/NaN exponent
  } else if (exp == 0) {                // Zero/Denormal?
    o.u += 1 << 23;                     // extra exp adjust
    o.f -= magic.f;                     // renormalize subnormal via FP subtraction
  }
  o.u |= (h.x & 0x8000) << 16;          // sign bit back into position 31
  return o.f;
}
  // NOTE(review): the enclosing signature line is not visible in this view;
  // presumably this is the body of the Eigen-derived software fp32 -> fp16
  // conversion taking "float ff" and returning a Grid_half (round to
  // nearest even) — confirm against the full header.
  FP32 f; f.f = ff;
  const FP32 f32infty = { 255 << 23 };                          // fp32 +inf bit pattern
  const FP32 f16max = { (127 + 16) << 23 };                     // smallest fp32 magnitude too big for fp16
  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  Grid_half o;

  o.x = static_cast<unsigned short>(0x0u);
  unsigned int sign = f.u & sign_mask;
  f.u ^= sign;   // strip the sign; it is re-attached at the end
  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).
  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
  } else {  // (De)normalized number or zero
    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;
      // and one integer subtract of the bias later, we have our final float!
      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
    } else {
      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd

      // update exponent, rounding bias part 1
      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
      // rounding bias part 2: ties round to even via the odd-mantissa bit
      f.u += mant_odd;
      // take the bits!
      o.x = static_cast<unsigned short>(f.u >> 13);
    }
  }
  o.x |= static_cast<unsigned short>(sign >> 16);  // restore sign bit
  return o;
}
108
109
110#ifdef GPU_VEC
111#include "Grid_gpu_vec.h"
112#endif
113
114#ifdef GPU_RRII
115#include "Grid_gpu_rrii.h"
116#endif
117
118#ifdef GEN
119 #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here
120 #include <arm_sve.h>
121 #if defined(A64FX) // VLA
122 #pragma message("building A64FX / SVE ACLE VLA")
123 #if defined(ARMCLANGCOMPAT)
124 #pragma message("applying data types patch")
125 #endif
126 #include "Grid_a64fx-2.h"
127 #endif
128 #if defined(A64FXFIXEDSIZE) // fixed size data types
129 #pragma message("building for A64FX / SVE ACLE fixed size")
130 #include "Grid_a64fx-fixedsize.h"
131 #endif
132 #else
133 #include "Grid_generic.h"
134 #endif
135#endif
136
137#ifdef A64FX
138 #include <arm_sve.h>
139 #ifdef __ARM_FEATURE_SVE_BITS
140 //#pragma message("building A64FX SVE VLS")
141 #include "Grid_a64fx-fixedsize.h"
142 #else
143 #pragma message("building A64FX SVE VLA")
144 #if defined(ARMCLANGCOMPAT)
145 #pragma message("applying data types patch")
146 #endif
147 #include "Grid_a64fx-2.h"
148 #endif
149#endif
150
151#ifdef SSE4
152#include "Grid_sse4.h"
153#endif
154#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
155#include "Grid_avx.h"
156#endif
157#if defined AVX512
158#include "Grid_avx512.h"
159#endif
160#if defined IMCI
161#include "Grid_imci.h"
162#endif
163#ifdef NEONV8
164#include "Grid_neon.h"
165#endif
166#if defined QPX
167#include "Grid_qpx.h"
168#endif
169
171
172
174// To take the floating point type of real/complex type
template <typename T>
struct RealPart {
  // Generic case: a real/integral scalar's "real part" type is itself.
  using type = T;
};
// Specialisation: for complex<T> the underlying real scalar type is T.
template <typename T>
struct RealPart<complex<T> > {
  typedef T type;
};
184
185#include <type_traits>
186
188// demote a vector to real type
190// type alias used to simplify the syntax of std::enable_if
// Invoke<T> unwraps T::type; EnableIf/NotEnableIf are SFINAE shorthands that
// resolve to ReturnType when the Condition trait holds (resp. fails).
template <typename T> using Invoke = typename T::type;
template <typename Condition, typename ReturnType = void> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
template <typename Condition, typename ReturnType = void> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
194
196// Check for complexity with type traits
// A type is "complex" only via explicit specialisation on the Grid scalar
// complex types (ComplexD/ComplexF, declared elsewhere in the library).
template <typename T> struct is_complex : public std::false_type {};
template <> struct is_complex<ComplexD> : public std::true_type {};
template <> struct is_complex<ComplexF> : public std::true_type {};

// Exact-type tests for the two complex precisions.
template <typename T> struct is_ComplexD : public std::false_type {};
template <> struct is_ComplexD<ComplexD> : public std::true_type {};

template <typename T> struct is_ComplexF : public std::false_type {};
template <> struct is_ComplexF<ComplexF> : public std::true_type {};

// "real" means any floating-point scalar; "integer" any integral scalar.
template<typename T, typename V=void> struct is_real : public std::false_type {};
template<typename T> struct is_real<T, typename std::enable_if<std::is_floating_point<T>::value,
                                                               void>::type> : public std::true_type {};

template<typename T, typename V=void> struct is_integer : public std::false_type {};
template<typename T> struct is_integer<T, typename std::enable_if<std::is_integral<T>::value,
                                                                  void>::type> : public std::true_type {};

// SFINAE selectors, used as anonymous non-type template params: "IfReal<T> = 0".
template <typename T> using IfReal = Invoke<std::enable_if<is_real<T>::value, int> >;
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
template <typename T> using IfInteger = Invoke<std::enable_if<is_integer<T>::value, int> >;
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;

template <typename T> using IfNotReal = Invoke<std::enable_if<!is_real<T>::value, int> >;
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
template <typename T> using IfNotInteger = Invoke<std::enable_if<!is_integer<T>::value, int> >;
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
224
226// Define the operation templates functors
227// general forms to allow for vsplat syntax
228// need explicit declaration of types when used since
229// clang cannot automatically determine the output type sometimes
// Generic functor-application helpers. The explicit Out template parameter
// pins the return type in places where clang cannot deduce it from the
// functor's operator() (see comment above).
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out accelerator_inline trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
  return op(src_1, src_2, src_3);
}
template <class Out, class Input1, class Input2, class Operation>
Out accelerator_inline binary(Input1 src_1, Input2 src_2, Operation op) {
  return op(src_1, src_2);
}
template <class Out, class Input, class Operation>
Out accelerator_inline unary(Input src, Operation op) {
  return op(src);
}
242
243
244/*
245 @brief Grid_simd class for the SIMD vector type operations
246*/
247template <class Scalar_type, class Vector_type>
249public:
251 typedef Vector_type vector_type;
252 typedef Scalar_type scalar_type;
253
254 /*
255 typedef union conv_t_union {
256 Vector_type v;
257 Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
258 accelerator_inline conv_t_union(){};
259 } conv_t;
260 */
261
262 Vector_type v;
263
  // Number of scalar lanes packed into one vector register; compile-time.
  static accelerator_inline constexpr int Nsimd(void) {
    static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " );
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }
268
269 #ifdef ARMCLANGCOMPAT
270 template <class S = Scalar_type>
271 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) {
272 //v = rhs.v;
273 svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
274 return *this;
275 };
276
277 template <class S = Scalar_type>
278 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) {
279 //v = rhs.v;
280 svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
281 return *this;
282 };
283
284 /*
285 template <class S = Scalar_type>
286 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) {
287 //v = rhs.v;
288 svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
289 return *this;
290 };
291
292 template <class S = Scalar_type>
293 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) {
294 //v = rhs.v;
295 svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
296 return *this;
297 };
298 */
299
300 // ComplexF
301 template <class S = Scalar_type>
302 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexF<S>::value, S>::type, Vector_type> &&rhs) {
303 //v = rhs.v;
304 svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v)));
305 return *this;
306 };
307
308 template <class S = Scalar_type>
309 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexF<S>::value, S>::type, Vector_type> &rhs) {
310 //v = rhs.v;
311 svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v)));
312 return *this;
313 };
314
315 // ComplexD
316 template <class S = Scalar_type>
317 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexD<S>::value, S>::type, Vector_type> &&rhs) {
318 //v = rhs.v;
319 svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v)));
320 return *this;
321 };
322
323 template <class S = Scalar_type>
324 accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexD<S>::value, S>::type, Vector_type> &rhs) {
325 //v = rhs.v;
326 svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v)));
327 return *this;
328 };
329
330 #else
331
333 v = rhs.v;
334 return *this;
335 };
337 v = rhs.v;
338 return *this;
339 }; // faster than not declaring it and leaving to the compiler
340
341 #endif
342
344
  #ifdef ARMCLANGCOMPAT
  // armclang workaround: copy/move-construct by delegating to the SVE-based
  // operator= overloads defined above, instead of memberwise copy of v.
  template <class S = Scalar_type>
  accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
  template <class S = Scalar_type>
  accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
  template <class S = Scalar_type>
  accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
  template <class S = Scalar_type>
  accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
  #else
  accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
  accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
  #endif
  // Broadcast-construct: fill every lane with the real scalar a.
  accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
  // Enable if complex type: broadcast-construct from a complex scalar.
  template <typename S = Scalar_type> accelerator_inline
  Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
    vsplat(*this, a);
  };
364
366 // Constructors
369 vzero(*this);
370 return (*this);
371 }
372
373
374
376 // mac, mult, sub, add, adj
378
379 // FIXME -- alias this to an accelerator_inline MAC struct.
380
  #if defined(A64FX) || defined(A64FXFIXEDSIZE)
  // A64FX/SVE: multiply-accumulate through the fxmac wrapper (fused form).
  friend accelerator_inline void mac(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ a,
                                     const Grid_simd *__restrict__ x) {
    *y = fxmac((*a), (*x), (*y));
  };
  #else
  // Generic multiply-accumulate: y = a*x + y.
  friend accelerator_inline void mac(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ a,
                                     const Grid_simd *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  #endif
394
  // vector (op) vector forms
  friend accelerator_inline void mult(Grid_simd *__restrict__ y,
                                      const Grid_simd *__restrict__ l,
                                      const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }

  friend accelerator_inline void sub(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ l,
                                     const Grid_simd *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ l,
                                     const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }
  // scalar (op) vector forms: the scalar operand is promoted by the
  // scalar*vector operator overloads below
  friend accelerator_inline void mac(Grid_simd *__restrict__ y,
                                     const Scalar_type *__restrict__ a,
                                     const Grid_simd *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend accelerator_inline void mult(Grid_simd *__restrict__ y,
                                      const Scalar_type *__restrict__ l,
                                      const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend accelerator_inline void sub(Grid_simd *__restrict__ y,
                                     const Scalar_type *__restrict__ l,
                                     const Grid_simd *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd *__restrict__ y,
                                     const Scalar_type *__restrict__ l,
                                     const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }

  // vector (op) scalar forms
  friend accelerator_inline void mac(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ a,
                                     const Scalar_type *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend accelerator_inline void mult(Grid_simd *__restrict__ y,
                                      const Grid_simd *__restrict__ l,
                                      const Scalar_type *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend accelerator_inline void sub(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ l,
                                     const Scalar_type *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend accelerator_inline void add(Grid_simd *__restrict__ y,
                                     const Grid_simd *__restrict__ l,
                                     const Scalar_type *__restrict__ r) {
    *y = (*l) + (*r);
  }
452
454 // FIXME: gonna remove these load/store, get, set, prefetch
  // Gather Nsimd() scalars from array a into the vector lanes.
  friend accelerator_inline void vset(Grid_simd &ret, Scalar_type *a) {
    ret.v = unary<Vector_type>(a, VsetSIMD());
  }

  // Vstore: scatter the vector lanes out to the scalar array a.
  // NOTE(review): the cast is to Real* rather than Scalar_type* — presumably
  // VstoreSIMD takes a real-typed pointer; confirm for complex scalars.
  friend accelerator_inline void vstore(const Grid_simd &ret, Scalar_type *a) {
    binary<void>(ret.v, (Real *) a, VstoreSIMD());
  }
466
468 // Vprefetch
471 prefetch_HINT_T0((const char *)&v.v);
472 }
473
475 // Reduce
477 friend accelerator_inline Scalar_type Reduce(const Grid_simd &in) {
479 }
480
482 // operator scalar * simd
  // scalar * vector: broadcast the scalar into every lane, then multiply.
  friend accelerator_inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
    Grid_simd va;
    vsplat(va, a);
    return va * b;
  }
  // vector * scalar: commute onto the overload above.
  friend accelerator_inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
    return a * b;
  }
492
494 // Divides
496 friend accelerator_inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
497 Grid_simd va;
498 vsplat(va, a);
499 return va / b;
500 }
501 friend accelerator_inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
502 Grid_simd va;
503 vsplat(va, a);
504 return b / a;
505 }
506
508 // Unary negation
511 Grid_simd ret;
512 vzero(ret);
513 ret = ret - r;
514 return ret;
515 }
516 // *=,+=,-= operators
518 *this = (*this) * r;
519 return *this;
520 // return (*this)*r; ?
521 }
523 *this = *this + r;
524 return *this;
525 }
527 *this = *this - r;
528 return *this;
529 }
530
532 // Not all functions are supported
533 // through SIMD and must breakout to
534 // scalar type and back again. This
535 // provides support
537
538 template <class functor>
539 friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
540 Grid_simd ret;
542
543 for (int i = 0; i < Nsimd(); i++) {
544 s = v.getlane(i);
545 s = func(s);
546 ret.putlane(s,i);
547 }
548 return ret;
549 }
550 template <class functor>
551 friend accelerator_inline Grid_simd SimdApplyBinop(const functor &func,
552 const Grid_simd &x,
553 const Grid_simd &y) {
554 Grid_simd ret;
556
557 for (int i = 0; i < Nsimd(); i++) {
558 sx = x.getlane(i);
559 sy = y.getlane(i);
560 sx = func(sx,sy);
561 ret.putlane(sx,i);
562 }
563 return ret;
564 }
565
566 // Exchange
567 // Al Ah , Bl Bh -> Al Bl Ah,Bh
569 friend accelerator_inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
570 {
571 if (n==3) {
572 Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
573 } else if(n==2) {
574 Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
575 } else if(n==1) {
576 Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
577 } else if(n==0) {
578 Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
579 }
580 }
582 Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
583 }
585 Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
586 }
588 Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
589 }
591 Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
592 }
593
594 // General permute; assumes vector length is same across
595 // all subtypes; may not be a good assumption, but could
596 // add the vector width as a template param for BG/Q for example
599 y.v = Optimization::Permute::Permute0(b.v);
600 }
602 y.v = Optimization::Permute::Permute1(b.v);
603 }
605 y.v = Optimization::Permute::Permute2(b.v);
606 }
608 y.v = Optimization::Permute::Permute3(b.v);
609 }
611 if (perm & RotateBit) {
612 int dist = perm & 0xF;
613 y = rotate(b, dist);
614 return;
615 }
616 else if(perm==3) permute3(y, b);
617 else if(perm==2) permute2(y, b);
618 else if(perm==1) permute1(y, b);
619 else if(perm==0) permute0(y, b);
620 }
621
623 // Getting single lanes
#ifdef GPU_RRII
  // RRII layout stores real parts (v.rrrr) and imaginary parts (v.iiii) in
  // separate sub-arrays, so a complex lane must be assembled/split by hand.
  template <class S = Scalar_type,IfComplex<S> = 0>
  accelerator_inline Scalar_type getlane(int lane) const {
    return Scalar_type(v.rrrr[lane],v.iiii[lane]);
  }
  template <class S = Scalar_type,IfComplex<S> = 0>
  accelerator_inline void putlane(const Scalar_type &_S, int lane){
    v.rrrr[lane] = real(_S);
    v.iiii[lane] = imag(_S);
  }
  // Non-complex scalars: the vector storage is punned to a flat lane array.
  template <class S = Scalar_type,IfNotComplex<S> = 0>
  accelerator_inline Scalar_type getlane(int lane) const {
    return ((S*)&v)[lane];
  }
  template <class S = Scalar_type,IfNotComplex<S> = 0>
  accelerator_inline void putlane(const S &_S, int lane){
    ((Scalar_type*)&v)[lane] = _S;
  }
#else // Can pun to an array of complex
  accelerator_inline Scalar_type getlane(int lane) const {
    return ((Scalar_type*)&v)[lane];
  }
  accelerator_inline void putlane(const Scalar_type &S, int lane){
    ((Scalar_type*)&v)[lane] = S;
  }
#endif
651
652}; // end of Grid_simd class definition
653
654
656// Define available types
658
663
664#if defined(GPU_VEC) || defined(GPU_RRII)
665typedef Grid_simd<complex<uint16_t>, SIMD_CHtype> vComplexH;
666typedef Grid_simd<complex<float> , SIMD_CFtype> vComplexF;
667typedef Grid_simd<complex<double> , SIMD_CDtype> vComplexD;
668#else
672#endif
673
675// Pointer type to use on extractLane
// Maps a scalar type to the element type used when extracting lanes;
// defaults to the scalar type itself.
template<class _scalar> class ExtractTypeMap { public: typedef _scalar extract_type;};
#ifdef GPU_VEC
// On GPU builds, complex lanes extract as the native two-component GPU
// vector types (half2/float2/double2).
template<> class ExtractTypeMap< complex<uint16_t> > { public: typedef half2 extract_type;};
template<> class ExtractTypeMap< complex< float> > { public: typedef float2 extract_type;};
template<> class ExtractTypeMap< complex< double> > { public: typedef double2 extract_type;};
#endif
683
685// Permute
687
// Scalar fallbacks: a lone scalar has a single "lane", so every permutation
// is the identity; perm is deliberately ignored.
accelerator_inline void permute(RealD &y,RealD b, int perm) { y=b; }
accelerator_inline void permute(RealF &y,RealF b, int perm) { y=b; }
692
694// General rotate
696template <class S, class V, IfNotComplex<S> = 0>
698 nrot = nrot % Grid_simd<S, V>::Nsimd();
699 Grid_simd<S, V> ret;
700 ret.v = Optimization::Rotate::rotate(b.v, nrot);
701 return ret;
702}
703template <class S, class V, IfComplex<S> = 0>
705 nrot = nrot % Grid_simd<S, V>::Nsimd();
706 Grid_simd<S, V> ret;
707 ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
708 return ret;
709}
710template <class S, class V, IfNotComplex<S> =0>
712{
713 nrot = nrot % Grid_simd<S,V>::Nsimd();
714 ret.v = Optimization::Rotate::rotate(b.v,nrot);
715}
716template <class S, class V, IfComplex<S> =0>
718{
719 nrot = nrot % Grid_simd<S,V>::Nsimd();
720 ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
721}
722
723template <class S, class V>
725 S* typepun =(S*) &src;
726 vsplat(ret,typepun[lane]);
727}
728template <class S, class V, IfComplex<S> =0>
730 S* typepun =(S*) &src;
731 ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
732}
733
734
735
737// Splat
739
740// this is only for the complex version
template <class S, class V, IfComplex<S> = 0, class ABtype>
accelerator_inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
  // Broadcast the pair (a, b) = (real, imag) into every complex lane.
  ret.v = binary<V>(a, b, VsplatSIMD());
}
745
746// overload if complex
747template <class S, class V>
751template <class S, class V>
755
756// if real fill with a, if complex fill with a in the real part (first function
757// above)
758template <class S, class V>
762
763
764
766// Initialise to 1,0,i for the correct types
768// For complex types
769template <class S, class V, IfComplex<S> = 0>
771 vsplat(ret, S(1.0, 0.0));
772}
773template <class S, class V, IfComplex<S> = 0>
775 vsplat(ret, S(0.0, 0.0));
776} // use xor?
777template <class S, class V, IfComplex<S> = 0>
779 vsplat(ret, S(0.0, 1.0));
780}
781
782template <class S, class V, IfComplex<S> = 0>
784 vsplat(ret, S(1.0, -1.0));
785}
786template <class S, class V, IfComplex<S> = 0>
788 vsplat(ret, S(-1.0, 1.0));
789}
790
791// if not complex overload here
792template <class S, class V, IfReal<S> = 0>
794 vsplat(ret, S(1.0));
795}
796template <class S, class V, IfReal<S> = 0>
798 vsplat(ret, S(0.0));
799}
800
801// For integral types
802template <class S, class V, IfInteger<S> = 0>
804 vsplat(ret, 1);
805}
806template <class S, class V, IfInteger<S> = 0>
808 vsplat(ret, 0);
809}
810template <class S, class V, IfInteger<S> = 0>
812 vsplat(ret, 0xFFFFFFFF);
813}
814template <class S, class V, IfInteger<S> = 0>
816 vsplat(ret, 0);
817}
818template <class S, class V>
822
824// Vstream
826template <class S, class V, IfReal<S> = 0>
828 binary<void>((S *)&out.v, in.v, VstreamSIMD());
829}
830template <class S, class V, IfComplex<S> = 0>
832 typedef typename S::value_type T;
833 binary<void>((T *)&out.v, in.v, VstreamSIMD());
834}
835template <class S, class V, IfInteger<S> = 0>
837 out = in;
838}
839
841// Arithmetic operator overloads +,-,*
843template <class S, class V>
849
850template <class S, class V>
856
857// Distinguish between complex types and others
858template <class S, class V, IfComplex<S> = 0>
864template <class S, class V, IfComplex<S> = 0>
870
871
872// Distinguish between complex types and others
873template <class S, class V, IfComplex<S> = 0>
879
880// Real/Integer types
881template <class S, class V, IfNotComplex<S> = 0>
887
888// ---------------- A64FX MAC -------------------
889// Distinguish between complex types and others
890#if defined(A64FX) || defined(A64FXFIXEDSIZE)
891template <class S, class V, IfComplex<S> = 0>
893 Grid_simd<S, V> ret;
894 ret.v = trinary<V>(a.v, b.v, c.v, MultAddComplexSIMD());
895 return ret;
896};
897
898// Real/Integer types
899template <class S, class V, IfNotComplex<S> = 0>
901 Grid_simd<S, V> ret;
902 ret.v = trinary<V>(a.v, b.v, c.v, MultSIMD());
903 return ret;
904};
905#endif
906// ----------------------------------------------
907
908
910// Conjugate
912template <class S, class V, IfComplex<S> = 0>
914 Grid_simd<S, V> ret;
915 ret.v = unary<V>(in.v, ConjSIMD());
916 return ret;
917}
918template <class S, class V, IfNotComplex<S> = 0>
920 return in; // for real objects
921}
922// Suppress adj for integer types... // odd; why conjugate above but not adj??
923template <class S, class V, IfNotInteger<S> = 0>
927
929// timesMinusI
931template <class S, class V, IfComplex<S> = 0>
935template <class S, class V, IfComplex<S> = 0>
941template <class S, class V, IfNotComplex<S> = 0>
945
946// timesI
948template <class S, class V, IfComplex<S> = 0>
950 ret.v = unary<V>(in.v, TimesISIMD());
951}
952template <class S, class V, IfComplex<S> = 0>
954 Grid_simd<S, V> ret;
955 ret.v= unary<V>(in.v, TimesISIMD());
956 return ret;
957}
958template <class S, class V, IfNotComplex<S> = 0>
962
963
964// Distinguish between complex types and others
965template <class S, class V, IfComplex<S> = 0>
967 typedef Grid_simd<S, V> simd;
968
969 simd ret;
970 simd den;
971
972 ret = a * conjugate(b) ;
973 den = b * conjugate(b) ;
974
975 // duplicates real part
976 auto real_den = toReal(den);
977 simd zden;
978 memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
979 ret.v=binary<V>(ret.v, zden.v, DivSIMD());
980 return ret;
981};
982
983// Real/Integer types
984template <class S, class V, IfNotComplex<S> = 0>
990
991
993// Inner, outer
995template <class S, class V>
999template <class S, class V>
1003
1004template <class S, class V>
1006 return arg;
1007}
1008
1010// copy/splat complex real parts into real;
1011// insert real into complex and zero imag;
1013
1014
// Compile-time map: complex SIMD vector type -> matching real SIMD type.
// Only the two specialisations below are valid inputs to toReal().
template <class T> struct toRealMapper {};
template<> struct toRealMapper<vComplexF> { typedef vRealF Realified; };
template<> struct toRealMapper<vComplexD> { typedef vRealD Realified; };
1018// real = toReal( complex )
1019template <class Csimd> // must be a real arg
1021 typedef typename toRealMapper<Csimd>::Realified Rsimd;
1022 Rsimd ret;
1023 int j=0;
1024 for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
1025 auto s = real(in.getlane(j++));
1026 ret.putlane(s,i);
1027 ret.putlane(s,i+1);
1028 }
1029 return ret;
1030}
1031
// Compile-time map: real SIMD vector type -> matching complex SIMD type.
// Inverse of toRealMapper; used by toComplex().
template <class T> struct toComplexMapper {};
template<> struct toComplexMapper<vRealF> { typedef vComplexF Complexified; };
template<> struct toComplexMapper<vRealD> { typedef vComplexD Complexified; };
1035
1036// complex = toComplex( real )
1037template <class Rsimd> // must be a real arg
1039
1040 typedef typename toComplexMapper<Rsimd>::Complexified Csimd;
1041 typedef typename Csimd::scalar_type scalar_type;
1042 int j=0;
1043 Csimd ret;
1044 for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
1045 auto rr = in.getlane(i);
1046 auto ri = in.getlane(i+1);
1047 assert(rr==ri);
1048 // trap any cases where real was not duplicated
1049 // indicating the SIMD grids of real and imag assignment did not correctly
1050 // match
1051 scalar_type s(rr,0.0);
1052 ret.putlane(s,j++);
1053 }
1054 return ret;
1055}
1056
1057
1059{
1060 assert((nvec&0x1)==0);
1061 for(int m=0;m*2<nvec;m++){
1062 int n=m*2;
1063 out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
1064 }
1065}
1067{
1068 assert((nvec&0x3)==0);
1069 for(int m=0;m*4<nvec;m++){
1070 int n=m*4;
1071 out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
1072 }
1073}
1075{
1076 assert((nvec&0x1)==0);
1077 for(int m=0;m*2<nvec;m++){
1078 int n=m*2;
1079 out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
1080 }
1081}
1083{
1084 assert((nvec&0x1)==0);
1085 for(int m=0;m*2<nvec;m++){
1086 int n=m*2;
1087 Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
1088 // Bug in gcc 10.0.1 and gcc 10.1 using fixed-size SVE ACLE data types CAS-159553-Y1K4C6
1089 // function call results in compile-time error:
1090 // In function ‘void Grid::precisionChange(Grid::vRealD*, Grid::vRealF*, int)’:
1091 // .../Grid_vector_types.h:961:56: error:
1092 // cannot bind non-const lvalue reference of type ‘vecd&’ {aka ‘svfloat64_t&’}
1093 // to an rvalue of type ‘vecd’ {aka ‘svfloat64_t’}
1094 // 961 | Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
1095 // | ~~~~~~~^
1096 }
1097}
1099{
1100 assert((nvec&0x3)==0);
1101 for(int m=0;m*4<nvec;m++){
1102 int n=m*4;
1103 Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
1104 }
1105}
1107{
1108 assert((nvec&0x1)==0);
1109 for(int m=0;m*2<nvec;m++){
1110 int n=m*2;
1111 Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
1112 }
1113}
// Complex-typed wrappers: a complex vector is bitwise a real vector of twice
// the lane count, so delegate to the real-typed precisionChange overloads.
accelerator_inline void precisionChange(vComplexF *out,const vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
accelerator_inline void precisionChange(vComplexH *out,const vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
accelerator_inline void precisionChange(vComplexH *out,const vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
accelerator_inline void precisionChange(vComplexD *out,const vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
accelerator_inline void precisionChange(vComplexD *out,const vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
accelerator_inline void precisionChange(vComplexF *out,const vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
1120
1121// Check our vector types are of an appropriate size.
1122
1123#if defined QPX
1124static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
1125static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
1126#else
1127#ifndef GENERIC_SCALAR
1128static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
1129static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
1130#endif
1131#endif
1132
1133// Fixme need coalesced read gpermute
template<class vobj> void gpermute(vobj & inout,int perm){
  // Apply each requested permutation level in turn: bit d of perm selects
  // permute level d, for d = 0..3 in ascending order (same order as the
  // original unrolled if-chain).
  vobj scratch = inout;
  for (int d = 0; d < 4; ++d) {
    if (perm & (0x1 << d)) {
      permute(inout, scratch, d);
      scratch = inout;
    }
  }
}
1141
1143
#ifdef GRID_SYCL
// Declare the Grid vector types trivially copyable to SYCL devices so they
// may be captured by kernels / memcpy'd across the host-device boundary.
template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealF > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealD > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
#endif
1151
1152
1153#endif
#define accelerator_inline
#define accelerator
Optimization::Vstream VstreamSIMD
Optimization::TimesMinusI TimesMinusISIMD
Optimization::MultComplex MultComplexSIMD
Optimization::TimesI TimesISIMD
void prefetch_HINT_T0(const char *ptr)
Optimization::Reduce< S, T > ReduceSIMD
Optimization::Mult MultSIMD
Optimization::MaddRealPart MaddRealPartSIMD
Optimization::vecd SIMD_Dtype
Optimization::veci SIMD_Itype
Optimization::Vstore VstoreSIMD
Optimization::Conj ConjSIMD
Optimization::vecf SIMD_Ftype
Optimization::Vsplat VsplatSIMD
Optimization::Sum SumSIMD
Optimization::Sub SubSIMD
Optimization::MultAddComplex MultAddComplexSIMD
Optimization::Div DivSIMD
Optimization::MultRealPart MultRealPartSIMD
Optimization::Vset VsetSIMD
Optimization::vech SIMD_Htype
#define perm(a, b, n, w)
Optimization libraries for SSE4 instructions set.
accelerator_inline Grid_simd< S, V > operator/(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline void timesMinusI(Grid_simd< S, V > &ret, const Grid_simd< S, V > &in)
accelerator_inline Grid_simd< S, V > real_mult(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline void vone(Grid_simd< S, V > &ret)
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
accelerator_inline Grid_simd< S, V > operator+(Grid_simd< S, V > a, Grid_simd< S, V > b)
Invoke< std::enable_if<!std::is_same< T1, T2 >::value, int > > IfNotSame
accelerator_inline void timesI(Grid_simd< S, V > &ret, const Grid_simd< S, V > &in)
Invoke< std::enable_if< is_complex< T >::value, int > > IfComplex
Grid_simd< complex< float >, SIMD_Ftype > vComplexF
accelerator_inline void vcomplex_i(Grid_simd< S, V > &ret)
Out accelerator_inline unary(Input src, Operation op)
Grid_simd< uint16_t, SIMD_Htype > vRealH
accelerator_inline void vfalse(Grid_simd< S, V > &ret)
Invoke< std::enable_if< std::is_same< T1, T2 >::value, int > > IfSame
accelerator_inline float sfw_half_to_float(Grid_half h)
accelerator_inline Grid_simd< S, V > operator-(Grid_simd< S, V > a, Grid_simd< S, V > b)
Grid_simd< complex< uint16_t >, SIMD_Htype > vComplexH
accelerator_inline Grid_simd< S, V > outerProduct(const Grid_simd< S, V > &l, const Grid_simd< S, V > &r)
Grid_simd< float, SIMD_Ftype > vRealF
accelerator_inline void zeroit(Grid_simd< S, V > &z)
accelerator_inline void vstream(Grid_simd< S, V > &out, const Grid_simd< S, V > &in)
accelerator_inline toRealMapper< Csimd >::Realified toReal(const Csimd &in)
Invoke< std::enable_if<!is_integer< T >::value, int > > IfNotInteger
Out accelerator_inline trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op)
Invoke< std::enable_if<!is_real< T >::value, int > > IfNotReal
accelerator_inline Grid_half sfw_float_to_half(float ff)
accelerator_inline void rbroadcast(Grid_simd< S, V > &ret, const Grid_simd< S, V > &src, int lane)
accelerator_inline Grid_simd< S, V > adj(const Grid_simd< S, V > &in)
accelerator_inline Grid_simd< S, V > rotate(Grid_simd< S, V > b, int nrot)
Grid_simd< complex< double >, SIMD_Dtype > vComplexD
Invoke< std::enable_if< is_integer< T >::value, int > > IfInteger
accelerator_inline Grid_simd< S, V > operator*(Grid_simd< S, V > a, Grid_simd< S, V > b)
accelerator_inline Grid_simd< S, V > real_madd(Grid_simd< S, V > a, Grid_simd< S, V > b, Grid_simd< S, V > c)
accelerator_inline Grid_simd< S, V > trace(const Grid_simd< S, V > &arg)
accelerator_inline void vsplat(Grid_simd< S, V > &ret, ABtype a, ABtype b)
accelerator_inline void rsplat(Grid_simd< S, V > &ret, EnableIf< is_complex< S >, S > c)
accelerator_inline void visign(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > conjugate(const Grid_simd< S, V > &in)
typename T::type Invoke
Invoke< std::enable_if<!Condition::value, ReturnType > > NotEnableIf
accelerator_inline void precisionChange(vRealF *out, const vRealD *in, int nvec)
accelerator_inline void vzero(Grid_simd< S, V > &ret)
accelerator_inline void vrsign(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > innerProduct(const Grid_simd< S, V > &l, const Grid_simd< S, V > &r)
Invoke< std::enable_if< Condition::value, ReturnType > > EnableIf
void gpermute(vobj &inout, int perm)
accelerator_inline toComplexMapper< Rsimd >::Complexified toComplex(const Rsimd &in)
accelerator_inline void vbroadcast(Grid_simd< S, V > &ret, const Grid_simd< S, V > &src, int lane)
Invoke< std::enable_if< is_real< T >::value, int > > IfReal
Grid_simd< Integer, SIMD_Itype > vInteger
Grid_simd< double, SIMD_Dtype > vRealD
Out accelerator_inline binary(Input1 src_1, Input2 src_2, Operation op)
Invoke< std::enable_if<!is_complex< T >::value, int > > IfNotComplex
accelerator_inline void vtrue(Grid_simd< S, V > &ret)
accelerator_inline Grid_simd< S, V > exp(const Grid_simd< S, V > &r)
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
std::complex< T > complex
Definition Simd.h:82
#define RotateBit
Definition Simd.h:54
std::complex< RealF > ComplexF
Definition Simd.h:78
float RealF
Definition Simd.h:60
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
friend accelerator_inline void exchange3(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd & operator=(const Grid_simd &rhs)
friend accelerator_inline void vprefetch(const Grid_simd &v)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
accelerator_inline Scalar_type getlane(int lane) const
static accelerator_inline constexpr int Nsimd(void)
friend accelerator_inline void vset(Grid_simd &ret, Scalar_type *a)
accelerator Grid_simd()=default
friend accelerator_inline Grid_simd operator*(const Scalar_type &a, Grid_simd b)
Vector_type vector_type
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
accelerator_inline Grid_simd(const Grid_simd &&rhs)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x)
friend accelerator_inline Grid_simd operator/(const Scalar_type &a, Grid_simd b)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd *__restrict__ x)
accelerator_inline Grid_simd(const typename std::enable_if< is_complex< S >::value, S >::type a)
friend accelerator_inline void permute2(Grid_simd &y, Grid_simd b)
friend accelerator_inline Grid_simd operator/(Grid_simd b, const Scalar_type &a)
friend accelerator_inline void permute3(Grid_simd &y, Grid_simd b)
accelerator_inline Grid_simd & operator=(const Grid_simd &&rhs)
friend accelerator_inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
friend accelerator_inline void permute0(Grid_simd &y, Grid_simd b)
accelerator_inline void putlane(const Scalar_type &S, int lane)
friend accelerator_inline void add(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
accelerator_inline Grid_simd & operator*=(const Grid_simd &r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void exchange1(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
friend accelerator_inline Grid_simd operator-(const Grid_simd &r)
friend accelerator_inline Scalar_type Reduce(const Grid_simd &in)
friend accelerator_inline Grid_simd operator*(Grid_simd b, const Scalar_type &a)
accelerator_inline Grid_simd(const Real a)
Scalar_type scalar_type
RealPart< Scalar_type >::type Real
friend accelerator_inline void exchange0(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd(const Grid_simd &rhs)
accelerator_inline Grid_simd & operator=(const Zero &z)
accelerator_inline Grid_simd & operator+=(const Grid_simd &r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Scalar_type *__restrict__ r)
friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r)
friend accelerator_inline void vstore(const Grid_simd &ret, Scalar_type *a)
friend accelerator_inline void exchange(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2, int n)
friend accelerator_inline void permute(Grid_simd &y, Grid_simd b, int perm)
friend accelerator_inline void exchange2(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2)
accelerator_inline Grid_simd & operator-=(const Grid_simd &r)
friend accelerator_inline Grid_simd SimdApplyBinop(const functor &func, const Grid_simd &x, const Grid_simd &y)
friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v)
friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Scalar_type *__restrict__ x)
friend accelerator_inline void permute1(Grid_simd &y, Grid_simd b)
Definition Simd.h:194
accelerator Grid_half()
accelerator Grid_half(uint16_t raw)
Scalar_type type
unsigned int u